def test_update(self):
        meter = MetricLogger()
        for i in range(10):
            meter.update(metric=float(i))

        m = meter.meters["metric"]
        self.assertEqual(m.count, 10)
        self.assertEqual(m.total, 45)
        self.assertEqual(m.median, 4)
        self.assertEqual(m.avg, 4.5)
Example #2
    def __init__(self, model_s, model_t, data_loader, optimizer, scheduler,
                 ckpt_s, ckpt_t, checkpoint_period, cfg):
        super(MTtrainer, self).__init__()
        self.cfg = cfg
        self.logger = logging.getLogger("maskrcnn_benchmark.trainer")
        self.scheduler = scheduler
        self.meters = MetricLogger(delimiter="  ")
        self.max_iter = len(data_loader['source'])
        self.start_iter = 0

        self.student = model_s
        self.student_bs = cfg.MT.AUG_S
        self.teacher = model_t
        self.teacher_bs = cfg.MT.AUG_K
        # pdb.set_trace()
        self.device_s = torch.device('cuda:0')
        self.device_t = torch.device('cuda:0')
        self.checkpoint_period = checkpoint_period
        self.ckpt_s = ckpt_s
        self.ckpt_t = ckpt_t
        self.optimizer = optimizer
        # mt hyperparameter
        self.lambda_value = cfg.MT.LAMBDA
        self.alpha = cfg.MT.ALPHA
        self.alpha_rampup = cfg.MT.ALPHA_RAMPUP
        self.rampup_step = cfg.MT.RAMPUP_STEP
        self.rampdown_step = cfg.MT.RAMPDOWN_STEP
        self.start_mt = cfg.MT.START_MT
        #loss weight
        self.balanced_weight = {
            'mt_classifier': (cfg.MT.CLS_LOSS),
            'nms_loss': cfg.MODEL.RELATION_NMS.LOSS,
            'mt_fg_loss': cfg.MT.FG_HINT,
        }

        self.dataloader_s = data_loader['source']
        if self.cfg.DATASETS.NO_LABEL:
            self.dataloader_u = data_loader['no_label']
        if cfg.DATASETS.SYN:
            # todo: add in training
            self.dataloader_syn = data_loader['synthesis']
        self.n_step_unlabel = cfg.MT.N_STEP_UNLABEL
        self.weight_sum_loss = partial(weight_sum_losses,
                                       rampup_length=self.rampup_step,
                                       rampdown_length=self.rampdown_step,
                                       total_length=self.max_iter,
                                       l=self.lambda_value,
                                       balanced=self.balanced_weight,
                                       start_mt=self.start_mt)
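# weight_sum_losses itself does not appear in these examples. In mean-teacher
# training the consistency weight is usually ramped up with the sigmoid
# schedule exp(-5 * (1 - t)^2) from the Mean Teacher paper and scaled down
# near the end of training. The helper below is only a sketch under that
# assumption; its signature and the (loss_dict, iteration) arguments are
# hypothetical, chosen to mirror the functools.partial call above.
import math


def sigmoid_rampup(current, rampup_length):
    # ramps from ~0 to 1 over rampup_length steps
    if rampup_length == 0:
        return 1.0
    t = min(max(float(current), 0.0), rampup_length) / rampup_length
    return math.exp(-5.0 * (1.0 - t) ** 2)


def weight_sum_losses(loss_dict, iteration, rampup_length, rampdown_length,
                      total_length, l, balanced, start_mt):
    # Supervised losses keep their configured weights; mean-teacher terms are
    # additionally scaled by l * rampup once training passes start_mt.
    if iteration < start_mt:
        consistency_w = 0.0
    else:
        consistency_w = l * sigmoid_rampup(iteration - start_mt, rampup_length)
        steps_left = total_length - iteration
        if steps_left < rampdown_length:  # linear ramp-down at the end
            consistency_w *= max(steps_left, 0) / float(rampdown_length)
    total = 0.0
    for name, loss in loss_dict.items():
        weight = balanced.get(name, 1.0)
        if name.startswith('mt_'):
            weight = weight * consistency_w
        total = total + weight * loss
    return total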
 def test_no_attr(self):
     meter = MetricLogger()
     _ = meter.meters
     _ = meter.delimiter
     def broken():
         _ = meter.not_existent
     self.assertRaises(AttributeError, broken)
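# The two tests above pin down the MetricLogger interface used throughout the
# remaining examples. The real maskrcnn_benchmark implementation keeps a
# deque-backed SmoothedValue per metric; the version below is a minimal sketch
# of that behaviour, sufficient to make both tests pass, not the upstream code.
from collections import defaultdict, deque

import torch


class SmoothedValue(object):
    """Track a series of values and provide smoothed statistics."""

    def __init__(self, window_size=20):
        self.deque = deque(maxlen=window_size)  # recent values only
        self.total = 0.0
        self.count = 0

    def update(self, value):
        self.deque.append(value)
        self.count += 1
        self.total += value

    @property
    def median(self):
        return torch.tensor(list(self.deque)).median().item()

    @property
    def avg(self):
        return torch.tensor(list(self.deque)).mean().item()

    @property
    def global_avg(self):
        return self.total / self.count


class MetricLogger(object):
    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            self.meters[k].update(float(v))

    def __getattr__(self, attr):
        # Called only when normal lookup fails: expose meters by name,
        # otherwise raise AttributeError (exercised by test_no_attr above).
        if attr in self.meters:
            return self.meters[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        entries = ["{}: {:.4f} ({:.4f})".format(name, m.median, m.avg)
                   for name, m in self.meters.items()]
        return self.delimiter.join(entries)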
Example #4
 def __init__(self, DMM, encoder, decoder, args):
     super(Trainer, self).__init__()
     self.count = 0
     self.meters = MetricLogger(delimiter=" ")
     self.decoder = decoder
     self.encoder = encoder
     self.DMM = DMM
     if args.load_proposals and not args.load_proposals_dataset:
         self.pred_offline_meta = json.load(
             open(args.pred_offline_meta, 'r'))
         if 'vidfid2index' in self.pred_offline_meta:
             self.pred_offline_meta = self.pred_offline_meta['vidfid2index']
     mask_siou = softIoULoss()
     if args.use_gpu:
         self.encoder.to(args.device)
         if self.decoder is not None:
             self.decoder.to(args.device)
         self.DMM.to(args.device)
         mask_siou.to(args.device)
     if args.train_split == PHASE.TRAIN.value:
         json_file = cfg_youtube.FILES.DB_INFO_TRAIN
     elif args.train_split == PHASE.TRAINTESTDEVOT.value:
         json_file = cfg_youtube.FILES.DB_INFO_TRAINTESTDEVOT
     #elif args.train_split == PHASE.DAVISTRAIN.value:
     #    json_file = cfg_youtube.FILES.DB_INFO_TRAIN_DAVIS
     else:
          raise AttributeError('unsupported train_split: %s' % args.train_split)
     assert (os.path.exists(json_file)), json_file
     json_data = open(json_file)
     self.data = json.load(json_data)
     if args.local_rank == 0:
         logging.info('load from json_data; num vid %d' %
                      len(self.data['videos']))
     self.crits = mask_siou
Example #5
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    if cfg.USE_TENSORBOARD_LOGS:
        meters = TensorboardLogger(
            log_dir=os.path.join(output_dir, 'tensorboard_logs'),
            start_iter=arguments['iteration'],
            delimiter="  ",
        )
    else:
        meters = MetricLogger(delimiter="  ")

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        meters,
    )

    return model
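# TensorboardLogger above is used as a drop-in replacement for MetricLogger
# that also mirrors scalar updates into TensorBoard. Its implementation is not
# shown in these examples; the subclass below is a minimal sketch of that idea
# using tensorboardX, with an internal iteration counter that advances on every
# update call (the upstream class may track iterations differently).
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
from tensorboardX import SummaryWriter


class TensorboardLogger(MetricLogger):
    def __init__(self, log_dir, start_iter=0, delimiter="\t"):
        super(TensorboardLogger, self).__init__(delimiter=delimiter)
        self.iteration = start_iter
        self.writer = SummaryWriter(log_dir=log_dir)

    def update(self, **kwargs):
        super(TensorboardLogger, self).update(**kwargs)
        self.iteration += 1
        for name, value in kwargs.items():
            if hasattr(value, "item"):  # unwrap 0-dim tensors for logging
                value = value.item()
            self.writer.add_scalar(name, value, self.iteration)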
Example #6
 def __init__(self, DMM, encoder, decoder, args, dmmcfgs):
     super(Evaler, self).__init__()
     self.meters = MetricLogger(delimiter=" ")
     self.decoder = decoder
     self.encoder = encoder
     self.DMM = DMM
     if args.load_proposals and not args.load_proposals_dataset:
         logging.info('load %s' % args.pred_offline_meta)
         self.pred_offline_meta = json.load(
             open(args.pred_offline_meta, 'r'))
         if 'vidfid2index' in self.pred_offline_meta:
             self.pred_offline_meta = self.pred_offline_meta['vidfid2index']
     if args.use_gpu:
         self.encoder.cuda()
         self.decoder.cuda()
         self.DMM.cuda()
     timestr = time.strftime('%m-%d')
     model_name = args.model_name.strip('/')
     model_id = model_name.split('/')[-2] + '/epo' + model_name.split(
         '/')[-1].split('epo')[-1]
     self.eval_output_root = '%s/eval/%s/%s/L%s_%d_s%s/' % (
         args.models_root, timestr, model_id, args.eval_flag,
         args.test_image_h, args.eval_split)
     self.eval_output_root = self.eval_output_root.replace('__', '_')
     timestr = time.strftime('%m-%d-%H-%M')
     save_config_dir = '%s/save_config/%s/' % (self.eval_output_root,
                                               timestr)
     isrank0 = args.local_rank == 0
     if isrank0:
         if not os.path.exists(save_config_dir):
             make_dir(save_config_dir)
         yaml.dump(
             args, open(os.path.join(save_config_dir, 'eval_args.yaml'),
                        'w'))
         yaml.dump(
             dmmcfgs,
             open(os.path.join(save_config_dir, 'eval_dmm_config.yaml'),
                  'w'))
     json_file = open(get_db_path(args.eval_split), 'r')
     self.seq_dir = get_img_path(args.eval_split)
     self.anno_dir = get_anno_path(args.eval_split)
     self.data = json.load(json_file)
     if isrank0: logging.info('num vid %d' % len(self.data['videos']))
     self.DMM.eval()
     self.encoder.eval()
     self.decoder.eval()
def evaluator(cfg, args, model, device, iteration):
    meters_val = MetricLogger(delimiter="  ")

    data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=False)[0]
        
    with torch.no_grad():
        # Should be one image for each GPU:
        print('Calculating evaluation loss.')
        for iteration_val, batch in enumerate(data_loader_val):
            #if is_main_process():
            #    print(iteration_val)
            if args.debug and iteration_val>10:
                break
            images_val, targets_val, _ = batch
            
            skip_batch = False
            nbox = []
            for t in targets_val:
                nbox.append(len(t))
                if len(t) < 1:
                    skip_batch = True
                    break
            if skip_batch:
                continue
            try:
                print(iteration_val, nbox)
                images_val = images_val.to(device)
                targets_val = [target.to(device) for target in targets_val]
                loss_dict = model(images_val, targets_val)
                losses = sum(loss for loss in loss_dict.values())
                loss_dict_reduced = reduce_loss_dict(loss_dict)
                losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            except Exception:
                print('Warning: ground truth error.')
    
        #synchronize()

        if is_main_process():
            print('Save evaluation loss to tensorboard.')
            for name, meter in meters_val.meters.items():
                print(name,meter.global_avg)
                args.writer.add_scalar('EvalMetrics/'+name, meter.global_avg, iteration / args.iters_per_epoch)
            print('Pass')
Example #8
def do_val(model=None, data_loader_val=None, device=None):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    meters = MetricLogger(delimiter="  ")

    for images, targets, _ in data_loader_val:

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        with torch.no_grad():
            loss_dict = model(images, targets)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

    logger.info(meters.delimiter.join(["Val {meters}"]).format(meters=meters.str_avg(),))

    return meters
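# reduce_loss_dict is called by nearly every example above to average the loss
# dict across GPUs before logging. The helper below closely follows the
# upstream maskrcnn_benchmark implementation (reduce onto rank 0, then divide
# by the world size); treat it as a reference sketch rather than the exact
# upstream source.
import torch
import torch.distributed as dist

from maskrcnn_benchmark.utils.comm import get_world_size


def reduce_loss_dict(loss_dict):
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)  # sum the losses onto rank 0
        if dist.get_rank() == 0:
            all_losses /= world_size  # average for logging on the main process
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses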
Example #9
def finetune_first_image(model, images, targets, optimizer,scheduler, logger, cfg):
    total_iter_finetune = cfg.FINETUNE.TOTAL_ITER
    model.train()
    meters = MetricLogger(delimiter="  ")
    for iteration in range(total_iter_finetune):

        scheduler.step()
        loss_dict, _ = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        meters.update(lr=optimizer.param_groups[0]["lr"])

        if iteration % (total_iter_finetune // 2) == 0:
            logger.info(
                meters.delimiter.join(
                    [
                        "{meters}",
                    ]
                ).format(
                    meters=str(meters),
                )
            )

    model.eval()
    return model
Example #10
def validation(model, data_loader, device, logger, tensorboard_logger,
               iteration):
    logger.info('-' * 40)
    logger.info("Start Validation")
    meters = MetricLogger(delimiter="  ")
    start_validation_time = time.time()

    max_iter = len(data_loader)

    for idx, batch in enumerate(tqdm(data_loader)):
        images, targets, _ = batch
        images = images.to(device)
        targets = [target.to(device) for target in targets]

        with torch.no_grad():
            loss_dict, _ = model(images, targets)
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(total_loss=losses_reduced, **loss_dict_reduced)

    tensorboard_logger.write(meters, iteration, phase='Valid')
    logger.info('Validation:')
    logger.info(
        meters.delimiter.join([
            "iter: {iter}",
            "{meters}",
            "max mem: {memory:.0f}",
        ]).format(
            iter=iteration,
            meters=str(meters),
            memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
        ))

    total_validation_time = time.time() - start_validation_time
    total_time_str = str(datetime.timedelta(seconds=total_validation_time))
    logger.info("Total Validation time: {} ({:.4f} s / it)".format(
        total_time_str, total_validation_time / (max_iter)))
    logger.info('-' * 40)
Example #11
def do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    summary_writter = tensorboardX.SummaryWriter(log_dir=cfg.OUTPUT_DIR)
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints",)
    dataset_names = cfg.DATASETS.TEST

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if any(len(target) < 1 for target in targets):
            logger.error(
                f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}"
            )
            continue
Example #12
    def run_eval(self):
        if isrank0:
            logging.info("Dataset is %s; len of loader %d" %
                         (self.dataset, len(self.loader)))
            logging.info("Split is %s" % (self.split))
        meters = MetricLogger(delimiter=" ")

        # loop over data loader
        start = time.time()
        # -------------------   forward model ------------------------------
        for batch_idx, (inputs, imgs_names, targets, seq_name,
                        starting_frame) in enumerate(self.loader):
            meters.update(dT=time.time() - start)
            if batch_idx % 5 == 0:
                logging.info('[{}] {}/{};{} '.format(
                    args.distributed_manully_rank, batch_idx, len(self.loader),
                    meters))
            targets = targets.cuda()  # use our collate function
            inputs = inputs.cuda()
            cur_device = inputs.device
            CHECK4D(targets)  # B, Len, O, HW
            CHECK5D(inputs)  # B Len D H W
            if args.load_proposals_dataset:
                proposals_cur_batch = imgs_names
                proposals = []
                for proposal_cur_vid in proposals_cur_batch:
                    boxlist = list(
                        proposal_cur_vid)  # BoxList of current batch
                    boxlist = [b.to(cur_device) for b in boxlist]
                    proposals.append(boxlist)  # BoxList of current batch
                imgs_names = None
            else:
                proposals = None
            with torch.no_grad():
                self.evaler(batch_idx, inputs, imgs_names, targets, seq_name,
                            args, proposals)
            meters.update(bT=time.time() - start)
            start = time.time()
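# CHECK3D/CHECK4D/CHECK5D and CHECKEQ are small shape-assertion helpers from
# the DMM-style codebase; they are not defined in these snippets. The versions
# below are a sketch of the behaviour the call sites rely on: assert the
# dimensionality (or equality) and return the shape so it can be unpacked.
def CHECKEQ(a, b):
    assert a == b, 'expected equal values, got {} and {}'.format(a, b)


def _check_dim(tensor, ndim):
    assert tensor.dim() == ndim, 'expected a {}D tensor, got {}D'.format(
        ndim, tensor.dim())
    return tuple(tensor.shape)


def CHECK3D(tensor):
    return _check_dim(tensor, 3)


def CHECK4D(tensor):
    return _check_dim(tensor, 4)


def CHECK5D(tensor):
    return _check_dim(tensor, 5)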
Example #13
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    use_amp,
    cfg,
    dllogger,
    per_iter_end_callback_fn=None,
):
    dllogger.log(step="PARAMETER", data={"train_start": True})
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    if use_amp:
        scaler = torch.cuda.amp.GradScaler(init_scale=8192.0)
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        if use_amp:
            with torch.cuda.amp.autocast():
                loss_dict = model(images, targets)
        else:
            loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)


        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        if use_amp:        
            scaler.scale(losses).backward()
        else:
            losses.backward()

        def _take_step():
            if use_amp:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        if not cfg.SOLVER.ACCUMULATE_GRAD:
            _take_step()
        else:
            if (iteration + 1) % cfg.SOLVER.ACCUMULATE_STEPS == 0:
                for param in model.parameters():
                    if param.grad is not None:
                        param.grad.data.div_(cfg.SOLVER.ACCUMULATE_STEPS)
                _take_step()
            
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            log_data = {"eta":eta_string, "learning_rate":optimizer.param_groups[0]["lr"],
                        "memory": torch.cuda.max_memory_allocated() / 1024.0 / 1024.0 }
            log_data.update(meters.get_dict())
            dllogger.log(step=(iteration,), data=log_data)

        if cfg.SAVE_CHECKPOINT:
            if iteration % checkpoint_period == 0:
                checkpointer.save("model_{:07d}".format(iteration), **arguments)
            if iteration == max_iter:
                checkpointer.save("model_final", **arguments)

        # per-epoch work (testing)
        if per_iter_end_callback_fn is not None:
            early_exit = per_iter_end_callback_fn(iteration=iteration)
            if early_exit:
                break

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    dllogger.log(step=tuple(),
                 data={"e2e_train_time": total_training_time,
                       "train_perf_fps": max_iter * cfg.SOLVER.IMS_PER_BATCH / total_training_time})
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / max_iter
        )
    )
def do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, epoch_id, eval_in_train,
             eval_out_dir, eval_in_train_per_iter, iou_thresh_eval, min_loss,
             eval_aug_thickness):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info(f"Start training {epoch_id}")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    predictions_all = []
    losses_last = 100
    for iteration, batch in enumerate(data_loader, start_iter):
        fn = [os.path.basename(os.path.dirname(nm)) for nm in batch['fn']]
        if SHOW_FN:
            print(f'\t\t{fn}')

        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        batch['x'][1] = batch['x'][1].to(device)
        batch['y'] = [b.to(device) for b in batch['y']]

        loss_dict, predictions_i = model(batch['x'], batch['y'])

        if CHECK_NAN:
            any_nan = sum(torch.isnan(v.data) for v in loss_dict.values())
            if any_nan:
                print(f'\nGot nan loss:\n{fn}\n')
                import pdb
                pdb.set_trace()  # XXX BREAKPOINT
                continue

        losses = sum(loss for loss in loss_dict.values())

        if eval_in_train > 0 and epoch_id % eval_in_train == 0:
            data_id = batch['id']
            for k in range(len(data_id)):
                predictions_i[k].constants['data_id'] = data_id[k]

            predictions_i = [p.to(torch.device('cpu')) for p in predictions_i]
            [p.detach() for p in predictions_i]
            predictions_all += predictions_i

            if eval_in_train_per_iter > 0 and epoch_id % eval_in_train_per_iter == 0:
                logger.info(f'\nepoch {epoch_id}, data_id:{data_id}\n')
                eval_res_i = evaluate(dataset=data_loader.dataset,
                                      predictions=predictions_i,
                                      iou_thresh_eval=iou_thresh_eval,
                                      output_folder=eval_out_dir,
                                      box_only=False)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        with autograd.detect_anomaly():
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 1 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        avg_loss = meters.loss.avg
        tmp_p = max(int(checkpoint_period // 10), 20)
        if iteration % tmp_p == 0 and avg_loss < min_loss:
            checkpointer.save("model_min_loss", **arguments)
            logger.info(f'\nmin loss: {avg_loss} at {iteration}\n')
            min_loss = avg_loss

        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)\n".format(
        total_time_str, total_training_time / (max_iter)))

    if eval_in_train > 0 and epoch_id % eval_in_train == 0:
        logger.info(f'\nepoch {epoch_id}\n')
        preds = down_sample_for_eval_training(predictions_all)
        eval_res = evaluate(dataset=data_loader.dataset,
                            predictions=preds,
                            iou_thresh_eval=iou_thresh_eval,
                            output_folder=eval_out_dir,
                            box_only=False,
                            epoch=epoch_id,
                            is_train=True,
                            eval_aug_thickness=eval_aug_thickness)
        pass
    return min_loss
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
Example #16
def main():

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # model = train(cfg, args.local_rank, args.distributed)
    model = build_detection_model(cfg)
    # add
    print(model)
    all_index = []
    for index, item in enumerate(model.named_parameters()):
        all_index.append(index)
        print(index)
        print(item[0])
        print(item[1].size())
    print("All index of the model: ", all_index)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=args.distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # run_test(cfg, model, args.distributed)
    # pruning
    m = Mask(model)
    m.init_length()
    m.init_length()
    print("-" * 10 + "one epoch begin" + "-" * 10)
    print("remaining ratio of pruning : Norm is %f" % args.rate_norm)
    print("reducing ratio of pruning : Distance is %f" % args.rate_dist)
    print("total remaining ratio is %f" % (args.rate_norm - args.rate_dist))

    m.modelM = model
    m.init_mask(args.rate_norm, args.rate_dist)

    m.do_mask()
    m.do_similar_mask()
    model = m.modelM
    m.if_zero()
    # run_test(cfg, model, args.distributed)

    # changed to an inline training loop so it is easy to apply the pruning Mask
    # do_train(
    #     model,
    #     data_loader,
    #     optimizer,
    #     scheduler,
    #     checkpointer,
    #     device,
    #     checkpoint_period,
    #     arguments,
    # )
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        # print("Loss dict",loss_dict)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()

        # pruning: mask the gradients for this iteration
        m.do_grad_mask()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        # pruning
        # 7375 iterations correspond to one epoch with batch size 16 on the ~118K-image COCO train set
        if iteration % args.iter_pruned == 0 or iteration == cfg.SOLVER.MAX_ITER - 5000:
            m.modelM = model
            m.if_zero()
            m.init_mask(args.rate_norm, args.rate_dist)
            m.do_mask()
            m.do_similar_mask()
            m.if_zero()
            model = m.modelM
            if args.use_cuda:
                model = model.cuda()
            #run_test(cfg, model, args.distributed)

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
Example #17
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if any(len(target) < 1 for target in targets):
            logger.error(
                f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}"
            )
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        with amp.scale_loss(losses, optimizer) as scaled_losses:
            scaled_losses.backward()
        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
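# amp.scale_loss above is NVIDIA apex's older mixed-precision API (Example #13
# shows the newer torch.cuda.amp equivalent). It only works after the model and
# optimizer have been wrapped once at start-up; a minimal setup sketch,
# assuming apex is installed:
from apex import amp


def setup_apex_amp(model, optimizer, opt_level="O1"):
    # Wrap once before the training loop; amp.scale_loss then applies dynamic
    # loss scaling around backward() as shown in the example above.
    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
    return model, optimizer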
Example #18
def do_face_train_triplet(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
    divs_nums,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    dataset_names = cfg.DATASETS.TEST
    for iteration, (img_a, img_p, img_n, label_p,
                    label_n) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration
        img_a_list, _ = divs_tensors(device=device,
                                     tensors=img_a,
                                     targets=None,
                                     divs_nums=divs_nums)
        img_p_list, label_p_list = divs_tensors(device=device,
                                                tensors=img_p,
                                                targets=label_p,
                                                divs_nums=divs_nums)
        img_n_list, label_n_list = divs_tensors(device=device,
                                                tensors=img_n,
                                                targets=label_n,
                                                divs_nums=divs_nums)
        #### ======== splitting the batch may affect BatchNorm layers ======== ####
        optimizer.zero_grad()
        for img_a, img_p, img_n, label_p, label_n in zip(
                img_a_list, img_p_list, img_n_list, label_p_list,
                label_n_list):
            loss_dict = model(tensors=[img_a, img_p, img_n],
                              targets=[label_p, label_n],
                              batch=iteration,
                              total_batch=None)
            losses = sum(loss for loss in loss_dict.values())
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)
            losses /= divs_nums
            with amp.scale_loss(losses, optimizer) as scaled_losses:
                scaled_losses.backward()
        optimizer.step()
        scheduler.step()
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
            if iteration > 40000:
                checkpointer.save_backbone("BACKBONE_{:07d}".format(iteration))
        ##### ========= periodic validation ============ #######
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e. g. for TensorBoard
                model,
                # The method changes the segmentation mask format in a data loader,
                # so every time a new data loader is created:
                make_data_loader(cfg,
                                 is_train=False,
                                 is_distributed=(get_world_size() > 1),
                                 is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False
                if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=None,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val,
                                    _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(
                        loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join([
                    "[Validation]: ",
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)
            checkpointer.save_backbone("model_final")
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Example #19
class Evaler(nn.Module):
    """ engine/container for encoder, decoder and all DMM modules:
        match_layer
        feature_extractor
    """
    def __init__(self, DMM, encoder, decoder, args, dmmcfgs):
        super(Evaler, self).__init__()
        self.meters = MetricLogger(delimiter=" ")
        self.decoder = decoder
        self.encoder = encoder
        self.DMM = DMM
        if args.load_proposals and not args.load_proposals_dataset:
            logging.info('load %s' % args.pred_offline_meta)
            self.pred_offline_meta = json.load(
                open(args.pred_offline_meta, 'r'))
            if 'vidfid2index' in self.pred_offline_meta:
                self.pred_offline_meta = self.pred_offline_meta['vidfid2index']
        if args.use_gpu:
            self.encoder.cuda()
            self.decoder.cuda()
            self.DMM.cuda()
        timestr = time.strftime('%m-%d')
        model_name = args.model_name.strip('/')
        model_id = model_name.split('/')[-2] + '/epo' + model_name.split(
            '/')[-1].split('epo')[-1]
        self.eval_output_root = '%s/eval/%s/%s/L%s_%d_s%s/' % (
            args.models_root, timestr, model_id, args.eval_flag,
            args.test_image_h, args.eval_split)
        self.eval_output_root = self.eval_output_root.replace('__', '_')
        timestr = time.strftime('%m-%d-%H-%M')
        save_config_dir = '%s/save_config/%s/' % (self.eval_output_root,
                                                  timestr)
        isrank0 = args.local_rank == 0
        if isrank0:
            if not os.path.exists(save_config_dir):
                make_dir(save_config_dir)
            yaml.dump(
                args, open(os.path.join(save_config_dir, 'eval_args.yaml'),
                           'w'))
            yaml.dump(
                dmmcfgs,
                open(os.path.join(save_config_dir, 'eval_dmm_config.yaml'),
                     'w'))
        json_file = open(get_db_path(args.eval_split), 'r')
        self.seq_dir = get_img_path(args.eval_split)
        self.anno_dir = get_anno_path(args.eval_split)
        self.data = json.load(json_file)
        if isrank0: logging.info('num vid %d' % len(self.data['videos']))
        self.DMM.eval()
        self.encoder.eval()
        self.decoder.eval()

    def forward(self, batch_idx, inputs, imgs_names, targets, seq_name, args,
                proposals_input):
        """ Evaluation 
        forward a batch of clip
        """
        if args.pad_video:
            CHECK4D(targets)  # B,len,O,HW
            CHECK5D(inputs)  # B,len,O,H,W
        device_id = torch.cuda.current_device()
        if args.batch_size == 1:
            if not args.distributed: seq_name = [seq_name[device_id]]
        else:
            batch_size_device = int(args.batch_size / args.ngpus)
            if not args.distributed:
                seq_name = seq_name[device_id *
                                    batch_size_device:(1 + device_id) *
                                    batch_size_device]
        CHECKEQ(len(seq_name), len(inputs))
        njpgs_batch, img_shape, frame_names_batch = self.prepare_frame_names(
            seq_name)
        # send batch to GPU
        prev_thid_list = None
        B, nframe, O, H, W = inputs.shape
        max_length_clip = min(nframe, args.length_clip)
        for frame_idx in range(max_length_clip):
            tic = time.time()
            extra_frame = [njpgs <= frame_idx for njpgs in njpgs_batch]
            proposal_cur, predid_cur_frames = None, None
            if args.load_proposals and proposals_input is None:
                predid = [0] * len(seq_name)
                for b, seq_n in enumerate(seq_name):
                    if extra_frame[b]:
                        predid[b] = len(self.encoder.pred_offline) - 1
                    else:
                        frame_name = imgs_names[b][frame_idx]  # tuple + b name
                        predid[b] = int(
                            self.pred_offline_meta[seq_n][frame_name])
                predid_cur_frames = predid
            elif proposals_input is not None:
                proposal_cur = []
                for b in range(B):
                    if len(proposals_input[b]) > frame_idx:
                        proposal_cur.append(proposals_input[b][frame_idx])
                    else:
                        proposal_cur.append(proposals_input[b][-1])
            x = inputs[:, frame_idx]  # B,1->0,O,H,W, select 1 from clip len
            Bx, Cx, Hx, Wx = x.shape
            # targets shape: B,len,O,H*W
            # input shape:   B,len,3(Cx),H,W
            y_mask = targets[:, frame_idx][:, :, :-1].float()
            CHECKEQ(Hx * Wx, y_mask.shape[-1])
            CHECKEQ(Bx, y_mask.shape[0])
            B, O, HW = CHECK3D(y_mask)
            CHECKEQ(Bx, B)
            if frame_idx == 0:
                mask_hist = None
                tplt_dict, tplt_valid_batch, proposals = \
                   self.forward_timestep_init(args, x, y_mask, predid_cur_frames, proposal_cur)
                prev_thid_list, thid_list = None, None
                prev_mask = y_mask.view(B, O, HW)
                outs = y_mask
                init_pred_inst = y_mask.view(B, O, H, W)
                infos = {
                    'args': args,
                    'shape': img_shape,
                    'extra_frame': extra_frame,
                    'valid': tplt_valid_batch,
                    'predid': predid_cur_frames
                }
                _, prev_thid_list, _, _, _  = self.inference_timestep(infos, tplt_dict, x, y_mask, \
                    prev_thid_list=prev_thid_list, prev_mask=prev_mask, mask_hist=mask_hist, proposal_cur=proposal_cur)
            else:
                # ---- start inference of current batch ----
                infos = {
                    'args': args,
                    'shape': img_shape,
                    'extra_frame': extra_frame,
                    'valid': tplt_valid_batch,
                    'predid': predid_cur_frames
                }
                outs, thid_list, init_pred_inst, proposals, mask_hist = self.inference_timestep(
                    infos,
                    tplt_dict,
                    x,
                    y_mask,
                    prev_thid_list=prev_thid_list,
                    prev_mask=prev_mask,
                    mask_hist=mask_hist,
                    proposal_cur=proposal_cur)
                self.meters.update(ft=time.time() - tic)
                prev_mask = outs.view(B, O, HW)
                if args.only_spatial == False:
                    prev_thid_list = thid_list
                prev_mask = outs.view(B, O, HW) if frame_idx > 0 else y_mask
            # ---------------- save merged mask ----------------------
            for b in range(B):
                if extra_frame[b]: continue  # skip the extra frames
                saved_name = self.eval_output_root + 'merged/%s/%s.png' % (
                    seq_name[b], frame_names_batch[b][frame_idx])
                obj_index = tplt_valid_batch[b].sum()
                refine_mask = outs[b, :obj_index].view(-1, H * W)
                refine_bg = 1 - refine_mask.max(0)[0]
                refine_fbg = torch.cat(
                    [refine_bg.view(1, H, W),
                     refine_mask.view(-1, H, W)],
                    dim=0)
                max_v, max_i = refine_fbg.max(0)
                eval_helper.plot_scores_map(max_i.float(), saved_name)
            # ---------------- save outs to mask  ----------------------
            del outs, thid_list, x, y_mask, init_pred_inst
        if (batch_idx % 10 == 0 and args.local_rank == 0):
            logging.info('save at {}'.format(self.eval_output_root))
            logging.info(self.meters)

    def inference_timestep(self, infos, tplt_dict, x, y_mask, prev_thid_list,
                           prev_mask, mask_hist, proposal_cur):
        r""" inference for frames at current image step, 
        argument:
            infos: 'args','shape','extra_frame','valid','predid'}
                   img_shape: list, len=B, element: [h,w]
            x: shape: B,3,H W
            y_mask: B,O,H W
        return 
            init_pred_inst: BOHW, prediction from the mask branch, without refine  
            tplt_dict, 
            proposals, mask_hist_new #4,5,6,7,8    
        """
        args = infos['args']
        img_shape = infos['shape']
        extra_frame, tplt_valid_batch = infos['extra_frame'], infos['valid']
        hidden_spatial = None
        out_masks = []
        assert (isinstance(x, torch.Tensor))
        features, proposals, _ = self.encoder(
            args, x, predid_cur_frames=infos['predid'], proposals=proposal_cur)
        bone_feat = features['backbone_feature']  # B,Lev,(D,H,W);
        B, D, H, W = x.shape
        thid_list = []
        B, O, HW = CHECK3D(prev_mask)
        if mask_hist is None:
            mask_hist = prev_mask.view(B, O, H, W)
        assert ('mask' in proposals[0].fields())
        init_pred_inst, tplt_dict, match_loss, mask_hist_new \
            = self.DMM.inference(infos, proposals, bone_feat, mask_hist, tplt_dict )
        valid_num_obj_max = max(
            1, (tplt_valid_batch.sum(0) > 0).sum())  # shape: 1, O
        for t in range(0, valid_num_obj_max):
            if prev_thid_list is not None:
                hidden_temporal = prev_thid_list[t]
                if args.only_temporal:
                    hidden_spatial = None
            else:
                hidden_temporal = None
            mask_lstm = []
            maxpool = nn.MaxPool2d((2, 2), ceil_mode=True)
            prev_m_inst = torch.cat([
                prev_mask[:, t, :].view(B, 1, H * W), y_mask[:, t, :].view(
                    B, 1, H * W), init_pred_inst[:, t].view(B, 1, H * W)
            ],
                                    dim=2).view(B, 3, H,
                                                W)  # cat along new dim
            prev_m_inst = maxpool(prev_m_inst)
            for _ in range(len(features['refine_input_feat'])):
                prev_m_inst = maxpool(prev_m_inst)
                mask_lstm.append(prev_m_inst)
            mask_lstm = list(reversed(mask_lstm))
            out_mask, hidden = self.decoder(features['refine_input_feat'],
                                            mask_lstm, hidden_spatial,
                                            hidden_temporal)
            hidden_tmp = [hidden[ss][0] for ss in range(len(hidden))]
            hidden_spatial = hidden
            thid_list.append(hidden_tmp)
            upsample_match = nn.UpsamplingBilinear2d(size=(x.size()[-2],
                                                           x.size()[-1]))
            out_mask = upsample_match(out_mask)
            for b in range(B):  # should behave differently for different videos
                is_template_valid_cur_b = tplt_valid_batch[b,
                                                           t]  # current batch
                if not is_template_valid_cur_b: continue
                mask_hist_new[b, t:t + 1, :, :] = torch.sigmoid(
                    out_mask[b])  # shape: B,O,H,W and B,1,H,W
            out_mask = out_mask.view(out_mask.size(0), -1)
            out_masks.append(out_mask)
            del mask_lstm, hidden_temporal, hidden_tmp, prev_m_inst, out_mask
        out_masks = torch.cat(out_masks, 1).view(out_masks[0].size(0),
                                                 len(out_masks), -1)  # B,O,HW
        outs = torch.sigmoid(out_masks)
        outs_pad = outs.new_zeros(B, O, HW)
        outs_pad[:, :valid_num_obj_max, :] = outs
        return outs_pad, thid_list, init_pred_inst, proposals, mask_hist_new

    def forward_timestep_init(self, args, x, y_mask, predid_cur_frames,
                              proposal_cur):
        features, proposals, cocoloss = self.encoder(
            args,
            x,
            predid_cur_frames=predid_cur_frames,
            proposals=proposal_cur)
        B, D, H, W = CHECK4D(x)
        tplt_valid_batch = []
        for b in range(B):
            prop, template_valid = ohw_mask2boxlist(y_mask[b].view(-1, H,
                                                                   W))  # OHW
            tplt_valid_batch.append(template_valid)  # append O
            proposals[b] = prop
        tplt_valid_batch = torch.stack(tplt_valid_batch, dim=0)
        tplt_dict = self.DMM.fill_template_dict(args, proposals, features,
                                                y_mask, tplt_valid_batch)
        return tplt_dict, tplt_valid_batch, proposals

    def prepare_frame_names(self, seq_name):
        njpgs_batch = []
        img_shape = []
        frame_names_batch = []
        for inx, seq_name_b in enumerate(seq_name):
            frame_names = np.sort(os.listdir(self.seq_dir + '/' + seq_name_b))
            frame_names = [
                os.path.splitext(os.path.basename(fullname))[0]
                for fullname in frame_names
            ]
            vid_img = np.array(
                Image.open(self.seq_dir + '/' + seq_name_b +
                           '/%s.jpg' % frame_names[0]))
            img_h, img_w, _ = vid_img.shape
            img_shape.append([img_h, img_w])
            seq_info = self.data['videos'][seq_name_b]['objects']
            frame_names_has_obj = []
            for obj_id in seq_info.keys():  # loop over all objects
                for frame_name in seq_info[obj_id]['frames']:
                    if frame_name not in frame_names_has_obj:  # add if this is a new frame
                        frame_names_has_obj.append(frame_name)
            start_annotation_frame = frame_names_has_obj[0]
            id_start = frame_names.index(start_annotation_frame)
            if id_start != 0:
                logging.warning('video annotation does not start at the first rgb frame '
                                'for {}; first frame: {}'.format(seq_name_b, frame_names[0]))
                frame_names = frame_names[id_start:]
            frame_names_batch.append(frame_names)
            njpgs = len(frame_names)
            njpgs_batch.append(njpgs)
        return njpgs_batch, img_shape, frame_names_batch
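# Side note (not part of the original example): the mask-decoding loop above rebuilds an
# nn.UpsamplingBilinear2d module on every step to resize the predicted masks back to the
# input resolution. A minimal, hedged sketch of the equivalent call with F.interpolate
# (the module is deprecated in favour of it); the tensors below are hypothetical and
# only mirror the shapes used above.
import torch
import torch.nn.functional as F

out_mask = torch.randn(2, 1, 28, 28)      # B,1,h,w mask logits from the decoder
x = torch.randn(2, 3, 224, 224)           # B,D,H,W input frame
out_mask = F.interpolate(out_mask, size=x.shape[-2:],
                         mode='bilinear', align_corners=True)  # same as UpsamplingBilinear2d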
Exemple #20
0
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    output_dir,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    writer = SummaryWriter(log_dir=os.path.join(output_dir, 'run'))

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        # add tensorboard -- tuo
        if iteration % 20 == 0:
            for name, meter in meters.meters.items():
                if 'loss' in name:
                    writer.add_scalar(name, meter.avg, iteration)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
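# Hedged sketch, not the library source: reduce_loss_dict, used above only for logging,
# conventionally all-reduces each loss across processes and divides by the world size so
# every rank reports the same averaged numbers. Assumes torch.distributed may or may not
# be initialized; in the single-process case the dict is returned unchanged.
import torch
import torch.distributed as dist

def reduce_loss_dict_sketch(loss_dict):
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())                   # same order on every rank
        values = torch.stack([loss_dict[k] for k in names])
        dist.all_reduce(values)                            # sum over ranks
        values /= world_size                               # average for logging
    return dict(zip(names, values))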
Exemple #21
0
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    per_iter_start_callback_fn=None,
    per_iter_end_callback_fn=None,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if per_iter_start_callback_fn is not None:
            per_iter_start_callback_fn(iteration=iteration)

        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        losses.backward()

        optimizer.step()
        optimizer.zero_grad()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0 and arguments["save_checkpoints"]:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter and arguments["save_checkpoints"]:
            checkpointer.save("model_final", **arguments)

        # per-epoch work (testing)
        if per_iter_end_callback_fn is not None:
            # Note: iteration has been incremented previously for
            # human-readable checkpoint names (i.e. 60000 instead of 59999)
            # so need to adjust again here
            early_exit = per_iter_end_callback_fn(iteration=iteration - 1)
            if early_exit:
                break

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
    if per_iter_end_callback_fn is not None:
        if early_exit:
            return True
        else:
            return False
    else:
        return None
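# Hedged usage sketch for the callback hooks in the variant above: a caller can pass
# per_iter_end_callback_fn to run a periodic evaluation and request an early exit by
# returning True. evaluate_fn and target_metric are hypothetical names, not part of the
# original code.
def make_early_exit_callback(evaluate_fn, target_metric, every=5000):
    def per_iter_end_callback_fn(iteration):
        if iteration == 0 or iteration % every != 0:
            return False                      # keep training
        score = evaluate_fn()                 # e.g. a quick validation pass
        return score >= target_metric         # True makes do_train break out of the loop
    return per_iter_end_callback_fn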
Exemple #22
0
def train(cfg, local_rank, distributed, d_path=None):

    MaskDnet = MaskDiscriminator(nc=256)
    BBoxDnet = BoxDiscriminator(nc=256, ndf=64)
    Dnet = CombinedDiscriminator(MaskDnet, BBoxDnet)
    model = Mask_RCNN(cfg)
    g_rcnn = GAN_RCNN(model, Dnet)

    device = torch.device(cfg.MODEL.DEVICE)
    g_rcnn.to(device)

    g_optimizer = make_optimizer(cfg, model)
    d_optimizer = make_D_optimizer(cfg, Dnet)

    g_scheduler = make_lr_scheduler(cfg, g_optimizer)
    d_scheduler = make_lr_scheduler(cfg, d_optimizer)
    # model.BoxDnet = BBoxDnet

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, g_optimizer = amp.initialize(model, g_optimizer, opt_level=amp_opt_level)
    Dnet, d_optimizer = amp.initialize(Dnet, d_optimizer, opt_level=amp_opt_level)

    if distributed:
        g_rcnn = torch.nn.parallel.DistributedDataParallel(
                    g_rcnn, device_ids=[local_rank], output_device=local_rank,
                    # this should be removed if we update BatchNorm stats
                    broadcast_buffers=False,
                )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, g_optimizer, g_scheduler, output_dir, save_to_disk
    )

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)

    arguments.update(extra_checkpoint_data)

    d_checkpointer = DetectronCheckpointer(
        cfg, Dnet, d_optimizer, d_scheduler, output_dir, save_to_disk
    )

    if d_path:
        d_checkpointer.load(d_path, use_latest=False)

    data_loader = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
        )

    test_period = cfg.SOLVER.TEST_PERIOD
    data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    ## START TRAINING
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")

    meters = TensorboardLogger(
            log_dir=cfg.OUTPUT_DIR + "/tensorboardX",
            start_iter=arguments['iteration'],
            delimiter="  ")

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    g_rcnn.train()
    start_training_time = time.time()
    end = time.time()

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)

    dataset_names = cfg.DATASETS.TEST

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if any(len(target) < 1 for target in targets):
            logger.error(f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" )
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        g_loss_dict, d_loss_dict = g_rcnn(images, targets)

        g_losses = sum(loss for loss in g_loss_dict.values())
        d_losses = sum(loss for loss in d_loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        g_loss_dict_reduced = reduce_loss_dict(g_loss_dict)
        g_losses_reduced = sum(loss for loss in g_loss_dict_reduced.values())
        
        d_loss_dict_reduced = reduce_loss_dict(d_loss_dict)
        d_losses_reduced = sum(loss for loss in d_loss_dict_reduced.values())
        
        meters.update(total_g_loss=g_losses_reduced, **g_loss_dict_reduced)
        meters.update(total_d_loss=d_losses_reduced, **d_loss_dict_reduced)

        g_optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        with amp.scale_loss(g_losses, g_optimizer) as g_scaled_losses:
            g_scaled_losses.backward()
        g_optimizer.step()
        g_scheduler.step()
        
        
        d_optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        with amp.scale_loss(d_losses, d_optimizer) as d_scaled_losses:
            d_scaled_losses.backward()
        d_optimizer.step()
        d_scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=g_optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
            d_checkpointer.save("dnet_{:07d}".format(iteration), **arguments)
            
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e.g. for TensorBoard
                model,
                # The method changes the segmentation mask format in a data loader,
                # so every time a new data loader is created:
                make_data_loader(cfg, is_train=False, is_distributed=False, is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=cfg.OUTPUT_DIR,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join(
                    [
                        "[Validation]: ",
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=g_optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
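# Hedged alternative sketch: the GAN trainer above uses NVIDIA apex (amp.initialize /
# amp.scale_loss). With native torch.cuda.amp the same alternating generator /
# discriminator update could look roughly like this; g_rcnn, g_optimizer and d_optimizer
# are assumed to be constructed as in the example, and the two loss graphs are assumed
# to be independent (as the back-to-back backward calls above imply).
import torch
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

def amp_gan_step(g_rcnn, images, targets, g_optimizer, d_optimizer):
    with autocast():
        g_loss_dict, d_loss_dict = g_rcnn(images, targets)
        g_losses = sum(g_loss_dict.values())
        d_losses = sum(d_loss_dict.values())

    g_optimizer.zero_grad()
    scaler.scale(g_losses).backward()   # generator / detector losses
    scaler.step(g_optimizer)

    d_optimizer.zero_grad()
    scaler.scale(d_losses).backward()   # discriminator losses
    scaler.step(d_optimizer)
    scaler.update()                     # a single update per iteration, after both steps
    return g_losses.detach(), d_losses.detach()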
Exemple #23
0
def do_train(model, data_loader, data_loader_val, optimizer, scheduler,
             checkpointer, device, checkpoint_period, vis_period, arguments,
             cfg, tb_writer, distributed):
    from tools.train_net import run_test
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    vis_num = 0
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(
            v * cfg.SOLVER.LOSS_WEIGHT.MASK_WEIGHT if k == 'loss_mask' else v *
            cfg.SOLVER.LOSS_WEIGHT.BOX_WEIGHT for k, v in loss_dict.items())
        # losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        loss_dict_reduced = {
            k: (v *
                cfg.SOLVER.LOSS_WEIGHT.MASK_WEIGHT if k == 'loss_mask' else v *
                cfg.SOLVER.LOSS_WEIGHT.BOX_WEIGHT)
            for k, v in loss_dict_reduced.items()
        }

        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        if tb_writer:
            tb_writer.add_scalars('train/Losses',
                                  loss_dict_reduced,
                                  global_step=iteration)
            tb_writer.add_scalar('train/Loss',
                                 losses_reduced,
                                 global_step=iteration)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if cfg.SOLVER.VIS_ON and iteration % vis_period == 0:
            # visualize predict box
            # set model to eval mode
            model.eval()
            vis_image, vis_image_transformed, target = data_loader_val.dataset.get_image(
                vis_num)
            image_list = to_image_list(vis_image_transformed,
                                       cfg.DATALOADER.SIZE_DIVISIBILITY)
            image_list = image_list.to(device)
            cpu_device = torch.device("cpu")
            with torch.no_grad():
                predictions = model(image_list)
                predictions = [o.to(cpu_device) for o in predictions]

            # only one picture
            predictions = predictions[0]
            top_predictions = select_topn_predictions(predictions, 3)

            # visualize
            result = vis_image.copy()
            result = overlay_boxes_cls_names(result, top_predictions, target)

            result = torch.from_numpy(result)
            result = result.permute(2, 0, 1)[None, :, :, :]
            result = make_grid([result])
            if tb_writer:
                tb_writer.add_image('Image_train', result, iteration)
            synchronize()
            model.train()
            vis_num += 1
            vis_num %= len(data_loader_val.dataset)
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

            # eval
            model.eval()
            results = run_test(cfg,
                               model,
                               distributed,
                               iter=iteration,
                               valid=True)
            if tb_writer:
                for result in results:
                    for k, v in result.items():
                        tb_writer.add_scalar('valid/{}'.format(k),
                                             v,
                                             global_step=iteration)
            synchronize()
            model.train()

        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
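# Hedged worked example of the loss re-weighting used above: loss_mask is scaled by the
# mask weight and every other term by the box weight before summation. The numbers and
# the MASK_WEIGHT / BOX_WEIGHT names are illustrative stand-ins for
# cfg.SOLVER.LOSS_WEIGHT.MASK_WEIGHT / BOX_WEIGHT.
import torch

MASK_WEIGHT, BOX_WEIGHT = 2.0, 1.0
loss_dict = {'loss_mask': torch.tensor(0.5),
             'loss_box_reg': torch.tensor(0.3),
             'loss_classifier': torch.tensor(0.7)}

weighted = {k: v * (MASK_WEIGHT if k == 'loss_mask' else BOX_WEIGHT)
            for k, v in loss_dict.items()}
losses = sum(weighted.values())     # tensor(2.0): 0.5*2.0 + 0.3*1.0 + 0.7*1.0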
Exemple #24
0
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    summary_writer,
    device,
    checkpoint_period,
    summary_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

        if iteration % summary_period == 0:
            summary_writer.add_image(
                'input_image',
                vutils.make_grid(images.tensors[:, [2, 1, 0]], normalize=True),
                iteration)
            summary_writer.add_scalar('learning_rate',
                                      optimizer.param_groups[0]['lr'],
                                      iteration)
            summary_writer.add_scalar(
                'model/loss_rpn_box_reg',
                loss_dict_reduced['loss_rpn_box_reg'].item(), iteration)
            summary_writer.add_scalar('model/loss_mask',
                                      loss_dict_reduced['loss_mask'].item(),
                                      iteration)
            summary_writer.add_scalar('model/loss_box_reg',
                                      loss_dict_reduced['loss_box_reg'].item(),
                                      iteration)
            summary_writer.add_scalar(
                'model/loss_classifier',
                loss_dict_reduced['loss_classifier'].item(), iteration)
            if 'loss_maskiou' in loss_dict_reduced:
                summary_writer.add_scalar(
                    'model/loss_maskiou',
                    loss_dict_reduced['loss_maskiou'].item(), iteration)
            summary_writer.add_scalar(
                'model/loss_objectness',
                loss_dict_reduced['loss_objectness'].item(), iteration)
            summary_writer.add_scalar('model/loss', losses_reduced.item(),
                                      iteration)

        iteration = iteration + 1

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
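# Hedged note on step ordering, not a change to the examples: several of these trainers
# call scheduler.step() before optimizer.step(), which was the convention before
# PyTorch 1.1.0. Since 1.1.0 the documented order is optimizer.step() first; the old
# order skips the first value of the schedule and triggers a UserWarning. Minimal sketch
# with toy components:
import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

for iteration in range(30):
    loss = model(torch.randn(8, 4)).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()        # after the optimizer update, per the post-1.1.0 convention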
Exemple #25
0
def train(cfg,
          local_rank,
          distributed,
          model_config=None,
          use_tensorboard=True):
    model = build_detection_model(cfg, model_config)
    if get_rank() == 0:
        if 'search' in cfg.MODEL.BACKBONE.CONV_BODY:
            print('backbone search space:', blocks_key)
        else:
            print('backbone:', cfg.MODEL.BACKBONE)
        if 'search' in cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR or 'search' in cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH:
            print('head search space:', head_ss_keys)
        else:
            print('head:', cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR,
                  cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH)
        if 'search' in cfg.MODEL.INTER_MODULE.NAME:
            print('inter search space:', inter_ss_keys)
        else:
            print('inter:', cfg.MODEL.INTER_MODULE.NAME)
        print(model)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer, lr_dict = make_optimizer(cfg, model)
    if get_rank() == 0:
        for item in lr_dict:
            print(item)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    if not ('search' in cfg.MODEL.BACKBONE.CONV_BODY
            or 'search' in cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR
            or 'search' in cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH):
        use_mixed_precision = cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    # if 'search' in cfg.MODEL.BACKBONE.CONV_BODY:
    #     def forward_hook(module: Module, inp: (Tensor,)):
    #         if module.weight is not None:
    #             module.weight.requires_grad = True
    #         if module.bias is not None:
    #             module.bias.requires_grad = True

    #     all_modules = (nn.Conv2d, nn.Linear, nn.BatchNorm2d, nn.GroupNorm, ) # update group norm too!!
    #     for m in model.modules():
    #         if isinstance(m, all_modules):
    #             m.register_forward_pre_hook(forward_hook)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg,
                                           is_train=False,
                                           is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    if use_tensorboard:
        meters = TensorboardLogger(cfg=cfg,
                                   log_dir=cfg.TENSORBOARD_EXPERIMENT,
                                   start_iter=arguments['iteration'],
                                   delimiter="  ")
    else:
        meters = MetricLogger(delimiter="  ")

    do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
        meters,
    )

    return model
Exemple #26
0
def do_train(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints", )
    dataset_names = cfg.DATASETS.TEST

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if any(len(target) < 1 for target in targets):
            logger.error(
                f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}"
            )
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        # with amp.scale_loss(losses, optimizer) as scaled_losses:
        #     scaled_losses.backward()
        losses.backward()
        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e.g. for TensorBoard
                model,
                # The method changes the segmentation mask format in a data loader,
                # so every time a new data loader is created:
                make_data_loader(cfg,
                                 is_train=False,
                                 is_distributed=(get_world_size() > 1),
                                 is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False
                if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=None,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val,
                                    _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(
                        loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join([
                    "[Validation]: ",
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Exemple #27
0
def do_train(
    reid_model,
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    reid_model.eval()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]
        result, loss_dict = model(images, targets)
        images_reid, labels_reid = resize_to_image(images.tensors, targets,
                                                   result)
        if images_reid is None:
            # pass
            loss_dict.update(
                dict(cls_loss=torch.tensor(0).type_as(
                    loss_dict['loss_classifier'])))
            loss_dict.update(
                dict(tri_loss=torch.tensor(0).type_as(
                    loss_dict['loss_classifier'])))
        else:
            images_reid = [o.to(device) for o in images_reid]
            labels_reid = labels_reid.to(device)
            loss_dict = reid_model(images_reid, labels_reid, iteration,
                                   'train', loss_dict)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # with amp.scale_loss(losses, optimizer) as scaled_loss:
        #     scaled_loss.backward()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
def do_da_train(model, source_data_loader, target_data_loader, optimizer,
                scheduler, checkpointer, device, checkpoint_period, arguments,
                cfg):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter=" ")
    # todo li: why is max_iter set here?
    max_iter = len(source_data_loader)
    start_iter = arguments["iteration"]
    # switch to training mode; this does not start training by itself
    model.train()
    start_training_time = time.time()
    end = time.time()
    # Unlike the SHOT code, there is no separate for-loop over source_data_loader here because zip handles it.
    # Note that `iteration` is not an epoch: each iteration consumes one batch from the DataLoaders, not a full pass over the dataset.
    for iteration, ((source_images, source_targets, idx1), (target_images, target_targets, idx2))\
            in enumerate(zip(source_data_loader, target_data_loader), start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration

        # source and target data
        images = (source_images + target_images).to(device)
        targets = [
            target.to(device)
            for target in list(source_targets + target_targets)
        ]

        # forward pass
        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # backward pass
        losses.backward()
        optimizer.step()

        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        # log once every 20 iterations
        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter - 1:
            checkpointer.save("model_final", **arguments)
        if torch.isnan(losses_reduced).any():
            logger.critical('Loss is NaN, exiting...')
            return

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))


# DA end
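# Hedged side note for do_da_train above: zip() stops at the shorter of the two loaders,
# so the loop runs for min(len(source), len(target)) iterations even though max_iter is
# taken from the source loader. A minimal sketch of cycling the target loader so every
# source batch is seen (toy lists stand in for the DataLoaders; with a real DataLoader,
# itertools.cycle replays the first epoch's shuffling order):
from itertools import cycle

source_loader = ['src_%d' % i for i in range(10)]
target_loader = ['tgt_%d' % i for i in range(4)]

for iteration, (src_batch, tgt_batch) in enumerate(zip(source_loader, cycle(target_loader))):
    print(iteration, src_batch, tgt_batch)   # source batches appear once, target batches repeat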
Exemple #29
0
def do_train(model,
             data_loader,
             optimizer,
             scheduler,
             checkpointer,
             device,
             checkpoint_period,
             arguments,
             tb_log_dir,
             use_tensorboard=False):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = TensorboardLogger(log_dir=tb_log_dir,
                               delimiter="  ") \
             if use_tensorboard else MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]
        loss_dict, preds = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(iteration, loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()

        accum_grad = 0
        for p in list(filter(lambda p: p.grad is not None,
                             model.parameters())):
            accum_grad += p.grad.data.norm(2).item()

        if iteration > 500 and accum_grad > 200:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 200)
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(iteration, time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == (max_iter - 1):

            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration % checkpoint_period == 0:
            meters.update_image(iteration, images.tensors[0], preds[0],
                                targets[0])
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
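# Hedged simplification of the manual gradient-norm bookkeeping above: the example sums
# per-parameter norms in a Python loop, while torch.nn.utils.clip_grad_norm_ computes
# the norm of all gradients taken together, clips in place, and returns that total norm,
# so the loop can be dropped when the combined norm is what matters. Toy model below.
import torch

model = torch.nn.Linear(8, 3)
loss = model(torch.randn(16, 8)).pow(2).mean()
loss.backward()

total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=200.0)
if float(total_norm) > 200.0:
    print('gradients were clipped (pre-clip norm %.2f)' % float(total_norm))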
Exemple #30
0
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        # print('images: ', images)
        # print('targets: ', targets, targets[0].bbox)
        # print('targets: ', type(targets[0]), type(targets))
        data_time = time.time() - end
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        # print('images.size(): ', images.tensors.size(), images.image_sizes)
        # print('targets: ', targets)
        loss_dict = model(images=images,
                          iteration=iteration + 1,
                          targets=targets)
        # print('loss_dict: ', loss_dict)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if (iteration + 1) % 20 == 0 or (iteration + 1) == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration + 1,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if (iteration + 1) % checkpoint_period == 0 or (iteration +
                                                        1) == max_iter:
            checkpointer.save("model_{:07d}".format(iteration + 1),
                              **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Exemple #31
0
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    tb_logger,
    cfg,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration
        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        if cfg.SOLVER.USE_ADAM:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == (max_iter - 1):
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
            if is_main_process():
                for tag, value in loss_dict_reduced.items():
                    tb_logger.scalar_summary(tag, value.item(), iteration)
        if iteration % checkpoint_period == 0 and iteration > 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )