Example #1
def finetune_first_image(model, images, targets, optimizer, scheduler, logger, cfg):
    total_iter_finetune = cfg.FINETUNE.TOTAL_ITER
    model.train()
    meters = MetricLogger(delimiter="  ")
    for iteration in range(total_iter_finetune):

        scheduler.step()
        loss_dict, _ = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        meters.update(lr=optimizer.param_groups[0]["lr"])

        if iteration % (total_iter_finetune // 2) == 0:
            logger.info(
                meters.delimiter.join(
                    [
                        "{meters}",
                    ]
                ).format(
                    meters=str(meters),
                )
            )

    model.eval()
    return model
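
Example #1 (and most of the training loops below) calls reduce_loss_dict so that the logged losses are averaged over all GPUs, while the backward pass still uses the local, unreduced losses. As a point of reference, a minimal sketch of such a helper, assuming torch.distributed is the backend (the upstream maskrcnn_benchmark utility may differ in details such as reducing only to rank 0):

import torch
import torch.distributed as dist


def reduce_loss_dict(loss_dict):
    # Average a dict of scalar loss tensors across processes, for logging only.
    world_size = dist.get_world_size() if (dist.is_available() and dist.is_initialized()) else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())
        stacked = torch.stack([loss_dict[k] for k in names], dim=0)
        dist.all_reduce(stacked)   # sum over all ranks
        stacked /= world_size      # turn the sum into a mean
        return {k: v for k, v in zip(names, stacked)}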
Example #2
    def test_update(self):
        meter = MetricLogger()
        for i in range(10):
            meter.update(metric=float(i))

        m = meter.meters["metric"]
        self.assertEqual(m.count, 10)
        self.assertEqual(m.total, 45)
        self.assertEqual(m.median, 4)
        self.assertEqual(m.avg, 4.5)
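
The test above pins down the meter interface assumed throughout these examples: every entry of MetricLogger.meters tracks count, total, a windowed median and avg, and a global_avg (used by the evaluation snippets). Below is a minimal sketch of that behaviour, not the exact upstream implementation; the window size of 20 is an assumption.

from collections import defaultdict, deque

import torch


class SmoothedValue(object):
    # Tracks a window of recent values plus a global count/total (sketch).
    def __init__(self, window_size=20):
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0

    def update(self, value):
        self.deque.append(value)
        self.count += 1
        self.total += value

    @property
    def median(self):
        return torch.tensor(list(self.deque), dtype=torch.float32).median().item()

    @property
    def avg(self):
        return torch.tensor(list(self.deque), dtype=torch.float32).mean().item()

    @property
    def global_avg(self):
        return self.total / self.count


class MetricLogger(object):
    # Maps metric names to SmoothedValue meters and formats them for logging (sketch).
    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        for name, value in kwargs.items():
            if isinstance(value, torch.Tensor):
                value = value.item()
            self.meters[name].update(value)

    def __getattr__(self, attr):
        # lets callers write e.g. meters.time.global_avg
        if attr in self.__dict__.get("meters", {}):
            return self.meters[attr]
        raise AttributeError(attr)

    def __str__(self):
        return self.delimiter.join(
            "{}: {:.4f} ({:.4f})".format(name, m.median, m.global_avg)
            for name, m in self.meters.items())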
Example #3
def evaluator(cfg, args, model, device, iteration):
    meters_val = MetricLogger(delimiter="  ")

    data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=False)[0]
        
    with torch.no_grad():
        # Should be one image for each GPU:
        print('Calculating evaluation loss.')
        for iteration_val, batch in enumerate(data_loader_val):
            #if is_main_process():
            #    print(iteration_val)
            if args.debug and iteration_val > 10:
                break
            images_val, targets_val, _ = batch
            
            skip_batch = False
            nbox = []
            for t in targets_val:
                nbox.append(len(t))
                if len(t) < 1:
                    skip_batch = True
                    break
            if skip_batch:
                continue
            try:
                print(iteration_val, nbox)
                images_val = images_val.to(device)
                targets_val = [target.to(device) for target in targets_val]
                loss_dict = model(images_val, targets_val)
                losses = sum(loss for loss in loss_dict.values())
                loss_dict_reduced = reduce_loss_dict(loss_dict)
                losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            except Exception:
                # catch Exception (not a bare except) so bad ground truth is
                # skipped without swallowing KeyboardInterrupt
                print('Warning: ground truth error.')
    
        #synchronize()

        if is_main_process():
            print('Save evaluation loss to tensorboard.')
            for name, meter in meters_val.meters.items():
                print(name,meter.global_avg)
                args.writer.add_scalar('EvalMetrics/'+name, meter.global_avg, iteration / args.iters_per_epoch)
            print('Pass')
Example #4
def do_val(model=None, data_loader_val=None, device=None):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    meters = MetricLogger(delimiter="  ")

    for images, targets, _ in data_loader_val:

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        with torch.no_grad():
            loss_dict = model(images, targets)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

    logger.info(meters.delimiter.join(["Val {meters}"]).format(meters=meters.str_avg(),))

    return meters
Example #5
def validation(model, data_loader, device, logger, tensorboard_logger,
               iteration):
    logger.info('-' * 40)
    logger.info("Start Validation")
    meters = MetricLogger(delimiter="  ")
    start_validation_time = time.time()

    max_iter = len(data_loader)

    for idx, batch in enumerate(tqdm(data_loader)):
        images, targets, _ = batch
        images = images.to(device)
        targets = [target.to(device) for target in targets]

        with torch.no_grad():
            loss_dict, _ = model(images, targets)
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(total_loss=losses_reduced, **loss_dict_reduced)

    tensorboard_logger.write(meters, iteration, phase='Valid')
    logger.info('Validation:')
    logger.info(
        meters.delimiter.join([
            "iter: {iter}",
            "{meters}",
            "max mem: {memory:.0f}",
        ]).format(
            iter=iteration,
            meters=str(meters),
            memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
        ))

    total_validation_time = time.time() - start_validation_time
    total_time_str = str(datetime.timedelta(seconds=total_validation_time))
    logger.info("Total Validation time: {} ({:.4f} s / it)".format(
        total_time_str, total_validation_time / (max_iter)))
    logger.info('-' * 40)
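
Example #5 writes the meters through tensorboard_logger.write(meters, iteration, phase='Valid'), and Example #17 below constructs such a logger as TensorboardXLogger(log_dir=...). That wrapper is not shown in these snippets; the class name and the write signature are taken from those calls, while the body below is only a plausible sketch:

from tensorboardX import SummaryWriter


class TensorboardXLogger(object):
    # Hypothetical wrapper that dumps MetricLogger meters to TensorBoard.
    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir=log_dir)

    def write(self, meters, iteration, phase='Train'):
        # one scalar per meter, grouped under the phase name
        for name, meter in meters.meters.items():
            self.writer.add_scalar('{}/{}'.format(phase, name),
                                   meter.global_avg, iteration)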
Example #6
    def run_eval(self):
        if isrank0:
            logging.info("Dataset is %s; len of loader %d" %
                         (self.dataset, len(self.loader)))
            logging.info("Split is %s" % (self.split))
        meters = MetricLogger(delimiter=" ")

        # loop over data loader
        start = time.time()
        # -------------------   forward model ------------------------------
        for batch_idx, (inputs, imgs_names, targets, seq_name,
                        starting_frame) in enumerate(self.loader):
            meters.update(dT=time.time() - start)
            if batch_idx % 5 == 0:
                logging.info('[{}] {}/{};{} '.format(
                    args.distributed_manully_rank, batch_idx, len(self.loader),
                    meters))
            targets = targets.cuda()  # use our collate function
            inputs = inputs.cuda()
            cur_device = inputs.device
            CHECK4D(targets)  # B, Len, O, HW
            CHECK5D(inputs)  # B Len D H W
            if args.load_proposals_dataset:
                proposals_cur_batch = imgs_names
                proposals = []
                for proposal_cur_vid in proposals_cur_batch:
                    boxlist = list(
                        proposal_cur_vid)  # BoxList of current batch
                    boxlist = [b.to(cur_device) for b in boxlist]
                    proposals.append(boxlist)  # BoxList of current batch
                imgs_names = None
            else:
                proposals = None
            with torch.no_grad():
                self.evaler(batch_idx, inputs, imgs_names, targets, seq_name,
                            args, proposals)
            meters.update(bT=time.time() - start)
            start = time.time()
Example #7
def main():

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # model = train(cfg, args.local_rank, args.distributed)
    model = build_detection_model(cfg)
    # add
    print(model)
    all_index = []
    for index, item in enumerate(model.named_parameters()):
        all_index.append(index)
        print(index)
        print(item[0])
        print(item[1].size())
    print("All index of the model: ", all_index)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=args.distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # run_test(cfg, model, args.distributed)
    # pruning
    m = Mask(model)
    m.init_length()
    m.init_length()
    print("-" * 10 + "one epoch begin" + "-" * 10)
    print("remaining ratio of pruning : Norm is %f" % args.rate_norm)
    print("reducing ratio of pruning : Distance is %f" % args.rate_dist)
    print("total remaining ratio is %f" % (args.rate_norm - args.rate_dist))

    m.modelM = model
    m.init_mask(args.rate_norm, args.rate_dist)

    m.do_mask()
    m.do_similar_mask()
    model = m.modelM
    m.if_zero()
    # run_test(cfg, model, args.distributed)

    # changed to an explicit training loop here to make it easy to apply the Mask
    # do_train(
    #     model,
    #     data_loader,
    #     optimizer,
    #     scheduler,
    #     checkpointer,
    #     device,
    #     checkpoint_period,
    #     arguments,
    # )
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        # print("Loss dict",loss_dict)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()

        # prun
        # Mask grad for iteration
        m.do_grad_mask()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        # prun
        # 7375 iterations correspond to one epoch with batch size 16 on the ~118K COCO training images
        if iteration % args.iter_pruned == 0 or iteration == cfg.SOLVER.MAX_ITER - 5000:
            m.modelM = model
            m.if_zero()
            m.init_mask(args.rate_norm, args.rate_dist)
            m.do_mask()
            m.do_similar_mask()
            m.if_zero()
            model = m.modelM
            if args.use_cuda:
                model = model.cuda()
            #run_test(cfg, model, args.distributed)

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
Example #8
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if any(len(target) < 1 for target in targets):
            logger.error(
                f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}"
            )
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        with amp.scale_loss(losses, optimizer) as scaled_losses:
            scaled_losses.backward()
        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
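
The amp.scale_loss context used in Example #8 comes from NVIDIA apex and only has an effect after the model and optimizer have been wrapped with amp.initialize. Below is a minimal setup sketch, assuming apex is installed and opt_level "O1" (mixed precision with dynamic loss scaling) is wanted; the helper names are illustrative, not part of the original code.

from apex import amp


def setup_mixed_precision(model, optimizer, opt_level="O1"):
    # Wrap model and optimizer once, before the training loop.
    # "O0" would keep everything in FP32, making scale_loss a no-op.
    return amp.initialize(model, optimizer, opt_level=opt_level)


def backward_step(losses, optimizer):
    # Backward pass with apex loss scaling, mirroring the loop above.
    optimizer.zero_grad()
    with amp.scale_loss(losses, optimizer) as scaled_losses:
        scaled_losses.backward()
    optimizer.step()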
Example #9
def do_face_train_triplet(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
    divs_nums,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    dataset_names = cfg.DATASETS.TEST
    for iteration, (img_a, img_p, img_n, label_p,
                    label_n) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration
        img_a_list, _ = divs_tensors(device=device,
                                     tensors=img_a,
                                     targets=None,
                                     divs_nums=divs_nums)
        img_p_list, label_p_list = divs_tensors(device=device,
                                                tensors=img_p,
                                                targets=label_p,
                                                divs_nums=divs_nums)
        img_n_list, label_n_list = divs_tensors(device=device,
                                                tensors=img_n,
                                                targets=label_n,
                                                divs_nums=divs_nums)
        #### ======== splitting the batch may affect the BatchNorm layers ======== ####
        optimizer.zero_grad()
        for img_a, img_p, img_n, label_p, label_n in zip(
                img_a_list, img_p_list, img_n_list, label_p_list,
                label_n_list):
            loss_dict = model(tensors=[img_a, img_p, img_n],
                              targets=[label_p, label_n],
                              batch=iteration,
                              total_batch=None)
            losses = sum(loss for loss in loss_dict.values())
            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)
            losses /= divs_nums
            with amp.scale_loss(losses, optimizer) as scaled_losses:
                scaled_losses.backward()
        optimizer.step()
        scheduler.step()
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
            if iteration > 40000:
                checkpointer.save_backbone("BACKBONE_{:07d}".format(iteration))
        #### ======== periodic validation ======== ####
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e.g. for TensorBoard
                model,
                # The method changes the segmentation mask format in a data loader,
                # so every time a new data loader is created:
                make_data_loader(cfg,
                                 is_train=False,
                                 is_distributed=(get_world_size() > 1),
                                 is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False
                if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=None,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val,
                                    _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(
                        loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join([
                    "[Validation]: ",
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)
            checkpointer.save_backbone("model_final")
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Example #10
def do_train(model, data_loader, data_loader_val, optimizer, scheduler,
             checkpointer, device, checkpoint_period, vis_period, arguments,
             cfg, tb_writer, distributed):
    from tools.train_net import run_test
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    vis_num = 0
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(
            v * cfg.SOLVER.LOSS_WEIGHT.MASK_WEIGHT if k == 'loss_mask' else v *
            cfg.SOLVER.LOSS_WEIGHT.BOX_WEIGHT for k, v in loss_dict.items())
        # losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        loss_dict_reduced = {
            k: (v *
                cfg.SOLVER.LOSS_WEIGHT.MASK_WEIGHT if k == 'loss_mask' else v *
                cfg.SOLVER.LOSS_WEIGHT.BOX_WEIGHT)
            for k, v in loss_dict_reduced.items()
        }

        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        if tb_writer:
            tb_writer.add_scalars('train/Losses',
                                  loss_dict_reduced,
                                  global_step=iteration)
            tb_writer.add_scalar('train/Loss',
                                 losses_reduced,
                                 global_step=iteration)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if cfg.SOLVER.VIS_ON and iteration % vis_period == 0:
            # visualize predict box
            # set model to eval mode
            model.eval()
            vis_image, vis_image_transformed, target = data_loader_val.dataset.get_image(
                vis_num)
            image_list = to_image_list(vis_image_transformed,
                                       cfg.DATALOADER.SIZE_DIVISIBILITY)
            image_list = image_list.to(device)
            cpu_device = torch.device("cpu")
            with torch.no_grad():
                predictions = model(image_list)
                predictions = [o.to(cpu_device) for o in predictions]

            # only one picture
            predictions = predictions[0]
            top_predictions = select_topn_predictions(predictions, 3)

            # visualize
            result = vis_image.copy()
            result = overlay_boxes_cls_names(result, top_predictions, target)

            result = torch.from_numpy(result)
            result = result.permute(2, 0, 1)[None, :, :, :]
            result = make_grid([result])
            if tb_writer:
                tb_writer.add_image('Image_train', result, iteration)
            synchronize()
            model.train()
            vis_num += 1
            vis_num %= len(data_loader_val.dataset)
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

            # eval
            model.eval()
            results = run_test(cfg,
                               model,
                               distributed,
                               iter=iteration,
                               valid=True)
            if tb_writer:
                for result in results:
                    for k, v in result.items():
                        tb_writer.add_scalar('valid/{}'.format(k),
                                             v,
                                             global_step=iteration)
            synchronize()
            model.train()

        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Example #11
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    tb_logger,
    cfg,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    kkk = 0
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration
        #print(kkk)
        kkk += 1
        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        if cfg.SOLVER.USE_ADAM:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == (max_iter - 1):
            #print(kkk * 10000000)
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
            if is_main_process():
                for tag, value in loss_dict_reduced.items():
                    tb_logger.scalar_summary(tag, value.item(), iteration)
        if iteration % checkpoint_period == 0 and iteration > 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
Example #12
class Evaler(nn.Module):
    """ engine/container for encoder, decoder and all DMM modules:
        match_layer
        feature_extractor
    """
    def __init__(self, DMM, encoder, decoder, args, dmmcfgs):
        super(Evaler, self).__init__()
        self.meters = MetricLogger(delimiter=" ")
        self.decoder = decoder
        self.encoder = encoder
        self.DMM = DMM
        if args.load_proposals and not args.load_proposals_dataset:
            logging.info('load %s' % args.pred_offline_meta)
            self.pred_offline_meta = json.load(
                open(args.pred_offline_meta, 'r'))
            if 'vidfid2index' in self.pred_offline_meta:
                self.pred_offline_meta = self.pred_offline_meta['vidfid2index']
        if args.use_gpu:
            self.encoder.cuda()
            self.decoder.cuda()
            self.DMM.cuda()
        timestr = time.strftime('%m-%d')
        model_name = args.model_name.strip('/')
        model_id = model_name.split('/')[-2] + '/epo' + model_name.split(
            '/')[-1].split('epo')[-1]
        self.eval_output_root = '%s/eval/%s/%s/L%s_%d_s%s/' % (
            args.models_root, timestr, model_id, args.eval_flag,
            args.test_image_h, args.eval_split)
        self.eval_output_root = self.eval_output_root.replace('__', '_')
        timestr = time.strftime('%m-%d-%H-%M')
        save_config_dir = '%s/save_config/%s/' % (self.eval_output_root,
                                                  timestr)
        isrank0 = args.local_rank == 0
        if isrank0:
            if not os.path.exists(save_config_dir):
                make_dir(save_config_dir)
            yaml.dump(
                args, open(os.path.join(save_config_dir, 'eval_args.yaml'),
                           'w'))
            yaml.dump(
                dmmcfgs,
                open(os.path.join(save_config_dir, 'eval_dmm_config.yaml'),
                     'w'))
        json_file = open(get_db_path(args.eval_split), 'r')
        self.seq_dir = get_img_path(args.eval_split)
        self.anno_dir = get_anno_path(args.eval_split)
        self.data = json.load(json_file)
        if isrank0: logging.info('num vid %d' % len(self.data['videos']))
        self.DMM.eval()
        self.encoder.eval()
        self.decoder.eval()

    def forward(self, batch_idx, inputs, imgs_names, targets, seq_name, args,
                proposals_input):
        """ Evaluation 
        forward a batch of clip
        """
        if args.pad_video:
            CHECK4D(targets)  # B,len,O,HW
            CHECK5D(inputs)  # B,len,O,H,W
        device_id = torch.cuda.current_device()
        if args.batch_size == 1:
            if not args.distributed: seq_name = [seq_name[device_id]]
        else:
            batch_size_device = int(args.batch_size / args.ngpus)
            if not args.distributed:
                seq_name = seq_name[device_id *
                                    batch_size_device:(1 + device_id) *
                                    batch_size_device]
        CHECKEQ(len(seq_name), len(inputs))
        njpgs_batch, img_shape, frame_names_batch = self.prepare_frame_names(
            seq_name)
        # send batch to GPU
        prev_thid_list = None
        B, nframe, O, H, W = inputs.shape
        max_length_clip = min(nframe, args.length_clip)
        for frame_idx in range(max_length_clip):
            tic = time.time()
            extra_frame = [njpgs <= frame_idx for njpgs in njpgs_batch]
            proposal_cur, predid_cur_frames = None, None
            if args.load_proposals and proposals_input is None:
                predid = [0] * len(seq_name)
                for b, seq_n in enumerate(seq_name):
                    if extra_frame[b]:
                        predid[b] = len(self.encoder.pred_offline) - 1
                    else:
                        frame_name = imgs_names[b][frame_idx]  # tuple + b name
                        predid[b] = int(
                            self.pred_offline_meta[seq_n][frame_name])
                predid_cur_frames = predid
            elif proposals_input is not None:
                proposal_cur = []
                for b in range(B):
                    if len(proposals_input[b]) > frame_idx:
                        proposal_cur.append(proposals_input[b][frame_idx])
                    else:
                        proposal_cur.append(proposals_input[b][-1])
            x = inputs[:, frame_idx]  # B,1->0,O,H,W, select 1 from clip len
            Bx, Cx, Hx, Wx = x.shape
            # targets shape: B,len,O,H*W
            # input shape:   B,len,3(Cx),H,W
            y_mask = targets[:, frame_idx][:, :, :-1].float()
            CHECKEQ(Hx * Wx, y_mask.shape[-1])
            CHECKEQ(Bx, y_mask.shape[0])
            B, O, HW = CHECK3D(y_mask)
            CHECKEQ(Bx, B)
            if frame_idx == 0:
                mask_hist = None
                tplt_dict, tplt_valid_batch, proposals = \
                   self.forward_timestep_init(args, x, y_mask, predid_cur_frames, proposal_cur)
                prev_thid_list, thid_list = None, None
                prev_mask = y_mask.view(B, O, HW)
                outs = y_mask
                init_pred_inst = y_mask.view(B, O, H, W)
                infos = {
                    'args': args,
                    'shape': img_shape,
                    'extra_frame': extra_frame,
                    'valid': tplt_valid_batch,
                    'predid': predid_cur_frames
                }
                _, prev_thid_list, _, _, _  = self.inference_timestep(infos, tplt_dict, x, y_mask, \
                    prev_thid_list=prev_thid_list, prev_mask=prev_mask, mask_hist=mask_hist, proposal_cur=proposal_cur)
            else:
                # ---- start inference of current batch ----
                infos = {
                    'args': args,
                    'shape': img_shape,
                    'extra_frame': extra_frame,
                    'valid': tplt_valid_batch,
                    'predid': predid_cur_frames
                }
                outs, thid_list, init_pred_inst, proposals, mask_hist = self.inference_timestep(
                    infos,
                    tplt_dict,
                    x,
                    y_mask,
                    prev_thid_list=prev_thid_list,
                    prev_mask=prev_mask,
                    mask_hist=mask_hist,
                    proposal_cur=proposal_cur)
                self.meters.update(ft=time.time() - tic)
                prev_mask = outs.view(B, O, HW)
                if not args.only_spatial:
                    prev_thid_list = thid_list
                prev_mask = outs.view(B, O, HW) if frame_idx > 0 else y_mask
            # ---------------- save merged mask ----------------------
            for b in range(B):
                if extra_frame[b]: continue  # skip the extra frames
                saved_name = self.eval_output_root + 'merged/%s/%s.png' % (
                    seq_name[b], frame_names_batch[b][frame_idx])
                obj_index = tplt_valid_batch[b].sum()
                refine_mask = outs[b, :obj_index].view(-1, H * W)
                refine_bg = 1 - refine_mask.max(0)[0]
                refine_fbg = torch.cat(
                    [refine_bg.view(1, H, W),
                     refine_mask.view(-1, H, W)],
                    dim=0)
                max_v, max_i = refine_fbg.max(0)
                eval_helper.plot_scores_map(max_i.float(), saved_name)
            # ---------------- save outs to mask  ----------------------
            del outs, thid_list, x, y_mask, init_pred_inst
        if (batch_idx % 10 == 0 and args.local_rank == 0):
            logging.info('save at {}'.format(self.eval_output_root))
            logging.info(self.meters)

    def inference_timestep(self, infos, tplt_dict, x, y_mask, prev_thid_list,
                           prev_mask, mask_hist, proposal_cur):
        r""" inference for frames at current image step, 
        argument:
            infos: 'args','shape','extra_frame','valid','predid'}
                   img_shape: list, len=B, element: [h,w]
            x: shape: B,3,H W
            y_mask: B,O,H W
        return 
            init_pred_inst: BOHW, prediction from the mask branch, without refine  
            tplt_dict, 
            proposals, mask_hist_new #4,5,6,7,8    
        """
        args = infos['args']
        img_shape = infos['shape']
        extra_frame, tplt_valid_batch = infos['extra_frame'], infos['valid']
        hidden_spatial = None
        out_masks = []
        assert (isinstance(x, torch.Tensor))
        features, proposals, _ = self.encoder(
            args, x, predid_cur_frames=infos['predid'], proposals=proposal_cur)
        bone_feat = features['backbone_feature']  # B,Lev,(D,H,W);
        B, D, H, W = x.shape
        thid_list = []
        B, O, HW = CHECK3D(prev_mask)
        if mask_hist is None:
            mask_hist = prev_mask.view(B, O, H, W)
        assert ('mask' in proposals[0].fields())
        init_pred_inst, tplt_dict, match_loss, mask_hist_new \
            = self.DMM.inference(infos, proposals, bone_feat, mask_hist, tplt_dict )
        valid_num_obj_max = max(
            1, (tplt_valid_batch.sum(0) > 0).sum())  # shape: 1, O
        for t in range(0, valid_num_obj_max):
            if prev_thid_list is not None:
                hidden_temporal = prev_thid_list[t]
                if args.only_temporal:
                    hidden_spatial = None
            else:
                hidden_temporal = None
            mask_lstm = []
            maxpool = nn.MaxPool2d((2, 2), ceil_mode=True)
            prev_m_inst = torch.cat([
                prev_mask[:, t, :].view(B, 1, H * W), y_mask[:, t, :].view(
                    B, 1, H * W), init_pred_inst[:, t].view(B, 1, H * W)
            ],
                                    dim=2).view(B, 3, H,
                                                W)  # cat along new dim
            prev_m_inst = maxpool(prev_m_inst)
            for _ in range(len(features['refine_input_feat'])):
                prev_m_inst = maxpool(prev_m_inst)
                mask_lstm.append(prev_m_inst)
            mask_lstm = list(reversed(mask_lstm))
            out_mask, hidden = self.decoder(features['refine_input_feat'],
                                            mask_lstm, hidden_spatial,
                                            hidden_temporal)
            hidden_tmp = [hidden[ss][0] for ss in range(len(hidden))]
            hidden_spatial = hidden
            thid_list.append(hidden_tmp)
            upsample_match = nn.UpsamplingBilinear2d(size=(x.size()[-2],
                                                           x.size()[-1]))
            out_mask = upsample_match(out_mask)
            for b in range(B):  # should behave differently for different videos
                is_template_valid_cur_b = tplt_valid_batch[b, t]  # current batch
                if not is_template_valid_cur_b: continue
                mask_hist_new[b, t:t + 1, :, :] = torch.sigmoid(
                    out_mask[b])  # shape: B,O,H,W and B,1,H,W
            out_mask = out_mask.view(out_mask.size(0), -1)
            out_masks.append(out_mask)
            del mask_lstm, hidden_temporal, hidden_tmp, prev_m_inst, out_mask
        out_masks = torch.cat(out_masks, 1).view(out_masks[0].size(0),
                                                 len(out_masks), -1)  # B,O,HW
        outs = torch.sigmoid(out_masks)
        outs_pad = outs.new_zeros(B, O, HW)
        outs_pad[:, :valid_num_obj_max, :] = outs
        return outs_pad, thid_list, init_pred_inst, proposals, mask_hist_new

    def forward_timestep_init(self, args, x, y_mask, predid_cur_frames,
                              proposal_cur):
        features, proposals, cocoloss = self.encoder(
            args,
            x,
            predid_cur_frames=predid_cur_frames,
            proposals=proposal_cur)
        B, D, H, W = CHECK4D(x)
        tplt_valid_batch = []
        for b in range(B):
            prop, template_valid = ohw_mask2boxlist(y_mask[b].view(-1, H,
                                                                   W))  # OHW
            tplt_valid_batch.append(template_valid)  # append O
            proposals[b] = prop
        tplt_valid_batch = torch.stack(tplt_valid_batch, dim=0)
        tplt_dict = self.DMM.fill_template_dict(args, proposals, features,
                                                y_mask, tplt_valid_batch)
        return tplt_dict, tplt_valid_batch, proposals

    def prepare_frame_names(self, seq_name):
        njpgs_batch = []
        img_shape = []
        frame_names_batch = []
        for inx, seq_name_b in enumerate(seq_name):
            frame_names = np.sort(os.listdir(self.seq_dir + '/' + seq_name_b))
            frame_names = [
                os.path.splitext(os.path.basename(fullname))[0]
                for fullname in frame_names
            ]
            vid_img = np.array(
                Image.open(self.seq_dir + '/' + seq_name_b +
                           '/%s.jpg' % frame_names[0]))
            img_h, img_w, _ = vid_img.shape
            img_shape.append([img_h, img_w])
            seq_info = self.data['videos'][seq_name_b]['objects']
            frame_names_has_obj = []
            for obj_id in seq_info.keys():  # loop over all objects
                for frame_name in seq_info[obj_id]['frames']:
                    if frame_name not in frame_names_has_obj:  # add if this is a new frame
                        frame_names_has_obj.append(frame_name)
            start_annotation_frame = frame_names_has_obj[0]
            id_start = frame_names.index(start_annotation_frame)
            if id_start != 0:
                logging.warning('found a video whose annotation does not start from the first '
                                'rgb frame: {}; {}'.format(seq_name_b, frame_names[0]))
                frame_names = frame_names[id_start:]
            frame_names_batch.append(frame_names)
            njpgs = len(frame_names)
            njpgs_batch.append(njpgs)
        return njpgs_batch, img_shape, frame_names_batch
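
Example #12 relies on project-specific shape assertions (CHECK3D, CHECK4D, CHECK5D, CHECKEQ) whose definitions are not included in the snippet. A minimal sketch consistent with how they are used above, i.e. assert the tensor rank (or equality) and return the shape; these bodies are assumptions, not the project's actual helpers:

def CHECKEQ(a, b):
    # simple equality assertion used for sanity checks
    assert a == b, "expected {} == {}".format(a, b)


def _check_dim(tensor, ndim):
    assert tensor.dim() == ndim, \
        "expected a {}-D tensor, got shape {}".format(ndim, tuple(tensor.shape))
    return tensor.shape


def CHECK3D(tensor):
    return _check_dim(tensor, 3)  # e.g. B, O, H*W


def CHECK4D(tensor):
    return _check_dim(tensor, 4)  # e.g. B, len, O, H*W


def CHECK5D(tensor):
    return _check_dim(tensor, 5)  # e.g. B, len, D, H, W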
Example #13
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    disable_allreduce_for_logging,
    per_iter_start_callback_fn=None,
    per_iter_end_callback_fn=None,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    def prefetcher(load_iterator):
        prefetch_stream = torch.cuda.Stream()
        pad_batches = []

        def _prefetch():
            try:
                # I'm not sure why the trailing _ is necessary but the reference used
                # "for i, (images, targets, _) in enumerate(data_loader):" so I'll keep it.
                images, targets, _ = next(load_iterator)
            except StopIteration:
                return None, None

            with torch.cuda.stream(prefetch_stream):
                # TODO:  I'm not sure if the dataloader knows how to pin the targets' datatype.
                targets = [
                    target.to(device, non_blocking=True) for target in targets
                ]
                images = images.to(device, non_blocking=True)

            return images, targets

        next_images, next_targets = _prefetch()

        while next_images is not None:
            torch.cuda.current_stream().wait_stream(prefetch_stream)
            current_images, current_targets = next_images, next_targets
            next_images, next_targets = _prefetch()
            yield current_images, current_targets

    synchronize()
    optimizer.zero_grad()
    for iteration, (images,
                    targets) in enumerate(prefetcher(iter(data_loader)),
                                          start_iter):

        if per_iter_start_callback_fn is not None:
            per_iter_start_callback_fn(iteration=iteration)

        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        if not disable_allreduce_for_logging:
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)
        else:
            meters.update(loss=losses, **loss_dict)

        # optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        # with optimizer.scale_loss(losses) as scaled_losses:
        with amp.scale_loss(losses, optimizer) as scaled_losses:
            scaled_losses.backward()
        optimizer.step()
        # set_grads_to_none(model)
        optimizer.zero_grad()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0 and arguments["save_checkpoints"]:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter and arguments["save_checkpoints"]:
            checkpointer.save("model_final", **arguments)

        # per-epoch work (testing)
        if per_iter_end_callback_fn is not None:
            # Note: iteration has been incremented previously for
            # human-readable checkpoint names (i.e. 60000 instead of 59999)
            # so need to adjust again here
            early_exit = per_iter_end_callback_fn(iteration=iteration - 1)
            if early_exit:
                break

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
    if per_iter_end_callback_fn is not None:
        if early_exit:
            return True
        else:
            return False
    else:
        return None
Example #14
def do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, warmup_layers, warmup_iters):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    NEED_UNFREEZE = False
    if start_iter < warmup_iters and len(warmup_layers) != 0:
        l = freeze_modules(model, lambda x: x in warmup_layers)
        logger.info(f"Warmup layers are {l}")
        NEED_UNFREEZE = True

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        if iteration > warmup_iters and NEED_UNFREEZE:
            l = freeze_modules(model, lambda x: True)
            logger.info(f"Train layer {l}")
            NEED_UNFREEZE = False

        # Clear cuda cache.
        # torch.cuda.empty_cache()  # TODO check if it helps

        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Example #15
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
Example #16
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    output_dir,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    writer = SummaryWriter(log_dir=os.path.join(output_dir, 'run'))

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        # add tensorboard -- tuo
        if iteration % 20 == 0:
            for name, meter in meters.meters.items():
                if 'loss' in name:
                    writer.add_scalar(name, meter.avg, iteration)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
Example #17
def do_train(
    model,
    data_loader_train,
    data_loaders_valid,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    validation_period,
    arguments,
    exp_name,
):
    logger = logging.getLogger("Training")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    tensorboard_path = os.path.join('../output/tensorboard', exp_name)
    tensorboard_logger = TensorboardXLogger(log_dir=tensorboard_path)

    max_iter = len(data_loader_train)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    # validation(model, data_loaders_valid, device, logger, tensorboard_logger, start_iter)
    for iteration, (images, targets, _) in enumerate(data_loader_train,
                                                     start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict, _ = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        meters.update(lr=optimizer.param_groups[0]["lr"])

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        tensorboard_logger.write(meters, iteration, phase='Train')

        if iteration % (validation_period / 10) == 0 or iteration == (
                max_iter - 1):
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration % validation_period == 0 and iteration > 0:
            validation(model, data_loaders_valid, device, logger,
                       tensorboard_logger, iteration)

        if iteration % checkpoint_period == 0 and iteration > 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
    tensorboard_logger.export_to_json()
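
TensorboardXLogger and its write / export_to_json methods are project-specific helpers, so their call sites above are all that is shown of them. A hypothetical stand-in that would satisfy the same interface, assuming the stock torch.utils.tensorboard writer and that each meter exposes an .avg value as in the MetricLogger used throughout:

import json
import os

from torch.utils.tensorboard import SummaryWriter


class SimpleTBLogger:
    """Hypothetical replacement for TensorboardXLogger: logs every meter under a
    phase prefix and can dump the collected history to a JSON file."""

    def __init__(self, log_dir):
        self.log_dir = log_dir
        self.writer = SummaryWriter(log_dir=log_dir)
        self.history = []

    def write(self, meters, iteration, phase='Train'):
        for name, meter in meters.meters.items():
            tag = '{}/{}'.format(phase, name)
            self.writer.add_scalar(tag, meter.avg, iteration)
            self.history.append({'iter': iteration, 'tag': tag, 'value': float(meter.avg)})

    def export_to_json(self):
        with open(os.path.join(self.log_dir, 'metrics.json'), 'w') as f:
            json.dump(self.history, f)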
Example #18
def do_train(
    reid_model,
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    reid_model.eval()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]
        result, loss_dict = model(images, targets)
        images_reid, labels_reid = resize_to_image(images.tensors, targets,
                                                   result)
        if images_reid is None:
            # pass
            loss_dict.update(
                dict(cls_loss=torch.tensor(0).type_as(
                    loss_dict['loss_classifier'])))
            loss_dict.update(
                dict(tri_loss=torch.tensor(0).type_as(
                    loss_dict['loss_classifier'])))
        else:
            images_reid = [o.to(device) for o in images_reid]
            labels_reid = labels_reid.to(device)
            loss_dict = reid_model(images_reid, labels_reid, iteration,
                                   'train', loss_dict)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # with amp.scale_loss(losses, optimizer) as scaled_loss:
        #     scaled_loss.backward()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    # model.eval()
    # summary(model, [(3, 608, 608)])
    model.train()
    # print(model,'==============================================================================')
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # PyTorch v0.4.0
    # model = model.to(device)

    start_training_time = time.time()
    end = time.time()

    # lambda1 = lambda epoch: 10 ** np.random.uniform(0, -3)
    lambda1 = lambda iteration: get_triangular_lr(iteration, 1000, 10**0, 10**0)
    lambda2 = lambda iteration: get_decay_lr(iteration, 10**0)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda1)

    for iteration, (images, targets, _,
                    io) in enumerate(data_loader, start_iter):
        # for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        # for target in targets:
        #     print(target.get_field('rotations'), '==========')
        # print(len(targets[0]), len(targets[1]), '=========================================')
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        # print(type(targets[1]),targets[1],'===============================================')
        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
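
The example above replaces the incoming scheduler with a LambdaLR driven by get_triangular_lr / get_decay_lr, both of which are project code that is not shown. Below is a sketch of a triangular (cyclic) multiplier wired into LambdaLR; the parameter names and defaults are assumptions, not the original helper:

import torch


def triangular_lr_factor(iteration, period=1000, low=0.1, high=1.0):
    """Cyclic multiplier that ramps linearly from `low` to `high` and back every
    `period` iterations (a guess at what get_triangular_lr might do)."""
    pos = (iteration % period) / float(period)   # position within the cycle, [0, 1)
    tri = 1.0 - abs(2.0 * pos - 1.0)             # 0 -> 1 -> 0 over one cycle
    return low + (high - low) * tri


# usage sketch on a toy optimizer:
model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, triangular_lr_factor)
for step in range(10):
    optimizer.step()
    scheduler.step()   # lr becomes 0.01 * triangular_lr_factor(step + 1)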
def do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, epoch_id, eval_in_train,
             eval_out_dir, eval_in_train_per_iter, iou_thresh_eval, min_loss,
             eval_aug_thickness):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info(f"Start training {epoch_id}")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    predictions_all = []
    losses_last = 100
    for iteration, batch in enumerate(data_loader, start_iter):
        fn = [os.path.basename(os.path.dirname(nm)) for nm in batch['fn']]
        if SHOW_FN:
            print(f'\t\t{fn}')

        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        batch['x'][1] = batch['x'][1].to(device)
        batch['y'] = [b.to(device) for b in batch['y']]

        loss_dict, predictions_i = model(batch['x'], batch['y'])

        if CHECK_NAN:
            any_nan = sum(torch.isnan(v.data) for v in loss_dict.values())
            if any_nan:
                print(f'\nGot nan loss:\n{fn}\n')
                import pdb
                pdb.set_trace()  # XXX BREAKPOINT
                continue

        losses = sum(loss for loss in loss_dict.values())

        if eval_in_train > 0 and epoch_id % eval_in_train == 0:
            data_id = batch['id']
            for k in range(len(data_id)):
                predictions_i[k].constants['data_id'] = data_id[k]

            predictions_i = [p.to(torch.device('cpu')) for p in predictions_i]
            [p.detach() for p in predictions_i]
            predictions_all += predictions_i

            if eval_in_train_per_iter > 0 and epoch_id % eval_in_train_per_iter == 0:
                logger.info(f'\nepoch {epoch_id}, data_id:{data_id}\n')
                eval_res_i = evaluate(dataset=data_loader.dataset,
                                      predictions=predictions_i,
                                      iou_thresh_eval=iou_thresh_eval,
                                      output_folder=eval_out_dir,
                                      box_only=False)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        with autograd.detect_anomaly():
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 1 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        avg_loss = meters.loss.avg
        tmp_p = max(int(checkpoint_period // 10), 20)
        if iteration % tmp_p == 0 and avg_loss < min_loss:
            checkpointer.save("model_min_loss", **arguments)
            logger.info(f'\nmin loss: {avg_loss} at {iteration}\n')
            min_loss = avg_loss

        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)\n".format(
        total_time_str, total_training_time / (max_iter)))

    if eval_in_train > 0 and epoch_id % eval_in_train == 0:
        logger.info(f'\nepoch {epoch_id}\n')
        preds = down_sample_for_eval_training(predictions_all)
        eval_res = evaluate(dataset=data_loader.dataset,
                            predictions=preds,
                            iou_thresh_eval=iou_thresh_eval,
                            output_folder=eval_out_dir,
                            box_only=False,
                            epoch=epoch_id,
                            is_train=True,
                            eval_aug_thickness=eval_aug_thickness)
        pass
    return min_loss
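
This variant guards against NaN losses with a pdb breakpoint behind CHECK_NAN and wraps the backward pass in autograd.detect_anomaly. A lighter hedged alternative is to test the loss dict and skip the offending batch; the helper below is illustrative, not part of the source:

import torch


def losses_are_finite(loss_dict):
    """True if every loss term is free of NaN/Inf, so the optimizer step is safe."""
    return all(torch.isfinite(v).all() for v in loss_dict.values())


# sketch of how it would slot into the loop above:
# if not losses_are_finite(loss_dict):
#     logger.warning(f"non-finite loss at iteration {iteration}, skipping batch")
#     continue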
Example #21
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    print(max_iter)
    start_iter = arguments["iteration"]
    print(start_iter)
    model.train()
    start_training_time = time.time()
    end = time.time()

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        #ipdb.set_trace()
        #unloader=transforms.ToPILImage()
        #showimg = unloader(images.tensors[0])
        #showimg = np.array(showimg)
        #bboxs=targets[0].bbox
        #bboxs = bboxs.numpy()
        #for bbox in bboxs:
        #    cv2.rectangle(showimg, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255), 4)
        #plt.imshow(showimg)
        #for mask in targets[0].extra_fields['masks'].polygons:
        #    poly = mask.polygons
        #    poly = poly[0]
        #    #poly = poly.numpy()
        #    n = len(poly)
        #    x = []
        #    y = []
        #    for i in range(int(n/2)):
        #        x.append(int(poly[i*2]))
        #        y.append(int(poly[i*2+1]))
        #    plt.plot(x,y,color="red",linewidth=2.0)
        #    #plt.scatter(x, y, color = 'red')
        #ipdb.set_trace()
        #plt.imshow(showimg)

        #plt.show()

        scheduler.step()

        #time.sleep( 1 )
        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Example #22
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    use_amp,
    cfg,
    dllogger,
    per_iter_end_callback_fn=None,
):
    dllogger.log(step="PARAMETER", data={"train_start": True})
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    if use_amp:
        scaler = torch.cuda.amp.GradScaler(init_scale=8192.0)
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        if use_amp:
            with torch.cuda.amp.autocast():
                loss_dict = model(images, targets)
        else:
            loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)


        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        if use_amp:        
            scaler.scale(losses).backward()
        else:
            losses.backward()

        def _take_step():
            if use_amp:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        if not cfg.SOLVER.ACCUMULATE_GRAD:
            _take_step()
        else:
            if (iteration + 1) % cfg.SOLVER.ACCUMULATE_STEPS == 0:
                for param in model.parameters():
                    if param.grad is not None:
                        param.grad.data.div_(cfg.SOLVER.ACCUMULATE_STEPS)
                _take_step()
            
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            log_data = {"eta":eta_string, "learning_rate":optimizer.param_groups[0]["lr"],
                        "memory": torch.cuda.max_memory_allocated() / 1024.0 / 1024.0 }
            log_data.update(meters.get_dict())
            dllogger.log(step=(iteration,), data=log_data)

        if cfg.SAVE_CHECKPOINT:
            if iteration % checkpoint_period == 0:
                checkpointer.save("model_{:07d}".format(iteration), **arguments)
            if iteration == max_iter:
                checkpointer.save("model_final", **arguments)

        # per-epoch work (testing)
        if per_iter_end_callback_fn is not None:
            early_exit = per_iter_end_callback_fn(iteration=iteration)
            if early_exit:
                break

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    dllogger.log(step=tuple(),
                 data={"e2e_train_time": total_training_time,
                       "train_perf_fps": max_iter * cfg.SOLVER.IMS_PER_BATCH / total_training_time})
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
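
The example above is the only one that combines torch.cuda.amp with gradient accumulation: scaled gradients build up for ACCUMULATE_STEPS iterations, get rescaled, and only then does the scaler step the optimizer. A condensed, self-contained sketch of the same pattern on a toy model (requires a CUDA device; the step count and names are illustrative):

import torch

model = torch.nn.Linear(16, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler(init_scale=8192.0)
accumulate_steps = 4                       # stands in for cfg.SOLVER.ACCUMULATE_STEPS

for iteration in range(1, 41):
    x = torch.randn(8, 16, device='cuda')
    with torch.cuda.amp.autocast():
        loss = model(x).pow(2).mean()
    scaler.scale(loss).backward()          # scaled gradients accumulate in .grad

    if iteration % accumulate_steps == 0:
        for p in model.parameters():       # average the accumulated gradients
            if p.grad is not None:
                p.grad.div_(accumulate_steps)
        scaler.step(optimizer)             # unscales, skips the step on overflow
        scaler.update()
        optimizer.zero_grad()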
Example #23
def do_train(
    model,
    model_ema,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    local_rank,
    checkpoint_period,
    cfg_arg,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    meters_ema = MetricLogger(delimiter="  ")

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    ema_decay = arguments["ema_decay"]
    loss_semi = arguments['loss_semi']
    temporal_save_path = cfg_arg["temporal_save_path"]
    model.train()
    model_ema.train()
    box_coder = BoxCoder(weights=(10., 10., 5., 5.))
    temporal_ens = {}
    start_training_time = time.time()
    end = time.time()
    labeled_database = arguments["HYPER_PARAMETERS"]['LABELED_DATABASE']
    temporal_supervised_losses = []

    for iteration, (images, targets_with_trans_info,
                    idx) in enumerate(data_loader, start_iter):
        targets = [_iter[0] for _iter in targets_with_trans_info]
        trans_info = [_iter[1] for _iter in targets_with_trans_info]

        try:
            db_idx, img_idx, idx_name, bboxes_batch = map_to_img(
                data_loader, idx)
            temporal_ens_bboxes = [
                ensemble_bboxes(_boxes, _im_sz, arguments["ANCHOR_STRIDES"],
                                arguments["HYPER_PARAMETERS"]['ENS_THRE'],
                                device)
                for _boxes, _im_sz in zip(bboxes_batch, images.image_sizes)
            ]

            img_size = [(_sz[1], _sz[0]) for _sz in images.image_sizes]
            pred_trans_info = copy.deepcopy(trans_info)
            temporal_ens_pred = []

            for i, _sz in enumerate(img_size):
                pred_trans_info[i][1] = _sz
                temporal_ens_per = [
                    trans_reverse(_temporal_ens, pred_trans_info[i]).to(device)
                    for _temporal_ens in temporal_ens_bboxes[i]
                ]
                temporal_ens_pred.append(temporal_ens_per)

            db_w = []
            for i, _db in enumerate(db_idx):
                if _db not in labeled_database:
                    _bbox = BoxList(
                        torch.zeros([1, 4]),
                        (images.image_sizes[i][1], images.image_sizes[i][0]),
                        mode="xyxy")
                    _bbox.add_field('labels', torch.ones([1]))
                    targets[i] = _bbox
                    db_w.append(0.)
                else:
                    db_w.append(1.)

            if any(len(target) < 1 for target in targets):
                logger.error(
                    f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}"
                )
                continue
            data_time = time.time() - end
            iteration = iteration + 1
            arguments["iteration"] = iteration

            images = images.to(device)
            targets = [target.to(device) for target in targets]
            update_ema_variables(model, model_ema, ema_decay, iteration)

            _loss_dict, result = model(images, targets)
            #---------------------loss masked by
            with torch.no_grad():
                _loss_dict_ema, result_ema = model_ema(images, targets)
                is_labeled_db_weight = torch.tensor(
                    db_w, dtype=torch.float32).to(device)

            loss_dict = {}
            loss_dict_ema = {}
            for _key in _loss_dict.keys():
                loss_dict[_key] = torch.sum(
                    torch.stack(_loss_dict[_key], dim=0) *
                    is_labeled_db_weight)
                loss_dict_ema[_key] = torch.sum(
                    torch.stack(_loss_dict_ema[_key], dim=0) *
                    is_labeled_db_weight)

            # loss_dict = _loss_dict
            # loss_dict_ema = _loss_dict_ema

            #result_origin = [trans_reverse(_res,_info) for _res,_info in zip(result_ema,trans_info)]
            #result_origin = predict_collect_postprocess(arguments['postprocess'],result_ema,trans_info)
            result_origin = predict_retina_postprocess(
                arguments['postprocess'], box_coder, result_ema, trans_info,
                images.image_sizes)

            # any_zeros = [_iter.bbox.shape[0] == 0 for _iter in temporal_ens_pred]
            # if any(any_zeros):
            #     loss_dict['semi_box_reg'] = torch.tensor(0,dtype=torch.float32,device=device)
            #     loss_dict['semi_cls'] = torch.tensor(0,dtype=torch.float32,device=device)
            # else:
            #     semi_loss = loss_semi(
            #         result, temporal_ens_pred)
            #     for _key in semi_loss.keys():
            #         loss_dict[_key] = torch.sum(torch.stack(semi_loss[_key],dim=0) * (1 - db_weight)) * arguments["semi_weight"]

            #balance losses
            with torch.no_grad():
                supervised_loss = (loss_dict['loss_retina_cls'] +
                                   loss_dict['loss_retina_reg']) / (
                                       np.sum(db_w) + 0.1)
            temporal_supervised_losses.append(supervised_loss)
            temporal_supervised_losses = temporal_supervised_losses[-100:]
            sup_loss = torch.stack(temporal_supervised_losses).mean()
            meters.update(sup_loss=sup_loss)

            if get_world_size() > 1:
                torch.distributed.all_reduce(
                    torch.stack(temporal_supervised_losses).mean(),
                    op=torch.distributed.ReduceOp.SUM)
            balance_weight = min(1. / (sup_loss / 0.28)**12, 1.)

            semi_loss = semi_loss_fn(
                result,
                result_ema,
                temporal_ens_pred,
                images.image_sizes,
                box_coder,
                n_cls=arguments["HYPER_PARAMETERS"]['NCLS'],
                reg_cons_w=arguments["HYPER_PARAMETERS"]['REG_CONSIST_WEIGHT'])
            semi_loss_weight = semi_weight_by_epoch(
                iteration,
                start_iter=arguments["HYPER_PARAMETERS"]['EPOCH_BATCH_NUM'] *
                arguments["HYPER_PARAMETERS"]['START_ITER'],
                rampup_length=arguments["HYPER_PARAMETERS"]['EPOCH_BATCH_NUM']
                * arguments["HYPER_PARAMETERS"]['RAMPUP_LENGTH'],
                consistence_weight=arguments["HYPER_PARAMETERS"]
                ['CONSISTENCE_WEIGHT'],
                consistence_trunc=arguments["HYPER_PARAMETERS"]
                ['MAX_CONSISTENT_LOSS'])  #semi_weight_by_epoch(iteration)
            for _key in semi_loss.keys():
                #loss_dict[_key] = torch.sum(semi_loss[_key] * (1 - is_labeled_db_weight))*semi_loss_weight*balance_weight # not used labeled
                loss_dict[_key] = torch.sum(semi_loss[_key]) * semi_loss_weight

            for i, (_id, _labeled) in enumerate(zip(idx_name, db_w)):
                # if _labeled == 1:
                #     continue
                result_dict = {
                    'iteration': iteration,
                    'result': result_origin[i]
                }
                if _id in temporal_ens.keys():
                    temporal_ens[_id].append(result_dict)
                else:
                    temporal_ens[_id] = [result_dict]

            #print('id={},{},scores={}----------{}'.format(idx_name[0],idx_name[1],result_origin[0].get_field('objectness')[:5],result_origin[1].get_field('objectness')[:5]))
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)

            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)

            loss_dict_reduced_ema = reduce_loss_dict(loss_dict_ema)
            losses_reduced_ema = sum(
                loss for loss in loss_dict_reduced_ema.values())
            meters_ema.update(loss=losses_reduced_ema, **loss_dict_reduced_ema)

            optimizer.zero_grad()
            # Note: If mixed precision is not used, this ends up doing nothing
            # Otherwise apply loss scaling for mixed-precision recipe
            with amp.scale_loss(losses, optimizer) as scaled_losses:
                scaled_losses.backward()

            if not iteration < arguments["HYPER_PARAMETERS"][
                    'EPOCH_BATCH_NUM'] * arguments["HYPER_PARAMETERS"][
                        'START_ITER']:
                optimizer.step()
            #scheduler.step()

            batch_time = time.time() - end
            end = time.time()
            meters.update(time=batch_time, data=data_time)

            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

            if iteration % 20 == 0 or iteration == max_iter:
                logger.info(
                    meters.delimiter.join([
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "{meters_ema}",
                        "lr: {lr:.6f}",
                        "semi_w:{semi_w:2.3f}",
                        "supervised loss{sup_loss:2.3f},"
                        "balance_weight{balance_weight:2.3f},"
                        "max mem: {memory:.0f}",
                    ]).format(
                        eta=eta_string,
                        iter=iteration,
                        meters=str(meters),
                        meters_ema=str(meters_ema),
                        lr=optimizer.param_groups[0]["lr"],
                        semi_w=semi_loss_weight,
                        sup_loss=sup_loss,
                        balance_weight=balance_weight,
                        memory=torch.cuda.max_memory_allocated() / 1024.0 /
                        1024.0,
                    ))

            if (iteration - 50) % 100 == 0:
                for _key in temporal_ens.keys():
                    for _iter in temporal_ens[_key]:
                        str_folder = os.path.join(
                            temporal_save_path,
                            _key)  #"{}/{}".format(temporal_save_path,_key)
                        str_file = '{}/{}_loc{}_iter_x{:07d}.pt'.format(
                            str_folder, _key, local_rank, _iter['iteration'])
                        if not os.path.exists(str_folder):
                            os.makedirs(str_folder)
                        torch.save(_iter['result'], str_file)
                        del _iter['result']

                del temporal_ens
                temporal_ens = {}

            if iteration % checkpoint_period == 0:
                save_time = time.time()
                checkpointer.save("model_{:07d}".format(iteration),
                                  **arguments)

            if iteration == max_iter:
                checkpointer.save("model_final", **arguments)

        except Exception as e:
            print('error in file ', idx_name, img_idx)
            raise e

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
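
update_ema_variables is not shown above; in mean-teacher style training it usually copies an exponential moving average of the student weights into model_ema. A hedged sketch of that update, with a warm-up on the decay so early iterations track the student closely (the name and the warm-up schedule are assumptions, not taken from the source):

import torch


@torch.no_grad()
def update_ema_sketch(model, model_ema, decay, iteration):
    """EMA update: ema_param <- d * ema_param + (1 - d) * param."""
    effective_decay = min(1.0 - 1.0 / (iteration + 1), decay)   # ramp the decay up
    for ema_param, param in zip(model_ema.parameters(), model.parameters()):
        ema_param.mul_(effective_decay).add_(param, alpha=1.0 - effective_decay)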
Example #24
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    summary_writer,
    device,
    checkpoint_period,
    summary_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

        if iteration % summary_period == 0:
            summary_writer.add_image(
                'input_image',
                vutils.make_grid(images.tensors[:, [2, 1, 0]], normalize=True),
                iteration)
            summary_writer.add_scalar('learning_rate',
                                      optimizer.param_groups[0]['lr'],
                                      iteration)
            summary_writer.add_scalar(
                'model/loss_rpn_box_reg',
                loss_dict_reduced['loss_rpn_box_reg'].item(), iteration)
            summary_writer.add_scalar('model/loss_mask',
                                      loss_dict_reduced['loss_mask'].item(),
                                      iteration)
            summary_writer.add_scalar('model/loss_box_reg',
                                      loss_dict_reduced['loss_box_reg'].item(),
                                      iteration)
            summary_writer.add_scalar(
                'model/loss_classifier',
                loss_dict_reduced['loss_classifier'].item(), iteration)
            if 'loss_maskiou' in loss_dict_reduced:
                summary_writer.add_scalar(
                    'model/loss_maskiou',
                    loss_dict_reduced['loss_maskiou'].item(), iteration)
            summary_writer.add_scalar(
                'model/loss_objectness',
                loss_dict_reduced['loss_objectness'].item(), iteration)
            summary_writer.add_scalar('model/loss', losses_reduced.item(),
                                      iteration)

        iteration = iteration + 1

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Example #25
def do_train(
    cfg,
    total_model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
    args,
):
    if len(total_model) > 1:
        model = total_model[1]
        t_model = total_model[0]
    else:
        model = total_model[0]
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()

    start_training_time = time.time()
    end = time.time()

    iou_types = ("bbox", )
    if cfg[0].MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    if cfg[0].MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints", )
    dataset_names = cfg[0].DATASETS.TEST

    pytorch_1_1_0_or_later = is_pytorch_1_1_0_or_later()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        # in pytorch >= 1.1.0, scheduler.step() should be run after optimizer.step()
        if not pytorch_1_1_0_or_later:
            scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict, features_dict = model(images, targets)
        if len(total_model) > 1:
            with torch.no_grad():
                t_loss_dict, t_features_dict = t_model(images, targets)
            # with torch.no_grad():
            #     # teacher_model = t_model
            #     t_weight = torch.load('./weights/centermask-V-19-eSE-FPN-ms-3x.pth')
            #     t_weight = t_weight['model']
            #     new_tweight = OrderedDict()
            #     for k, v in t_weight.items():
            #         name = k[7:]  # remove `module.`
            #         new_tweight[name] = v
            #     t_model.load_state_dict(new_tweight)
            #     t_loss_dict, t_features_dict = t_model(images, targets)

        if args.loss_head:

            loss_regression = new_box_loss(t_loss_dict['loss_reg'],
                                           loss_dict['loss_reg'])
            loss_center = new_center_loss(t_loss_dict['loss_centerness'],
                                          loss_dict['loss_centerness'])
            mode = 'KL'  # mode = 'KL' or 'cross-entropy'
            loss_pixel_wise = pixel_wise_loss(features_dict['box_cls'],
                                              t_features_dict['box_cls'], mode)
            loss_head = (loss_regression + loss_center + loss_pixel_wise)
            loss_dict.setdefault('loss_head', loss_head)
            del loss_dict['loss_reg']
            del loss_dict['loss_centerness']

        if iteration > cfg[0].SOLVER.WARMUP_ITERS:
            if args.loss_correlation:
                correlation = True
                loss_corr = get_feature(t_model, model, images, targets,
                                        correlation)
                loss_dict.setdefault('loss_corr', loss_corr)
            if args.loss_featuremap:
                correlation = False
                loss_featuremap = get_feature(t_model, model, images, targets,
                                              correlation)
                loss_dict.setdefault('loss_featuremap', loss_featuremap)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if pytorch_1_1_0_or_later:
            scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0 and iteration != 0:
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e. g. for TensorBoard
                model,
                # The method changes the segmentation mask format in a data loader,
                # so every time a new data loader is created:
                make_data_loader(cfg[0],
                                 is_train=False,
                                 is_distributed=(get_world_size() > 1),
                                 is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False
                if cfg[0].MODEL.MASK_ON else cfg[0].MODEL.RPN_ONLY,
                device=cfg[0].MODEL.DEVICE,
                expected_results=cfg[0].TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg[0].TEST.
                EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=None,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val,
                                    _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    if len(loss_dict) > 1:
                        # the model returns (loss_dict, features_dict); keep only the losses
                        loss_dict = loss_dict[0]
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(
                        loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join([
                    "[Validation]: ",
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
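
In the distillation example above, pixel_wise_loss compares the student's and teacher's box_cls feature maps with mode='KL', but its body is not shown. Below is a hedged sketch of a per-location KL divergence between the two classification maps; the temperature and reduction are assumptions, and the project's actual loss may normalize or weight differently:

import torch
import torch.nn.functional as F


def pixel_wise_kl_sketch(student_logits, teacher_logits, temperature=1.0):
    """Mean KL(teacher || student) over every spatial location.
    Both inputs are (N, C, H, W) classification logits."""
    log_s = F.log_softmax(student_logits / temperature, dim=1)
    t = F.softmax(teacher_logits.detach() / temperature, dim=1)
    kl = (t * (torch.log(t.clamp_min(1e-8)) - log_s)).sum(dim=1)   # per-pixel KL
    return kl.mean() * (temperature ** 2)


# quick check on random tensors:
student = torch.randn(2, 80, 32, 32)
teacher = torch.randn(2, 80, 32, 32)
print(pixel_wise_kl_sketch(student, teacher).item())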
Example #26
def do_train(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints", )
    dataset_names = cfg.DATASETS.TEST

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if any(len(target) < 1 for target in targets):
            logger.error(
                f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}"
            )
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        # with amp.scale_loss(losses, optimizer) as scaled_losses:
        #     scaled_losses.backward()
        losses.backward()
        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e. g. for TensorBoard
                model,
                # The method changes the segmentation mask format in a data loader,
                # so every time a new data loader is created:
                make_data_loader(cfg,
                                 is_train=False,
                                 is_distributed=(get_world_size() > 1),
                                 is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False
                if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=None,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val,
                                    _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(
                        loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join([
                    "[Validation]: ",
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Example #27
def test_while_train(cfg, model, distributed, logger, curr_iter, val_tags,
                     data_loader, output_folder):
    torch.cuda.empty_cache()
    logger.info("start testing while training...")

    # only the first one for test

    model.eval()
    results_dict = {}
    device = torch.device('cuda')
    cpu_device = torch.device("cpu")
    meters = MetricLogger(delimiter="  ", )

    for bid, (images, targets, image_ids, phrase_ids, sent_ids, sentence,
              precompute_bbox, precompute_score, feature_map, vocab_label_elmo,
              sent_sg, topN_box) in enumerate(tqdm(data_loader)):

        # if bid>3:
        #     break
        vocab_label_elmo = [vocab.to(device) for vocab in vocab_label_elmo]
        features_list = [feat.to(device) for feat in feature_map]

        with torch.no_grad():

            loss_dict, results = model(images, features_list, targets,
                                       phrase_ids, sentence, precompute_bbox,
                                       precompute_score, image_ids,
                                       vocab_label_elmo, sent_sg, topN_box)

            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            meters.update(loss=losses_reduced, **loss_dict_reduced)
            # collect and move result to cpu memory
            moved_res = []

            if cfg.MODEL.VG.TWO_STAGE:

                if cfg.MODEL.RELATION_ON and cfg.MODEL.RELATION.USE_RELATION_CONST:

                    (batch_gt_boxes, batch_pred_box, batch_pred_box_topN,
                     batch_pred_box_det, batch_pred_similarity,
                     batch_pred_similarity_topN, batch_rel_pred_similarity,
                     batch_rel_gt_label, batch_topN_boxes,
                     batch_reg_offset_topN, batch_rel_score_mat) = results

                    for idx, each_gt_boxes in enumerate(batch_gt_boxes):
                        moved_res.append(
                            (each_gt_boxes.to(cpu_device),
                             batch_pred_box[idx].to(cpu_device),
                             batch_pred_box_topN[idx].to(cpu_device),
                             batch_pred_box_det[idx].to(cpu_device),
                             batch_pred_similarity[idx].to(cpu_device),
                             batch_pred_similarity_topN[idx].to(cpu_device),
                             batch_rel_pred_similarity[idx].to(cpu_device),
                             batch_rel_gt_label[idx].to(cpu_device),
                             batch_topN_boxes[idx].to(cpu_device),
                             batch_reg_offset_topN[idx].to(cpu_device),
                             batch_rel_score_mat[idx]))

                else:
                    batch_gt_boxes, batch_pred_box, batch_pred_box_topN, batch_pred_box_det, batch_pred_similarity = results
                    for idx, each_gt_boxes in enumerate(batch_gt_boxes):
                        moved_res.append(
                            (each_gt_boxes.to(cpu_device),
                             batch_pred_box[idx].to(cpu_device),
                             batch_pred_box_topN[idx].to(cpu_device),
                             batch_pred_box_det[idx].to(cpu_device),
                             batch_pred_similarity[idx].to(cpu_device)))

            else:
                batch_gt_boxes, batch_pred_box, batch_pred_box_det, batch_pred_similarity = results
                for idx, each_gt_boxes in enumerate(batch_gt_boxes):
                    moved_res.append(
                        (each_gt_boxes.to(cpu_device),
                         batch_pred_box[idx].to(cpu_device),
                         batch_pred_box_det[idx].to(cpu_device),
                         batch_pred_similarity[idx].to(cpu_device)))

            results_dict.update({
                img_id + '_' + sent_id: result
                for img_id, sent_id, result in zip(image_ids, sent_ids,
                                                   moved_res)
            })

    synchronize()

    (predictions,
     image_ids) = _accumulate_predictions_from_multiple_gpus(results_dict)

    if output_folder:
        with open(
                os.path.join(output_folder,
                             "predictions_{}.pkl".format(curr_iter)),
                'wb') as f:
            pickle.dump(predictions, f)
        torch.save(
            predictions,
            os.path.join(output_folder,
                         "predictions_{}.pth".format(curr_iter)))

    torch.cuda.empty_cache()
    if not is_main_process():
        return

    logger.info('Total items num is {}'.format(len(predictions)))

    # with open(os.path.join(cfg.OUTPUT_DIR, 'prediction.pkl'), 'wb') as handle:
    #     pickle.dump(predictions, handle, protocol=pickle.HIGHEST_PROTOCOL)

    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints", )
    box_only = False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY
    expected_results = cfg.TEST.EXPECTED_RESULTS
    expected_results_sigma_tol = cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL

    extra_args = dict(
        box_only=box_only,
        iou_types=iou_types,
        expected_results=expected_results,
        expected_results_sigma_tol=expected_results_sigma_tol,
    )

    acc, acc_topN, acc_det, acc_rel_softmax = evaluate(
        dataset=data_loader.dataset,
        predictions=predictions,
        image_ids=image_ids,
        curr_iter=curr_iter,
        output_folder=None,
        **extra_args)

    record = {val_tags[k]: v for (k, v) in meters.meters.items()}
    logger.log(TFBoardHandler_LEVEL, (record, curr_iter))
    logger.info("current accuracy is: {}".format(acc))
    logger.info("current topN accuracy is: {}".format(acc_topN))
    logger.info("current accuracy with detection score is: {}".format(acc_det))
    logger.info(
        "current rel constrain accuracy is: {}".format(acc_rel_softmax))
    logger.log(TFBoardHandler_LEVEL,
               ({
                   val_tags['acc']: acc,
                   val_tags['acc_topN']: acc_topN,
                   val_tags['acc_det']: acc_det,
                   val_tags['acc_rel_softmax']: acc_rel_softmax
               }, curr_iter))
    logger.info("test done !")
def do_da_train(model, source_data_loader, target_data_loader, optimizer,
                scheduler, checkpointer, device, checkpoint_period, arguments,
                cfg):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter=" ")
    # TODO(li): why is max_iter set here?
    max_iter = len(source_data_loader)
    start_iter = arguments["iteration"]
    # switch the model to training mode; this does not start training by itself
    model.train()
    start_training_time = time.time()
    end = time.time()
    # Unlike the SHOT code, there is no separate for-loop over source_data_loader here,
    # because zip pairs the two loaders. Note that "iteration" is then a per-batch step,
    # not an epoch: each step consumes one batch from each DataLoader, not a full pass
    # over either dataset.
    for iteration, ((source_images, source_targets, idx1), (target_images, target_targets, idx2))\
            in enumerate(zip(source_data_loader, target_data_loader), start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration

        # source and target data
        images = (source_images + target_images).to(device)
        targets = [
            target.to(device)
            for target in list(source_targets + target_targets)
        ]

        # forward pass
        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # backward pass
        losses.backward()
        optimizer.step()

        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        # log every 20 iterations
        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter - 1:
            checkpointer.save("model_final", **arguments)
        if torch.isnan(losses_reduced).any():
            logger.critical('Loss is NaN, exiting...')
            return

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))


# DA end
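A note on the zip-based joint iteration in do_da_train above: zip pairs one batch from each loader per step and stops with the shorter loader, and enumerate's second argument only offsets the counter, it does not skip batches. A minimal standalone sketch (illustrative names, not from the repository):

    source_batches = ["s0", "s1", "s2", "s3"]
    target_batches = ["t0", "t1", "t2"]
    start_iter = 10
    for iteration, (src, tgt) in enumerate(zip(source_batches, target_batches), start_iter):
        print(iteration, src, tgt)
    # prints: 10 s0 t0 / 11 s1 t1 / 12 s2 t2
    # the loop runs three steps; the extra source batch "s3" is never consumed
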
Exemple #29
0
def train(cfg, local_rank, distributed, d_path=None):

    MaskDnet = MaskDiscriminator(nc=256)
    BBoxDnet = BoxDiscriminator(nc=256, ndf=64)
    Dnet = CombinedDiscriminator(MaskDnet, BBoxDnet)
    model = Mask_RCNN(cfg)
    g_rcnn = GAN_RCNN(model, Dnet)

    device = torch.device(cfg.MODEL.DEVICE)
    g_rcnn.to(device)

    g_optimizer = make_optimizer(cfg, model)
    d_optimizer = make_D_optimizer(cfg, Dnet)

    g_scheduler = make_lr_scheduler(cfg, g_optimizer)
    d_scheduler = make_lr_scheduler(cfg, d_optimizer)
    # model.BoxDnet = BBoxDnet

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, g_optimizer = amp.initialize(model, g_optimizer, opt_level=amp_opt_level)
    Dnet, d_optimizer = amp.initialize(Dnet, d_optimizer, opt_level=amp_opt_level)

    if distributed:
        g_rcnn = torch.nn.parallel.DistributedDataParallel(
                    g_rcnn, device_ids=[local_rank], output_device=local_rank,
                    # this should be removed if we update BatchNorm stats
                    broadcast_buffers=False,
                )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, g_optimizer, g_scheduler, output_dir, save_to_disk
    )

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)

    arguments.update(extra_checkpoint_data)

    d_checkpointer = DetectronCheckpointer(
        cfg, Dnet, d_optimizer, d_scheduler, output_dir, save_to_disk
    )

    if d_path:
        d_checkpointer.load(d_path, use_latest=False)

    data_loader = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
        )

    test_period = cfg.SOLVER.TEST_PERIOD
    data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    ## START TRAINING
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")

    meters = TensorboardLogger(
            log_dir=cfg.OUTPUT_DIR + "/tensorboardX",
            start_iter=arguments['iteration'],
            delimiter="  ")

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    g_rcnn.train()
    start_training_time = time.time()
    end = time.time()

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)

    dataset_names = cfg.DATASETS.TEST

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if any(len(target) < 1 for target in targets):
            logger.error(f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" )
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        g_loss_dict, d_loss_dict = g_rcnn(images, targets)

        g_losses = sum(loss for loss in g_loss_dict.values())
        d_losses = sum(loss for loss in d_loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        g_loss_dict_reduced = reduce_loss_dict(g_loss_dict)
        g_losses_reduced = sum(loss for loss in g_loss_dict_reduced.values())
        
        d_loss_dict_reduced = reduce_loss_dict(d_loss_dict)
        d_losses_reduced = sum(loss for loss in d_loss_dict_reduced.values())
        
        meters.update(total_g_loss=g_losses_reduced, **g_loss_dict_reduced)
        meters.update(total_d_loss=d_losses_reduced, **d_loss_dict_reduced)

        g_optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        with amp.scale_loss(g_losses, g_optimizer) as g_scaled_losses:
            g_scaled_losses.backward()
        g_optimizer.step()
        g_scheduler.step()
        
        
        d_optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        with amp.scale_loss(d_losses, d_optimizer) as d_scaled_losses:
            d_scaled_losses.backward()
        d_optimizer.step()
        d_scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=g_optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
            d_checkpointer.save("dnet_{:07d}".format(iteration), **arguments)
            
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e.g. to TensorBoard
                model,
                # inference() changes the segmentation mask format inside the data loader,
                # so a fresh data loader is created each time:
                make_data_loader(cfg, is_train=False, is_distributed=False, is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=cfg.OUTPUT_DIR,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join(
                    [
                        "[Validation]: ",
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=g_optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
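The example above relies on NVIDIA apex (amp.initialize / amp.scale_loss) for mixed precision. If apex is unavailable, roughly the same loss-scaling behaviour is provided by PyTorch's built-in torch.cuda.amp; a minimal sketch of that pattern with hypothetical stand-in model and data, assuming a reasonably recent PyTorch:

    import torch
    from torch import nn

    # Hypothetical stand-ins for the detector and data, only to make the sketch runnable.
    model = nn.Linear(8, 1).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    scaler = torch.cuda.amp.GradScaler()

    for _ in range(5):
        x = torch.randn(4, 8, device="cuda")
        with torch.cuda.amp.autocast():
            loss = model(x).pow(2).mean()
        optimizer.zero_grad()
        scaler.scale(loss).backward()   # scale the loss to avoid fp16 gradient underflow
        scaler.step(optimizer)          # unscale gradients, then call optimizer.step()
        scaler.update()                 # adjust the loss scale for the next iteration
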
Exemple #30
0
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        # print('images: ', images)
        # print('targets: ', targets, targets[0].bbox)
        # print('targets: ', type(targets[0]), type(targets))
        data_time = time.time() - end
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        # print('images.size(): ', images.tensors.size(), images.image_sizes)
        # print('targets: ', targets)
        loss_dict = model(images=images,
                          iteration=iteration + 1,
                          targets=targets)
        # print('loss_dict: ', loss_dict)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if (iteration + 1) % 20 == 0 or (iteration + 1) == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration + 1,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if (iteration + 1) % checkpoint_period == 0 or (iteration +
                                                        1) == max_iter:
            checkpointer.save("model_{:07d}".format(iteration + 1),
                              **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Exemple #31
0
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    per_iter_start_callback_fn=None,
    per_iter_end_callback_fn=None,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):

        if per_iter_start_callback_fn is not None:
            per_iter_start_callback_fn(iteration=iteration)

        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        losses.backward()

        optimizer.step()
        optimizer.zero_grad()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0 and arguments["save_checkpoints"]:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter and arguments["save_checkpoints"]:
            checkpointer.save("model_final", **arguments)

        # per-epoch work (testing)
        if per_iter_end_callback_fn is not None:
            # Note: iteration has been incremented previously for
            # human-readable checkpoint names (i.e. 60000 instead of 59999)
            # so need to adjust again here
            early_exit = per_iter_end_callback_fn(iteration=iteration - 1)
            if early_exit:
                break

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
    if per_iter_end_callback_fn is not None:
        if early_exit:
            return True
        else:
            return False
    else:
        return None
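
Every example above funnels its per-GPU loss dict through reduce_loss_dict before logging, so that the logged numbers describe the losses averaged over all processes rather than one GPU's slice of the batch. A rough sketch of that pattern with torch.distributed (an illustration of the idea, not the library's exact code):

    import torch
    import torch.distributed as dist

    def reduce_loss_dict_sketch(loss_dict):
        # Average each scalar loss across all ranks; rank 0 ends up with the mean.
        world_size = dist.get_world_size() if dist.is_initialized() else 1
        if world_size < 2:
            return loss_dict
        with torch.no_grad():
            names = sorted(loss_dict.keys())
            stacked = torch.stack([loss_dict[k] for k in names], dim=0)
            dist.reduce(stacked, dst=0)      # sum the losses from all processes onto rank 0
            if dist.get_rank() == 0:
                stacked /= world_size        # turn the sum into a mean
            return {k: v for k, v in zip(names, stacked)}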