Example #1
def main():
    '''
    Main Function

    '''

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)

    torch.cuda.empty_cache()

    if args.evaluate:
        # Early evaluation for benchmarking; no training epoch has run yet
        epoch = 0
        validate(val_loader, net, criterion_val, optim, epoch, writer)
        evaluate(val_loader, net)
        return

    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()

        train(train_loader, net, criterion, optim, epoch, writer)
        validate(val_loader, net, criterion_val, optim, epoch, writer)
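
The cfg.immutable(False) / cfg.EPOCH = epoch / cfg.immutable(True) dance above relies on a lockable global config. As a minimal sketch, assuming a Detectron-style AttrDict (an illustration, not necessarily this repo's actual config module), such an object can be built like this:

class AttrDict(dict):
    """Dict with attribute access plus an immutability latch (sketch only)."""
    IMMUTABLE = '__immutable__'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__[AttrDict.IMMUTABLE] = False

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        if self.__dict__[AttrDict.IMMUTABLE]:
            raise AttributeError('cfg is immutable; call immutable(False) first')
        self[name] = value

    def immutable(self, is_immutable):
        self.__dict__[AttrDict.IMMUTABLE] = is_immutable


cfg = AttrDict()
cfg.EPOCH = 0        # allowed while the config is unlocked
cfg.immutable(True)  # locked: further writes raise until cfg.immutable(False)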
Example #2
def main():
    """
    Main Function
    """
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args, parser)
    writer = None

    _, _, _, extra_val_loaders, _ = datasets.setup_loaders(args)

    criterion, criterion_val = loss.get_loss(args)
    criterion_aux = loss.get_loss_aux(args)
    net = network.get_net(args, criterion, criterion_aux)

    optim, scheduler = optimizer.get_optimizer(args, net)

    net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = network.warp_network_in_dataparallel(net, args.local_rank)
    epoch = 0
    i = 0

    if args.snapshot:
        epoch, mean_iu = optimizer.load_weights(net, optim, scheduler,
                            args.snapshot, args.restore_optimizer)

    print("#### iteration", i)
    torch.cuda.empty_cache()
    # Main Loop
    # for epoch in range(args.start_epoch, args.max_epoch):

    for dataset, val_loader in extra_val_loaders.items():
        print("Extra validating... This won't save pth file")
        validate(val_loader, dataset, net, criterion_val, optim, scheduler, epoch, writer, i, save_pth=False)
Example #3
def setup_net(snapshot):
    """Quickly create a network for the given snapshot.
    
    Arguments:
        snapshot {string} -- Input snapshot, e.g. kitti_best.pth
    
    Returns:
        [net, transform] -- PyTorch model & the image transform function.
    """
    cudnn.benchmark = False
    torch.cuda.empty_cache()

    args = Args('./save', 'network.deepv3.DeepWV3Plus', snapshot)

    assert_and_infer_cfg(args, train_mode=False)
    # get net
    net = network.get_net(args, criterion=None)
    net = torch.nn.DataParallel(net).cuda()
    print('Net built.')

    net, _ = restore_snapshot(net,
                              optimizer=None,
                              snapshot=snapshot,
                              restore_optimizer_bool=False)
    net.eval()
    print('Net restored.')

    # get data
    mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    img_transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(*mean_std)])

    return net, img_transform, args
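
A hypothetical usage sketch for setup_net above; the snapshot name and image path are placeholders, and the single-tensor output shape is an assumption about the net's forward:

from PIL import Image
import torch

net, img_transform, args = setup_net('kitti_best.pth')
img = Image.open('example.png').convert('RGB')
batch = img_transform(img).unsqueeze(0).cuda()  # shape (1, 3, H, W)
with torch.no_grad():
    logits = net(batch)                         # (1, num_classes, H, W)
pred = logits.argmax(dim=1).squeeze(0).cpu().numpy()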
Example #4
def main(_):
    parser = argparse.ArgumentParser(
        description='Classification model training')
    parser.add_argument('--config_file', type=str, default=None,
                        help='Optional config file for params')
    parser.add_argument('opts', help='see config.py for all options',
                        default=None, nargs=argparse.REMAINDER)

    args = parser.parse_args()
    if args.config_file is not None:
        cfg_from_file(args.config_file)
    if args.opts is not None:
        cfg_from_list(args.opts)

    assert_and_infer_cfg()
    print_cfg()

    os.environ["CUDA_VISIBLE_DEVICES"] = str(cfg.GPU_ID)
    logger = utils.setup_custom_logger('root')
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    tf_config = tf.ConfigProto(device_count=dict(GPU=1),
                               gpu_options=tf.GPUOptions(allow_growth=True))
    tf.enable_resource_variables()

    train(tf_config, logger)
    test(tf_config, logger)
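
cfg_from_file and cfg_from_list are not shown. The sketch below assumes the usual Detectron-style behavior for cfg_from_list, merging KEY VALUE pairs from the argparse REMAINDER into a nested config; it takes the target cfg explicitly, whereas the example's helper mutates a global, and it may differ from the repo's actual implementation:

import ast

def cfg_from_list(opts, cfg):
    assert len(opts) % 2 == 0, 'opts must come in KEY VALUE pairs'
    for key, raw in zip(opts[0::2], opts[1::2]):
        try:
            value = ast.literal_eval(raw)   # parse numbers, lists, booleans
        except (ValueError, SyntaxError):
            value = raw                     # otherwise keep the raw string
        node = cfg
        *path, leaf = key.split('.')
        for part in path:                   # walk nested sections, e.g. TRAIN.LR
            node = node[part]
        node[leaf] = value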
Example #5
def main():

    """
    Main Function
    """

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)

    if args.fix_bn:
        net.apply(set_bn_eval)
        print("Fix bn for finetuning")

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level="O1")

    net = network.wrap_network_in_dataparallel(net, args.apex)
    if args.snapshot:
        optimizer.load_weights(net, optim,
                               args.snapshot, args.restore_optimizer)
    if args.evaluateF:
        assert args.snapshot is not None, "must load weights for evaluation"
        evaluate(val_loader, net, args)
        return
    torch.cuda.empty_cache()
    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()
        train(train_loader, net, optim, epoch, writer)
        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)
        validate(val_loader, net, criterion_val,
                 optim, epoch, writer)
        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
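
set_bn_eval is referenced but not defined in this example; a common implementation (an assumption here, not necessarily this repo's) freezes BatchNorm statistics during finetuning:

import torch.nn as nn

def set_bn_eval(m):
    # switch every BatchNorm layer to eval mode so its running
    # statistics stay frozen while the rest of the net trains
    if isinstance(m, nn.modules.batchnorm._BatchNorm):
        m.eval()

# applied recursively to every submodule, as in the example:
# net.apply(set_bn_eval)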
Example #6
def get_segmentation(self):
    # Get Segmentation Net
    assert_and_infer_cfg(self.opt, train_mode=False)
    self.opt.dataset_cls = cityscapes
    net = network.get_net(self.opt, criterion=None)
    net = torch.nn.DataParallel(net).cuda()
    print('Segmentation Net Built.')
    snapshot = os.path.join(os.getcwd(), os.path.dirname(__file__),
                            self.opt.snapshot)
    self.seg_net, _ = restore_snapshot(net,
                                       optimizer=None,
                                       snapshot=snapshot,
                                       restore_optimizer_bool=False)
    self.seg_net.eval()
    print('Segmentation Net Restored.')
Example #7
def main():
    """
    Main Function
    """

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level="O1")

    net = network.warp_network_in_dataparallel(net, args.apex)
    if args.snapshot:
        optimizer.load_weights(net, optim, args.snapshot,
                               args.restore_optimizer)

    torch.cuda.empty_cache()
    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        #snapshot="/srv/beegfs02/scratch/language_vision/data/Sound_Event_Prediction/audio/semanticPred/logs/ckpt/default/Omni-network.deepv3_audioBG_Spec_diffmask_Comp_noBG_Paralleltask_depth_noSeman.DeepWV3Plus/models_depth/SOP_epoch_"+str(14)+".pth"
        #optimizer.load_weights(net, optim,
        #                       snapshot, args.restore_optimizer)
        scheduler.step()
        train(train_loader, net, optim, epoch, writer)
        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)
        validate(val_loader, net, criterion_val, optim, epoch, writer)
        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
Example #8
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir, args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    # define the NASA optimizer parameter
    iter_tot = len(train_loader) * args.max_epoch
    #    tau = args.tau_factor/sqrt(iter_tot)
    tau = 1
    net = network.get_net(args, criterion)
    k = 1
    #    optim, scheduler = get_optimizer(args, net)
    optim, scheduler = get_optimizer(args, net, tau, k)
    # Visualize feature maps
    #activation = {}
    #def get_activation(name):
    #def hook(model, input, output):
    #activation[name] = output.detach()
    #return hook

    #net.layer[0].register_forward_hook(get_activation('conv1'))
    #data, _ = dataset[0]
    #data.unsqueeze_(0)
    #output = model(data)

    #act = activation['conv1'].squeeze()
    #fig, axarr = plt.subplots(act.size(0))
    #for idx in range(act.size(0)):
    #axarr[idx].imshow(act[idx])

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:

        from thop import profile
        img = torch.randn(1, 3, 640, 640).cuda()
        mask = torch.randn(1, 1, 640, 640).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets

    if args.eval == 'test':
        validate(val_loader,
                 net,
                 criterion=None,
                 optim=None,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True,
                 testing=True,
                 grid=city)

        return 0

    if args.eval == 'val':

        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader,
                     net,
                     criterion=criterion_val,
                     optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader,
                 net,
                 criterion=criterion_val,
                 optim=optim,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()

        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
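
check_termination is not defined in this snippet. A plausible sketch, assuming the AutoResume API seen at the top of the example and the module-level args used throughout:

def check_termination(epoch):
    # ask the AutoResume service whether this job should checkpoint and stop
    if AutoResume and AutoResume.termination_requested():
        AutoResume.request_resume(
            user_dict={'TENSORBOARD_DIR': args.result_dir,
                       'EPOCH': str(epoch)})
        return True
    return False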
Example #9
def main():
    """
    Main Function
    """

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    #    args2 = copy.deepcopy(args)
    assert_and_infer_cfg(args)
    #    assert_and_infer_cfg(args2)
    #    args2.dataset = 'kitti_trav'
    #    print(args.dataset)
    #    print(args2.dataset)
    writer = prep_experiment(args, parser)
    #    writer = prep_experiment(args2, parser)

    # Dataset
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    #    train_loader2, val_loader2, train_obj2 = datasets.setup_loaders(args2)
    criterion, criterion_val = loss.get_loss(args, data_type='semantic')
    criterion2, criterion_val2 = loss.get_loss(args, data_type='trav')
    net = network.get_net(args, criterion, criterion2)

    # parameters list
    #    param1_lists = list(net.mod1.parameters()) + list(net.mod2.parameters()) + list(net.mod3.parameters()) + list(net.mod4.parameters()) + list(net.mod5.parameters()) + list(net.mod6.parameters()) + list(net.mod7.parameters()) + list(net.pool2.parameters()) + list(net.pool3.parameters()) + list(net.aspp.parameters()) + list(net.bot_fine.parameters()) + list(net.bot_aspp.parameters()) + list(net.final.parameters()) + [log_sigma_A]
    #    param2_lists = list(net.mod1.parameters()) + list(net.mod2.parameters()) + list(net.mod3.parameters()) + list(net.mod4.parameters()) + list(net.mod5.parameters()) + list(net.mod6.parameters()) + list(net.mod7.parameters()) + list(net.pool2.parameters()) + list(net.pool3.parameters()) + list(net.aspp.parameters()) + list(net.bot_fine.parameters()) + list(net.bot_aspp.parameters()) + list(net.final2.parameters()) + [log_sigma_B]

    # optimizers
    optim, scheduler = optimizer.get_optimizer(args, net)
    #    optim2, scheduler2 = optimizer.get_optimizer(args, param2_lists)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level="O1")

    net = network.wrap_network_in_dataparallel(net, args.apex)
    if args.snapshot:
        optimizer.load_weights(net, optim, args.snapshot, args.snapshot2,
                               args.restore_optimizer)
#        optimizer.load_weights(net, optim2,
#                               args.snapshot, args.snapshot2, args.restore_optimizer)

    torch.cuda.empty_cache()
    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()
        train(train_loader, net, optim, epoch, writer)
        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)
#            train_loader2.sampler.set_epoch(epoch + 1)
        validate(val_loader, net, criterion_val, criterion_val2, optim, epoch,
                 writer)
        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                #                train_obj2.build_epoch(cut=True)
                if args.apex:
                    train_loader.sampler.set_num_samples()
#                    train_loader2.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
Example #10
def main():
    '''
    Main Function

    '''

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)

    torch.cuda.empty_cache()

    if args.mode == "test":
        test_sv_path = args.test_sv_path
        print(f"Saving prediction {test_sv_path}")
        net.eval()
        for vi, data in enumerate(tqdm(val_loader)):
            input, mask, img_name, img_path = data
            assert len(input.size()) == 4 and len(mask.size()) == 3
            assert input.size()[2:] == mask.size()[1:]
            b, h, w = mask.size()

            batch_pixel_size = input.size(0) * input.size(2) * input.size(3)
            input, mask_cuda = input.cuda(), mask.cuda()

            with torch.no_grad():
                seg_out, edge_out = net(input)    # output = (1, 19, 713, 713)

            seg_predictions = seg_out.data.cpu().numpy()
            edge_predictions = edge_out.cpu().numpy()
            for i in range(b):
                _, file_name = os.path.split(img_path[i])
                file_name = file_name.replace("jpg", "png")
                seq = img_path[i][:5]
                seg_path = os.path.join(test_sv_path, "gscnn", "seg", seq)
                if not os.path.exists(seg_path):
                    os.makedirs(seg_path)
                edge_path = os.path.join(test_sv_path, "gscnn", "edge", seq)
                edgenp_path = os.path.join(test_sv_path, "gscnn", "edgenp", seq)
                if not os.path.exists(edge_path):
                    os.makedirs(edge_path)
                    os.makedirs(edgenp_path)
                seg_arg = np.argmax(seg_predictions[i], axis=0).astype(np.uint8)
                edge_arg = np.argmax(edge_predictions[i], axis=0).astype(np.uint8)

                seg_img = np.stack((seg_arg, seg_arg, seg_arg), axis=2)
                edge_img = np.stack((edge_arg, edge_arg, edge_arg), axis=2)
                seg_img = Image.fromarray(seg_img)
                seg_img.save(os.path.join(seg_path, file_name))
                edge_img = Image.fromarray(edge_img)
                edge_img.save(os.path.join(edge_path, file_name))
                np.save(os.path.join(edge_path, file_name.replace("png", "npy")),
                        edge_predictions[i])

        return



    if args.evaluate:
        # Early evaluation for benchmarking
        default_eval_epoch = 1
        validate(val_loader, net, criterion_val,
                 optim, default_eval_epoch, writer)
        evaluate(val_loader, net)
        return

    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()

        train(train_loader, net, criterion, optim, epoch, writer)
        validate(val_loader, net, criterion_val,
                 optim, epoch, writer)
Example #11
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True, hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH', cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        print(str(net))
        from pytorchOpCounter.thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    if args.eval == 'val':

        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim, epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()

        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
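
update_epoch is likewise not shown; given the inline pattern in Examples #1, #5 and #6, a plausible definition (an assumption) is:

def update_epoch(epoch):
    # mirror the cfg unlock/update/lock pattern from the earlier examples
    cfg.immutable(False)
    cfg.EPOCH = epoch
    cfg.immutable(True)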
Example #12
parser.add_argument('--cv_split', type=int, default=None)
parser.add_argument('--mode', type=str, default='fine')
parser.add_argument('--split_index', type=int, default=0)
parser.add_argument('--split_count', type=int, default=1)
parser.add_argument('--num_workers', type=int, default=4)
parser.add_argument('--resume',
                    action='store_true',
                    default=False,
                    help='Resume Inference')
parser.add_argument('--batch_size',
                    type=int,
                    default=1,
                    help='Only in pooling mode')

args = parser.parse_args()
assert_and_infer_cfg(
    args, train_mode=False)  # infers the remaining params from args: dataset info, BN type, etc.
# assert_and_infer_cfg also introduces a global cfg variable, named __C inside the config file
args.apex = False  # No support for apex eval
cudnn.benchmark = False
mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
date_str = str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))


def sliding_window_cropping(data, scale=1.0):
    """
    Sliding Window Cropping
    Take the image and create a mapping and multiple crops
    """
    sliding_window_cropping = None
    mapping = {}
    crop_ctr = 0
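
The function above is cut off in this example. As a self-contained illustration of the idea (my own sketch, not the repo's implementation; the 713 crop size is borrowed from the output-shape comment in Example #10), sliding-window cropping walks a fixed-size window over the image and records each crop's origin so predictions can be stitched back together:

import torch

def _starts(size, crop, stride):
    # window start offsets, always including the final edge-aligned one
    starts = list(range(0, max(size - crop, 0) + 1, stride))
    if starts[-1] != max(size - crop, 0):
        starts.append(max(size - crop, 0))
    return starts

def sliding_window_crops(img, crop=713, stride=513):
    """img: (C, H, W) tensor -> list of (crop_tensor, (y, x)) origin pairs."""
    _, h, w = img.shape
    return [(img[:, y:y + crop, x:x + crop], (y, x))
            for y in _starts(h, crop, stride)
            for x in _starts(w, crop, stride)]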
Example #13
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=False,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir, args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    if args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        # validate(val_loader, net, criterion=None, optim=None, epoch=0,
        #          calc_metrics=False, dump_assets=args.dump_assets,
        #          dump_all_images=True)
        if not os.path.exists(args.result_dir + 'image_2/'):
            os.mkdir(args.result_dir + 'image_2/')
        if not os.path.exists(args.result_dir + 'image_3/'):
            os.mkdir(args.result_dir + 'image_3/')

        num_image = 7481
        for idx in tqdm(range(num_image)):
            sample_idx = "%06d" % idx
            eval_minibatch(sample_idx, "image_2/", net, args)
            eval_minibatch(sample_idx, "image_3/", net, args)

        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))
Example #14
    'fwavacc': 0
}

# Enable CUDNN benchmarking optimization
torch.backends.cudnn.benchmark = True
args.world_size = 1
# Test mode: run two epochs with a few iterations of training and val
if args.test_mode:
    args.max_epoch = 2

if 'WORLD_SIZE' in os.environ:
    args.world_size = int(os.environ['WORLD_SIZE'])
    print("Total world size: ", int(os.environ['WORLD_SIZE']))

# Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
assert_and_infer_cfg(args)
writer = prep_experiment(args, parser)

default_eval_epoch = 1


def main(eval_args=None):
    '''
    Main Function

    '''
    # Parse arguments from rest_communication.py
    #args = parser.parse_args(eval_args)
    if args.snapshot is None:
        args.snapshot = "checkpoints/best_cityscapes_checkpoint.pth"
Example #15
import torch
from torch.backends import cudnn
import torchvision.transforms as transforms

from options.test_options import TestOptions
import sys
sys.path.insert(0, './image_segmentation')
import network
from optimizer import restore_snapshot
from datasets import cityscapes
from config import assert_and_infer_cfg

TestOptions = TestOptions()
opt = TestOptions.parse()

assert_and_infer_cfg(opt, train_mode=False)
cudnn.benchmark = False
torch.cuda.empty_cache()

# Get segmentation Net
opt.dataset_cls = cityscapes
net = network.get_net(opt, criterion=None)
net = torch.nn.DataParallel(net).cuda()
print('Segmentation Net built.')
net, _ = restore_snapshot(net,
                          optimizer=None,
                          snapshot=opt.snapshot,
                          restore_optimizer_bool=False)
net.eval()
print('Segmentation Net Restored.')
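
restore_snapshot is imported from the repo's optimizer module; a simplified sketch of its typical behavior (an assumption that omits the repo's forgiving key-matching details):

import torch

def restore_snapshot(net, optimizer, snapshot, restore_optimizer_bool):
    checkpoint = torch.load(snapshot, map_location=torch.device('cpu'))
    if restore_optimizer_bool and optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])
    state = checkpoint.get('state_dict', checkpoint)
    net.load_state_dict(state, strict=False)  # tolerate renamed or missing keys
    return net, optimizer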
Example #16
def main():
    """
    Main Function
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    loc_dist = args.gpus > 1
    loc_rank = rank % args.gpus
    args.gpu = loc_rank
    args.local_rank = loc_rank
    if loc_dist:
        device = "cuda:" + str(loc_rank)
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "19500"
        os.environ["NCCL_SOCKET_IFNAME"] = "ib"
        torch.cuda.set_device(device)
        torch.distributed.init_process_group(backend="nccl",
                                             rank=loc_rank,
                                             world_size=args.gpus)
        # torch.cuda.set_device(device)
    elif args.gpus == 1:
        args.gpus = torch.cuda.device_count()
        device = "cuda:0"
        args.local_rank = 0
        torch.cuda.set_device(device)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    #     args.ngpu = torch.cuda.device_count()
    #     args.best_record = {'mean_iu': -1, 'epoch': 0}

    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-heat-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, "
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    net = network.get_net(args, criterion)
    net = net.to(device)
    # args.lr = (1. / args.world_size * (5 * (args.world_size - 1) / 6.)) * 0.0125 * args.world_size
    optim, scheduler = get_optimizer(args, net)

    # the scheduler in this code is only run at the end of each epoch
    # todo: make heat an option not this whole file
    # if args.heat:
    dp_optim = ht.optim.DASO(
        local_optimizer=optim,
        total_epochs=args.max_epoch,
        max_global_skips=4,
    )
    #if args.no_cycling:
    dp_optim.disable_cycling(global_skips=args.batch_skip,
                             batches_to_wait=args.gs)
    # this is where the network is wrapped with DDDP (w/apex) or DP
    htnet = ht.nn.DataParallelMultiGPU(net,
                                       comm=ht.MPI_WORLD,
                                       optimizer=dp_optim)

    if args.summary:
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
        dp_optim.stability.load_dict(checkpoint["skip_stable"])
    if args.restore_net:
        #restore_net(net, checkpoint)
        htnet.load_state_dict(checkpoint["state_dict"])
        #dp_optim.module.load_state_dist(checkpoint["state_dict"])
    # htnet = ht.nn.DataParallelMultiGPU(net, ht.MPI_WORLD, dp_optim)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    # todo: HeAT fixes -- not urgent --
    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader,
                     net,
                     criterion=criterion_val,
                     optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader,
                 net,
                 criterion=None,
                 optim=None,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    scaler = amp.GradScaler()
    if dp_optim.comm.rank == 0:
        print("scheduler", args.lr_schedule)
    dp_optim.add_scaler(scaler)

    nodes = str(int(dp_optim.comm.size / torch.cuda.device_count()))
    cwd = os.getcwd()
    fname = cwd + "/" + nodes + "-heat-citys-benchmark"
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
        print0("Output dict:", fname)

    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)

        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass

        ls, bt, btt = train(train_loader, htnet, dp_optim, epoch, scaler)
        dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)

        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, htnet, criterion_val, dp_optim,
                                epoch)
        if args.lr_schedule == "plateau":
            if dp_optim.comm.rank == 0:
                print("loss", ls, 'best:',
                      scheduler.best * (1. - scheduler.threshold),
                      scheduler.num_bad_epochs)
            scheduler.step(ls)  # could also step on the validation loss
        else:
            scheduler.step()

        if args.rank == 0:
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": htnet.state_dict(),
                "optimizer": optim.state_dict(),
                "skip_stable": dp_optim.stability.get_dict()
            })

        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)

        if args.rank == 0:
            save_obj(out_dict, fname)

    if args.rank == 0:
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None, "display.max_columns",
                               None):
            # more options can be specified also
            print(df)
        if args.benchmarking:
            try:
                fulldf = pd.read_csv(cwd + "/heat-bench-results.csv")
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/heat-bench-results.csv")
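
print0 and save_obj are used above but never defined in the snippet; minimal plausible helpers (assumptions consistent with how they are called) would be:

import pickle

def print0(*args, **kwargs):
    # print only on the root MPI process
    if ht.MPI_WORLD.rank == 0:
        print(*args, **kwargs)

def save_obj(obj, fname):
    with open(fname + ".pkl", "wb") as f:
        pickle.dump(obj, f)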
Example #17
def main():
    """
    Main Function
    """
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)

    train_loader, val_loaders, train_obj, extra_val_loaders, covstat_val_loaders = datasets.setup_loaders(
        args)

    criterion, criterion_val = loss.get_loss(args)
    criterion_aux = loss.get_loss_aux(args)
    net = network.get_net(args, criterion, criterion_aux)

    optim, scheduler = optimizer.get_optimizer(args, net)

    net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = network.warp_network_in_dataparallel(net, args.local_rank)
    epoch = 0
    i = 0

    if args.snapshot:
        epoch, mean_iu = optimizer.load_weights(net, optim, scheduler,
                                                args.snapshot,
                                                args.restore_optimizer)
        if args.restore_optimizer is True:
            iter_per_epoch = len(train_loader)
            i = iter_per_epoch * epoch
        else:
            epoch = 0

    print("#### iteration", i)
    torch.cuda.empty_cache()
    # Main Loop
    # for epoch in range(args.start_epoch, args.max_epoch):

    while i < args.max_iter:
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.ITER = i
        cfg.immutable(True)

        i = train(train_loader, net, optim, epoch, writer, scheduler,
                  args.max_iter)
        train_loader.sampler.set_epoch(epoch + 1)

        if (args.dynamic and args.use_isw and epoch % (args.cov_stat_epoch + 1) == args.cov_stat_epoch) \
           or (args.dynamic is False and args.use_isw and epoch == args.cov_stat_epoch):
            net.module.reset_mask_matrix()
            for trial in range(args.trials):
                for dataset, val_loader in covstat_val_loaders.items():
                    # gather the covariance statistics
                    validate_for_cov_stat(val_loader,
                                          dataset,
                                          net,
                                          criterion_val,
                                          optim,
                                          scheduler,
                                          epoch,
                                          writer,
                                          i,
                                          save_pth=False)
                    net.module.set_mask_matrix()

        if args.local_rank == 0:
            print("Saving pth file...")
            evaluate_eval(args,
                          net,
                          optim,
                          scheduler,
                          None,
                          None, [],
                          writer,
                          epoch,
                          "None",
                          None,
                          i,
                          save_pth=True)

        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()

        epoch += 1

    # Validation after epochs
    if len(val_loaders) == 1:
        # Run validation only once - to save models
        for dataset, val_loader in val_loaders.items():
            validate(val_loader, dataset, net, criterion_val, optim, scheduler,
                     epoch, writer, i)
    else:
        if args.local_rank == 0:
            print("Saving pth file...")
            evaluate_eval(args,
                          net,
                          optim,
                          scheduler,
                          None,
                          None, [],
                          writer,
                          epoch,
                          "None",
                          None,
                          i,
                          save_pth=True)

    for dataset, val_loader in extra_val_loaders.items():
        print("Extra validating... This won't save pth file")
        validate(val_loader,
                 dataset,
                 net,
                 criterion_val,
                 optim,
                 scheduler,
                 epoch,
                 writer,
                 i,
                 save_pth=False)
Example #18
def main():
    """
    Main Function
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    hvd.init()

    torch.manual_seed(999999)
    #if args.cuda:
    args.cuda = True
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    #torch.cuda.manual_seed(args.seed)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)
    #print("vefore assert and infer")
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    #     args.ngpu = torch.cuda.device_count()
    #     args.best_record = {'mean_iu': -1, 'epoch': 0}
    #print("before datasets / loss")
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-hvd-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, " \
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    # todo: HeAT fixes -- urgent -- DDDP / optim / scheduler
    net = network.get_net(args, criterion)
    # net = net.to(device)

    # todo: optim -> direct wrap after this, scheduler stays the same?
    optim, scheduler = get_optimizer(args, net)

    # if args.fp16:
    #     net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)
    compression = hvd.Compression.fp16  # if args.fp16_allreduce else hvd.Compression.none

    optim = hvd.DistributedOptimizer(
        optim,
        named_parameters=net.named_parameters(),
        compression=compression,
        backward_passes_per_step=1,  # args.batches_per_allreduce,
        op=hvd.Average,
        gradient_predivide_factor=1.0,  # args.gradient_predivide_factor)
    )
    #print("after hvd optimizer setup")

    if args.summary:
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        #net.loat_state_dict(checkpoint["state_dict"])
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()
    #print("before parameter broadcasts")
    #hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    #hvd.broadcast_optimizer_state(optim, root_rank=0)

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)

    #net = net.cuda()
    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    # todo: HeAT fixes -- not urgent --
    # if args.eval == 'val':
    #     if args.dump_topn:
    #         validate_topn(val_loader, net, criterion_val, optim, 0, args)
    #     else:
    #         validate(val_loader, net, criterion=criterion_val, optim=optim, epoch=0,
    #                  dump_assets=args.dump_assets,
    #                  dump_all_images=args.dump_all_images,
    #                  calc_metrics=not args.no_metrics)
    #     return 0
    # elif args.eval == 'folder':
    #     # Using a folder for evaluation means to not calculate metrics
    #     validate(val_loader, net, criterion=None, optim=None, epoch=0,
    #              calc_metrics=False, dump_assets=args.dump_assets,
    #              dump_all_images=True)
    #     return 0
    # elif args.eval is not None:
    #     raise 'unknown eval option {}'.format(args.eval)

    scaler = None  #amp.GradScaler()
    args.amp = False  #True

    nodes = str(int(hvd.size() / torch.cuda.device_count()))
    cwd = os.getcwd()
    fname = cwd + "/" + nodes + "-hvd-citys-benchmark"
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
        print0("Output dict:", fname)
    # train_losses, train_btimes, train_ttime = [], [], []
    # val_losses, val_iu, val_ttime = [], [], []

    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)

        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass

        ls, bt, btt = train(train_loader, net, optim, epoch, scaler)
        # dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)

        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, net, criterion_val, optim, epoch)
        if args.lr_schedule == "plateau":
            scheduler.step(ls)  # could also step on the validation loss
        else:
            scheduler.step()

        if args.rank == 0:
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": net.state_dict(),
                "optimizer": optim.state_dict(),
                # "skip_stable": optim.stability.get_dict()
            })

        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)
        if args.rank == 0:
            save_obj(out_dict, fname)

    if args.rank == 0:
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None, "display.max_columns",
                               None):
            # more options can be specified also
            print(df)
        if args.benchmarking:
            try:
                fulldf = pd.read_csv(cwd + "/hvd-bench-results.csv")
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/hvd-bench-results.csv")
Example #19
parser.add_argument('--cv_split', type=int, default=None)
parser.add_argument('--mode', type=str, default='fine')
parser.add_argument('--split_index', type=int, default=0)
parser.add_argument('--split_count', type=int, default=1)
parser.add_argument('--num_workers', type=int, default=4)
parser.add_argument('--resume',
                    action='store_true',
                    default=False,
                    help='Resume Inference')
parser.add_argument('--batch_size',
                    type=int,
                    default=1,
                    help='Only in pooling mode')

args = parser.parse_args()
assert_and_infer_cfg(args, train_mode=False)
args.apex = False  # No support for apex eval
cudnn.benchmark = False
mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
date_str = str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))


def sliding_window_cropping(data, scale=1.0):
    """
    Sliding Window Cropping
    Take the image and create a mapping and multiple crops
    """
    sliding_window_cropping = None
    mapping = {}
    crop_ctr = 0
    if scale < 1.0:
Example #20
def convert_segmentation_model(model_name='segmentation.onnx'):

    assert_and_infer_cfg(opt, train_mode=False)
    cudnn.benchmark = False
    torch.cuda.empty_cache()

    # Get segmentation Net
    opt.dataset_cls = cityscapes
    net = network.get_net(opt, criterion=None)
    net = torch.nn.DataParallel(net).cuda()
    print('Segmentation Net built.')
    net, _ = restore_snapshot(net,
                              optimizer=None,
                              snapshot=opt.snapshot,
                              restore_optimizer_bool=False)
    net.eval()
    print('Segmentation Net Restored.')

    # Input to the model
    batch_size = 1

    x = torch.randn(batch_size, 3, 1024, 2048, requires_grad=True).cuda()
    torch_out = net(x)

    # Export the model
    torch.onnx.export(
        net.module,  # model being run
        x,  # model input (or a tuple for multiple inputs)
        model_name,  # where to save the model (can be a file or file-like object)
        export_params=True,        # store the trained parameter weights inside the model file
        opset_version=11,          # the ONNX version to export the model to
        do_constant_folding=True,  # whether to execute constant folding for optimization
        input_names=['input'],  # the model's input names
        output_names=['output'],  # the model's output names
        dynamic_axes={
            'input': {0: 'batch_size'},   # variable length axes
            'output': {0: 'batch_size'}
        })

    ort_session = onnxruntime.InferenceSession(model_name)

    def to_numpy(tensor):
        return (tensor.detach().cpu().numpy()
                if tensor.requires_grad else tensor.cpu().numpy())

    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
    ort_outs = ort_session.run(None, ort_inputs)

    # compare ONNX Runtime and PyTorch results
    np.testing.assert_allclose(to_numpy(torch_out),
                               ort_outs[0],
                               rtol=1e-03,
                               atol=1e-03)

    print("Exported model has been tested with ONNXRuntime, and the result looks good!")
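
A hypothetical follow-up, not part of the original example: once exported, the model can be served from a fresh process with onnxruntime alone, no PyTorch required.

import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession('segmentation.onnx')
dummy = np.random.randn(1, 3, 1024, 2048).astype(np.float32)
logits = sess.run(None, {sess.get_inputs()[0].name: dummy})[0]
pred = logits.argmax(axis=1)  # (1, H, W) per-pixel class map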