Code Example #1
    def distribute_run(self):
        strategy = tf.distribute.MirroredStrategy()
        train_global_batch = self.args.train_batch * strategy.num_replicas_in_sync
        val_global_batch = self.args.val_batch * strategy.num_replicas_in_sync
        train_data, train_batch_num, val_data, val_batch_num = get_datasets(
            name=self.args.dataset,
            train_batch=train_global_batch,
            val_batch=val_global_batch)
        with strategy.scope():
            model = get_net(arch=self.args.arch,
                            num_layers=self.args.num_layers,
                            num_experts=self.args.num_experts,
                            num_classes=self.args.num_classes)
            model.build(input_shape=(None, 32, 32, 3))
            model.summary()

            optimizer = tf.keras.optimizers.SGD(learning_rate=self.args.lr,
                                                momentum=0.9,
                                                decay=0.0001,
                                                nesterov=True)

            dis_trainer = DisTrainer(strategy=strategy,
                                     model=model,
                                     optimizer=optimizer,
                                     epochs=self.args.epochs,
                                     val_data=val_data,
                                     train_batch=self.args.train_batch,
                                     val_batch=self.args.val_batch,
                                     train_data=train_data,
                                     log_dir=self.log_dir,
                                     model_save_path=self.model_save_path,
                                     train_batch_num=train_batch_num,
                                     val_batch_num=val_batch_num)

            dis_trainer(resume=self.args.resume, val=self.args.val)
Code Example #2
    def run(self):
        train_data, train_batch_num, val_data, val_batch_num = get_datasets(
            name=self.args.dataset,
            train_batch=self.args.train_batch,
            val_batch=self.args.val_batch)
        model = get_net(arch=self.args.arch,
                        num_layers=self.args.num_layers,
                        num_experts=self.args.num_experts,
                        num_classes=self.args.num_classes)
        model.build(input_shape=(None, 32, 32, 3))
        model.summary()

        optimizer = tf.keras.optimizers.SGD(learning_rate=self.args.lr,
                                            momentum=0.9,
                                            decay=0.0001,
                                            nesterov=True)

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          epochs=self.args.epochs,
                          val_data=val_data,
                          train_batch=self.args.train_batch,
                          val_batch=self.args.val_batch,
                          train_data=train_data,
                          log_dir=self.log_dir,
                          model_save_path=self.model_save_path,
                          train_batch_num=train_batch_num,
                          val_batch_num=val_batch_num)

        trainer(resume=self.args.resume, val=self.args.val)
Code Example #3
def setup_net(snapshot):
    """Quickly create a network for the given snapshot.
    
    Arguments:
        snapshot {string} -- Input snapshot, e.g. kitti_best.pth
    
    Returns:
        [net, transform] -- PyTorch model & the image transform function.
    """
    cudnn.benchmark = False
    torch.cuda.empty_cache()

    args = Args('./save', 'network.deepv3.DeepWV3Plus', snapshot)

    assert_and_infer_cfg(args, train_mode=False)
    # get net
    net = network.get_net(args, criterion=None)
    net = torch.nn.DataParallel(net).cuda()
    print('Net built.')

    net, _ = restore_snapshot(net,
                              optimizer=None,
                              snapshot=snapshot,
                              restore_optimizer_bool=False)
    net.eval()
    print('Net restored.')

    # get data
    mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    img_transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(*mean_std)])

    return net, img_transform, args
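The helper above returns the network, the image transform, and the parsed args. A minimal usage sketch follows; the image path and the assumed (1, C, H, W) logits shape are illustrative, not part of the original snippet:

import torch
from PIL import Image

net, img_transform, args = setup_net('kitti_best.pth')

img = Image.open('./test_imgs/kitti-13.png').convert('RGB')  # hypothetical path
x = img_transform(img).unsqueeze(0).cuda()  # add a batch dimension

with torch.no_grad():
    logits = net(x)                     # assumed shape: (1, C, H, W)
pred = logits.argmax(dim=1).squeeze(0)  # per-pixel class ids, (H, W)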
Code Example #4
File: train.py Project: DailyActie/AI_DL_NET-GSCNN
def main():
    '''
    Main Function

    '''

    #Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)

    torch.cuda.empty_cache()

    if args.evaluate:
        # Early evaluation for benchmarking; no training epoch has run yet
        epoch = 0
        validate(val_loader, net, criterion_val, optim, epoch, writer)
        evaluate(val_loader, net)
        return

    #Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()

        train(train_loader, net, criterion, optim, epoch, writer)
        validate(val_loader, net, criterion_val, optim, epoch, writer)
Code Example #5
def main():
    """
    Main Function
    """
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args, parser)
    writer = None

    _, _, _, extra_val_loaders, _ = datasets.setup_loaders(args)

    criterion, criterion_val = loss.get_loss(args)
    criterion_aux = loss.get_loss_aux(args)
    net = network.get_net(args, criterion, criterion_aux)

    optim, scheduler = optimizer.get_optimizer(args, net)

    net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = network.warp_network_in_dataparallel(net, args.local_rank)
    epoch = 0
    i = 0

    if args.snapshot:
        epoch, mean_iu = optimizer.load_weights(net, optim, scheduler,
                                                args.snapshot, args.restore_optimizer)

    print("#### iteration", i)
    torch.cuda.empty_cache()
    # Main Loop
    # for epoch in range(args.start_epoch, args.max_epoch):

    for dataset, val_loader in extra_val_loaders.items():
        print("Extra validating... This won't save pth file")
        validate(val_loader, dataset, net, criterion_val, optim, scheduler, epoch, writer, i, save_pth=False)
Code Example #6
def main_worker(gpu, ngpus_per_node):
    print('Use GPU:', gpu)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://localhost:23456',
                            world_size=ngpus_per_node,
                            rank=gpu)
    print('Group initialized.')

    model_dnet = network.get_net()

    saver = M.Saver(model_dnet)
    saver.restore('./model/')
    # model_dnet.bn_eps(1e-5)
    model = loss.ModelWithLoss(model_dnet)

    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    model.train()
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    print('Model get.')

    loader, sampler = datareader.get_train_dataloader(64)
    optim = torch.optim.AdamW(model.parameters(), lr=config.init_lr)

    for e in range(75, config.max_epoch):
        print('Replica:%d Epoch:%d' % (gpu, e))
        sampler.set_epoch(e)
        for i, (img, hmap) in enumerate(loader):
            # print(img.shape, hmap.shape, hmap_match.shape)
            optim.zero_grad()
            hmap_loss, outs = model(img, hmap)

            hmap_loss = hmap_loss.mean()

            loss_total = hmap_loss
            loss_total.backward()
            optim.step()
            lr = optim.param_groups[0]['lr']

            if i % 200 == 0 and gpu == 0:
                if not os.path.exists('./outputs/'):
                    os.mkdir('./outputs/')

                visutil.vis_batch(img, outs, './outputs/%d_out.jpg' % i)
                visutil.vis_batch(img, hmap, './outputs/%d_gt.jpg' % i)

            if i % 20 == 0:
                curr_time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
                print('%s  Replica:%d  Progress:%d/%d  LsC:%.3e LR:%.1e' %
                      (curr_time, gpu, i, len(loader), hmap_loss, lr))

        if e in config.lr_epoch:
            newlr = lr * 0.1
            for param_group in optim.param_groups:
                param_group['lr'] = newlr

        if e % config.save_interval == 0 and gpu == 0:
            stamp = random.randint(0, 1000000)
            saver.save('./model/%d_%d.pth' % (e, stamp))
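main_worker takes the process index as its first argument, matching the calling convention of torch.multiprocessing.spawn. A minimal launcher sketch, not shown in the original snippet:

import torch
import torch.multiprocessing as mp

if __name__ == '__main__':
    ngpus = torch.cuda.device_count()
    # spawn calls main_worker(gpu, ngpus) once per process,
    # passing the process index as the gpu id
    mp.spawn(main_worker, nprocs=ngpus, args=(ngpus,))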
Code Example #7
def get_net(optimizer=None, criterion=None):
    """
    Get network for train
    :return:
    """
    net = network.get_net(args, criterion=criterion)
    net, _ = restore_snapshot(net,
                              optimizer=optimizer,
                              snapshot=args.snapshot,
                              restore_optimizer_bool=False)
    #net.train()
    return net
Code Example #8
def get_net():
    """
    Get Network for evaluation
    """
    logging.info('Load model file: %s', args.snapshot)
    net = network.get_net(args, criterion=None)
    net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = network.warp_network_in_dataparallel(net, args.local_rank)
    net, _, _, _, _ = restore_snapshot(net, optimizer=None, scheduler=None,
                                       snapshot=args.snapshot, restore_optimizer_bool=False)

    net.eval()
    return net
Code Example #9
def main():

    """
    Main Function
    """

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)

    if args.fix_bn:
        net.apply(set_bn_eval)
        print("Fix bn for finetuning")

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level="O1")

    net = network.wrap_network_in_dataparallel(net, args.apex)
    if args.snapshot:
        optimizer.load_weights(net, optim,
                               args.snapshot, args.restore_optimizer)
    if args.evaluateF:
        assert args.snapshot is not None, "must load weights for evaluation"
        evaluate(val_loader, net, args)
        return
    torch.cuda.empty_cache()
    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()
        train(train_loader, net, optim, epoch, writer)
        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)
        validate(val_loader, net, criterion_val,
                 optim, epoch, writer)
        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
Code Example #10
def get_net():
    """
    Get Network for evaluation
    """
    logging.info('Load model file: %s', args.snapshot)
    net = network.get_net(args, criterion=None)
    if args.inference_mode == 'pooling':
        net = MyDataParallel(net, gather=False).cuda()
    else:
        net = torch.nn.DataParallel(net).cuda()
    net, _ = restore_snapshot(net, optimizer=None,
                              snapshot=args.snapshot, restore_optimizer_bool=False)
    net.eval()
    return net
Code Example #11
File: estimator.py Project: repo-collection/synboost
    def get_segmentation(self):
        # Get Segmentation Net
        assert_and_infer_cfg(self.opt, train_mode=False)
        self.opt.dataset_cls = cityscapes
        net = network.get_net(self.opt, criterion=None)
        net = torch.nn.DataParallel(net).cuda()
        print('Segmentation Net Built.')
        snapshot = os.path.join(os.getcwd(), os.path.dirname(__file__),
                                self.opt.snapshot)
        self.seg_net, _ = restore_snapshot(net,
                                           optimizer=None,
                                           snapshot=snapshot,
                                           restore_optimizer_bool=False)
        self.seg_net.eval()
        print('Segmentation Net Restored.')
Code Example #12
def main(eval_args=None):
    '''
    Main Function

    '''
    # Parse arguments from rest_communication.py
    #args = parser.parse_args(eval_args)
    if args.snapshot is None:
        args.snapshot = "checkpoints/best_cityscapes_checkpoint.pth"

    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    net = restore_snapshot(net)
    torch.cuda.empty_cache()

    return evaluate(val_loader, net)
Code Example #13
def main():
    """
    Main Function
    """

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level="O1")

    net = network.warp_network_in_dataparallel(net, args.apex)
    if args.snapshot:
        optimizer.load_weights(net, optim, args.snapshot,
                               args.restore_optimizer)

    torch.cuda.empty_cache()
    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        #snapshot="/srv/beegfs02/scratch/language_vision/data/Sound_Event_Prediction/audio/semanticPred/logs/ckpt/default/Omni-network.deepv3_audioBG_Spec_diffmask_Comp_noBG_Paralleltask_depth_noSeman.DeepWV3Plus/models_depth/SOP_epoch_"+str(14)+".pth"
        #optimizer.load_weights(net, optim,
        #                       snapshot, args.restore_optimizer)
        scheduler.step()
        train(train_loader, net, optim, epoch, writer)
        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)
        validate(val_loader, net, criterion_val, optim, epoch, writer)
        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
Code Example #14
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True, hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH', cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        print(str(net))
        from pytorchOpCounter.thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    if args.eval == 'val':

        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim, epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()

        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
Code Example #15
import network
from optimizer import restore_snapshot
from datasets import cityscapes, kitti
# We only need BN layer
from config import infer_cfg, cfg

infer_cfg(train_mode=False)
cudnn.benchmark = False
torch.cuda.empty_cache()

img_dir = './test_imgs/'
# get net
arch = 'network.deepv3.DeepWV3Plus'
dataset_cls = kitti
net = network.get_net(arch, dataset_cls, criterion=None)
net = torch.nn.DataParallel(net).cuda()
print('Net built.')
ckpt_path = './kitti_best.pth'
ckpt = torch.load(ckpt_path)
net.load_state_dict(ckpt['state_dict'])
#net, _ = restore_snapshot(net, optimizer=None, snapshot=ckpt_path, restore_optimizer_bool=False)
net.eval()
print('Net restored.')

# get data
demo_img_path = os.path.join(img_dir, 'kitti-13.png')
mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
img_transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize(*mean_std)])
Code Example #16
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=False,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir, args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    if args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        # validate(val_loader, net, criterion=None, optim=None, epoch=0,
        #          calc_metrics=False, dump_assets=args.dump_assets,
        #          dump_all_images=True)
        if not os.path.exists(args.result_dir + 'image_2/'):
            os.mkdir(args.result_dir + 'image_2/')
        if not os.path.exists(args.result_dir + 'image_3/'):
            os.mkdir(args.result_dir + 'image_3/')

        num_image = 7481
        for idx in tqdm(range(num_image)):
            sample_idx = "%06d" % idx
            eval_minibatch(sample_idx, "image_2/", net, args)
            eval_minibatch(sample_idx, "image_3/", net, args)

        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))
Code Example #17
def convert_segmentation_model(model_name='segmentation.onnx'):

    assert_and_infer_cfg(opt, train_mode=False)
    cudnn.benchmark = False
    torch.cuda.empty_cache()

    # Get segmentation Net
    opt.dataset_cls = cityscapes
    net = network.get_net(opt, criterion=None)
    net = torch.nn.DataParallel(net).cuda()
    print('Segmentation Net built.')
    net, _ = restore_snapshot(net,
                              optimizer=None,
                              snapshot=opt.snapshot,
                              restore_optimizer_bool=False)
    net.eval()
    print('Segmentation Net Restored.')

    # Input to the model
    batch_size = 1

    x = torch.randn(batch_size, 3, 1024, 2048, requires_grad=True).cuda()
    torch_out = net(x)

    # Export the model
    torch.onnx.export(
        net.module,   # model being run
        x,            # model input (or a tuple for multiple inputs)
        model_name,   # where to save the model (can be a file or file-like object)
        export_params=True,        # store the trained parameter weights inside the model file
        opset_version=11,          # the ONNX version to export the model to
        do_constant_folding=True,  # whether to execute constant folding for optimization
        input_names=['input'],     # the model's input names
        output_names=['output'],   # the model's output names
        dynamic_axes={
            'input': {0: 'batch_size'},   # variable length axes
            'output': {0: 'batch_size'}
        })

    ort_session = onnxruntime.InferenceSession(model_name)

    def to_numpy(tensor):
        return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
    ort_outs = ort_session.run(None, ort_inputs)

    # compare ONNX Runtime and PyTorch results
    np.testing.assert_allclose(to_numpy(torch_out),
                               ort_outs[0],
                               rtol=1e-03,
                               atol=1e-03)

    print("Exported model has been tested with ONNXRuntime, and the result looks good!")
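Once exported, the model can be reloaded in a fresh process. A short sketch assuming a recent onnxruntime (CPUExecutionProvider is the always-available fallback); a batch size of 2 exercises the dynamic batch axis declared above:

import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession('segmentation.onnx',
                                    providers=['CPUExecutionProvider'])
dummy = np.random.randn(2, 3, 1024, 2048).astype(np.float32)
out = sess.run(None, {'input': dummy})[0]
print(out.shape)  # leading dimension follows the input batch size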
Code Example #18
def main():
    """
    Main Function
    """

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    #    args2 = copy.deepcopy(args)
    assert_and_infer_cfg(args)
    #    assert_and_infer_cfg(args2)
    #    args2.dataset = 'kitti_trav'
    #    print(args.dataset)
    #    print(args2.dataset)
    writer = prep_experiment(args, parser)
    #    writer = prep_experiment(args2, parser)

    # Dataset
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    #    train_loader2, val_loader2, train_obj2 = datasets.setup_loaders(args2)
    criterion, criterion_val = loss.get_loss(args, data_type='semantic')
    criterion2, criterion_val2 = loss.get_loss(args, data_type='trav')
    net = network.get_net(args, criterion, criterion2)

    #parameters list
    #    param1_lists = list(net.mod1.parameters()) + list(net.mod2.parameters()) + list(net.mod3.parameters()) + list(net.mod4.parameters()) + list(net.mod5.parameters()) + list(net.mod6.parameters()) + list(net.mod7.parameters()) + list(net.pool2.parameters()) + list(net.pool3.parameters()) + list(net.aspp.parameters()) + list(net.bot_fine.parameters()) + list(net.bot_aspp.parameters()) + list(net.final.parameters()) + [log_sigma_A]
    #    param2_lists = list(net.mod1.parameters()) + list(net.mod2.parameters()) + list(net.mod3.parameters()) + list(net.mod4.parameters()) + list(net.mod5.parameters()) + list(net.mod6.parameters()) + list(net.mod7.parameters()) + list(net.pool2.parameters()) + list(net.pool3.parameters()) + list(net.aspp.parameters()) + list(net.bot_fine.parameters()) + list(net.bot_aspp.parameters()) + list(net.final2.parameters()) + [log_sigma_B]

    #optimizers
    optim, scheduler = optimizer.get_optimizer(args, net)
    #    optim2, scheduler2 = optimizer.get_optimizer(args, param2_lists)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level="O1")

    net = network.wrap_network_in_dataparallel(net, args.apex)
    if args.snapshot:
        optimizer.load_weights(net, optim, args.snapshot, args.snapshot2,
                               args.restore_optimizer)
#        optimizer.load_weights(net, optim2,
#                               args.snapshot, args.snapshot2, args.restore_optimizer)

    torch.cuda.empty_cache()
    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()
        train(train_loader, net, optim, epoch, writer)
        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)
#            train_loader2.sampler.set_epoch(epoch + 1)
        validate(val_loader, net, criterion_val, criterion_val2, optim, epoch,
                 writer)
        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                #                train_obj2.build_epoch(cut=True)
                if args.apex:
                    train_loader.sampler.set_num_samples()
#                    train_loader2.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
Code Example #19
# start experiment
n_pool = len(Y_tr)
n_test = len(Y_te)
print('number of labeled pool: {}'.format(NUM_INIT_LB))
print('number of unlabeled pool: {}'.format(n_pool - NUM_INIT_LB))
print('number of testing pool: {}'.format(n_test))

# generate initial labeled pool
idxs_lb = np.zeros(n_pool, dtype=bool)
idxs_tmp = np.arange(n_pool)
np.random.shuffle(idxs_tmp)
idxs_lb[idxs_tmp[:NUM_INIT_LB]] = True

# load network
net = get_net(DATA_NAME)
handler = get_handler(DATA_NAME)
strategy = RandomSampling(X_tr, Y_tr, idxs_lb, net, handler, args)

# print info
print(DATA_NAME)
print(type(strategy).__name__)

# round 0 accuracy
strategy.set_test_data(x=X_te, y=Y_te)
strategy.train()
P = strategy.predict(X_te, Y_te)
acc = np.zeros(NUM_ROUND + 1)
acc[0] = 1.0 * (torch.max(Y_te, 1)[1] == P).sum().item() / len(Y_te)
print('Round 0\ntesting accuracy {}'.format(acc[0]))
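The snippet stops after the round-0 accuracy; later rounds would query new labels, retrain, and re-evaluate. A hedged sketch of that loop, assuming the strategy exposes query/update methods and a NUM_QUERY budget (names not confirmed by the snippet above):

for rd in range(1, NUM_ROUND + 1):
    # Hypothetical API: ask the strategy for NUM_QUERY unlabeled indices.
    q_idxs = strategy.query(NUM_QUERY)
    idxs_lb[q_idxs] = True
    strategy.update(idxs_lb)  # hypothetical: refresh the labeled pool

    strategy.train()
    P = strategy.predict(X_te, Y_te)
    acc[rd] = 1.0 * (torch.max(Y_te, 1)[1] == P).sum().item() / len(Y_te)
    print('Round {}\ntesting accuracy {}'.format(rd, acc[rd]))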
Code Example #20
def main():
    '''
    Main Function

    '''

    #Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)

    torch.cuda.empty_cache()

    if args.mode=="test":
        test_sv_path = args.test_sv_path
        print(f"Saving prediction {test_sv_path}")
        net.eval()
        for vi, data in enumerate(tqdm(val_loader)):
            input, mask, img_name, img_path = data
            assert len(input.size()) == 4 and len(mask.size()) == 3
            assert input.size()[2:] == mask.size()[1:]
            b, h, w = mask.size()

            batch_pixel_size = input.size(0) * input.size(2) * input.size(3)
            input, mask_cuda = input.cuda(), mask.cuda()

            with torch.no_grad():
                seg_out, edge_out = net(input)    # output = (1, 19, 713, 713)

            seg_predictions = seg_out.data.cpu().numpy()
            edge_predictions = edge_out.cpu().numpy()
            for i in range(b):
                _, file_name = os.path.split(img_path[i])
                file_name = file_name.replace("jpg", "png")
                seq = img_path[i][:5]
                seg_path = os.path.join(test_sv_path, "gscnn", "seg", seq)
                if not os.path.exists(seg_path):
                    os.makedirs(seg_path)
                edge_path = os.path.join(test_sv_path, "gscnn", "edge", seq)
                edgenp_path = os.path.join(test_sv_path, "gscnn", "edgenp", seq)
                if not os.path.exists(edge_path):
                    os.makedirs(edge_path)
                    os.makedirs(edgenp_path)
                seg_arg = np.argmax(seg_predictions[i], axis=0).astype(np.uint8)
                edge_arg = np.argmax(edge_predictions[i], axis=0).astype(np.uint8)

                seg_img = np.stack((seg_arg, seg_arg, seg_arg), axis=2)
                edge_img = np.stack((edge_arg, edge_arg, edge_arg), axis=2)
                seg_img = Image.fromarray(seg_img)
                seg_img.save(os.path.join(seg_path, file_name))
                edge_img = Image.fromarray(edge_img)
                edge_img.save(os.path.join(edge_path, file_name))
                np.save(os.path.join(edge_path, file_name.replace("png", "npy")),
                        edge_predictions[i])

        return

    if args.evaluate:
        # Early evaluation for benchmarking
        default_eval_epoch = 1
        validate(val_loader, net, criterion_val,
                 optim, default_eval_epoch, writer)
        evaluate(val_loader, net)
        return

    #Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()

        train(train_loader, net, criterion, optim, epoch, writer)
        validate(val_loader, net, criterion_val,
                 optim, epoch, writer)
Code Example #21
from datasets import cityscapes
from config import assert_and_infer_cfg

parser = argparse.ArgumentParser(description='demo')
parser.add_argument('--demo-image', type=str, default='', help='path to demo image', required=True)
parser.add_argument('--snapshot', type=str, default='./pretrained_models/cityscapes_best_wideresnet38.pth', help='pre-trained checkpoint', required=True)
parser.add_argument('--arch', type=str, default='network.deepv3.DeepWV3Plus', help='network architecture used for inference')
parser.add_argument('--save-dir', type=str, default='./save', help='path to save your results')
args = parser.parse_args()
assert_and_infer_cfg(args, train_mode=False)
cudnn.benchmark = False
torch.cuda.empty_cache()

# get net
args.dataset_cls = cityscapes
net = network.get_net(args, criterion=None)
net = torch.nn.DataParallel(net).cuda()
print('Net built.')
net, _ = restore_snapshot(net, optimizer=None, snapshot=args.snapshot, restore_optimizer_bool=False)
net.eval()
print('Net restored.')

# get data
mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
img_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(*mean_std)])
img = Image.open(args.demo_image).convert('RGB')
img_tensor = img_transform(img)

# predict
with torch.no_grad():
    img = img_tensor.unsqueeze(0).cuda()
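    # -- The original snippet is cut off here; a minimal continuation sketch --
    # Assumptions (not in the original): numpy as np and os are imported, the
    # net returns (1, C, H, W) logits, and the save step is illustrative.
    pred = net(img)

pred = pred.argmax(dim=1).squeeze(0).cpu().numpy().astype(np.uint8)
# Save the raw class-id map; colorizing it would need the dataset palette.
Image.fromarray(pred).save(os.path.join(args.save_dir, 'prediction.png'))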
Code Example #22
def main():
    """
    Main Function
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    hvd.init()

    torch.manual_seed(999999)
    #if args.cuda:
    args.cuda = True
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    #torch.cuda.manual_seed(args.seed)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)
    #print("vefore assert and infer")
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    #     args.ngpu = torch.cuda.device_count()
    #     args.best_record = {'mean_iu': -1, 'epoch': 0}
    #print("before datasets / loss")
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-hvd-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, " \
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    # todo: HeAT fixes -- urgent -- DDDP / optim / scheduler
    net = network.get_net(args, criterion)
    # net = net.to(device)

    # todo: optim -> direct wrap after this, scheduler stays the same?
    optim, scheduler = get_optimizer(args, net)

    # if args.fp16:
    #     net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)
    compression = hvd.Compression.fp16  # if args.fp16_allreduce else hvd.Compression.none

    optim = hvd.DistributedOptimizer(
        optim,
        named_parameters=net.named_parameters(),
        compression=compression,
        backward_passes_per_step=1,  # args.batches_per_allreduce,
        op=hvd.Average,
        gradient_predivide_factor=1.0,  # args.gradient_predivide_factor)
    )
    #print("after hvd optimizer setup")

    if args.summary:
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        #net.loat_state_dict(checkpoint["state_dict"])
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()
    #print("before parameter broadcasts")
    #hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    #hvd.broadcast_optimizer_state(optim, root_rank=0)

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)

    #net = net.cuda()
    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    # todo: HeAT fixes -- not urgent --
    # if args.eval == 'val':
    #     if args.dump_topn:
    #         validate_topn(val_loader, net, criterion_val, optim, 0, args)
    #     else:
    #         validate(val_loader, net, criterion=criterion_val, optim=optim, epoch=0,
    #                  dump_assets=args.dump_assets,
    #                  dump_all_images=args.dump_all_images,
    #                  calc_metrics=not args.no_metrics)
    #     return 0
    # elif args.eval == 'folder':
    #     # Using a folder for evaluation means to not calculate metrics
    #     validate(val_loader, net, criterion=None, optim=None, epoch=0,
    #              calc_metrics=False, dump_assets=args.dump_assets,
    #              dump_all_images=True)
    #     return 0
    # elif args.eval is not None:
    #     raise 'unknown eval option {}'.format(args.eval)

    scaler = None  #amp.GradScaler()
    args.amp = False  #True

    nodes = str(int(hvd.size() / torch.cuda.device_count()))
    cwd = os.getcwd()
    fname = cwd + "/" + nodes + "-hvd-citys-benchmark"
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
        print0("Output dict:", fname)
    # train_losses, train_btimes, train_ttime = [], [], []
    # val_losses, val_iu, val_ttime = [], [], []

    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)

        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass

        ls, bt, btt = train(train_loader, net, optim, epoch, scaler)
        # dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)

        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, net, criterion_val, optim, epoch)
        if args.lr_schedule == "plateau":
            scheduler.step(ls)  # val_loss)
        else:
            scheduler.step()

        if args.rank == 0:
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": net.state_dict(),
                "optimizer": optim.state_dict(),
                # "skip_stable": optim.stability.get_dict()
            })

        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)
        if args.rank == 0:
            save_obj(out_dict, fname)

    if args.rank == 0:
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None, "display.max_columns",
                               None):
            # more options can be specified also
            print(df)
        if args.benchmarking:
            try:
                fulldf = pd.read_csv(cwd + "/hvd-bench-results.csv")
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/hvd-bench-results.csv")
Code Example #23
sys.path.insert(0, './image_segmentation')
import network
from optimizer import restore_snapshot
from datasets import cityscapes
from config import assert_and_infer_cfg

test_options = TestOptions()
opt = test_options.parse()

assert_and_infer_cfg(opt, train_mode=False)
cudnn.benchmark = False
torch.cuda.empty_cache()

# Get segmentation Net
opt.dataset_cls = cityscapes
net = network.get_net(opt, criterion=None)
net = torch.nn.DataParallel(net).cuda()
print('Segmentation Net built.')
net, _ = restore_snapshot(net,
                          optimizer=None,
                          snapshot=opt.snapshot,
                          restore_optimizer_bool=False)
net.eval()
print('Segmentation Net Restored.')

# Get RGB Original Images
data_dir = opt.demo_folder
images = os.listdir(data_dir)
if len(images) == 0:
    print('There are no images at directory %s. Check the data path.' %
          (data_dir))
Code Example #24
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir, args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    #define the NASA optimizer parameter
    iter_tot = len(train_loader) * args.max_epoch
    #    tau = args.tau_factor/sqrt(iter_tot)
    tau = 1
    net = network.get_net(args, criterion)
    k = 1
    #    optim, scheduler = get_optimizer(args, net)
    optim, scheduler = get_optimizer(args, net, tau, k)
    # Visualize feature maps
    #activation = {}
    #def get_activation(name):
    #def hook(model, input, output):
    #activation[name] = output.detach()
    #return hook

    #net.layer[0].register_forward_hook(get_activation('conv1'))
    #data, _ = dataset[0]
    #data.unsqueeze_(0)
    #output = model(data)

    #act = activation['conv1'].squeeze()
    #fig, axarr = plt.subplots(act.size(0))
    #for idx in range(act.size(0)):
    #axarr[idx].imshow(act[idx])

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:

        from thop import profile
        img = torch.randn(1, 3, 640, 640).cuda()
        mask = torch.randn(1, 1, 640, 640).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets

    if args.eval == 'test':
        validate(val_loader,
                 net,
                 criterion=None,
                 optim=None,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True,
                 testing=True,
                 grid=city)

        return 0

    if args.eval == 'val':

        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader,
                     net,
                     criterion=criterion_val,
                     optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader,
                 net,
                 criterion=criterion_val,
                 optim=optim,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()

        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
Code Example #25
File: train.py Project: shachoi/RobustNet
def main():
    """
    Main Function
    """
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)

    train_loader, val_loaders, train_obj, extra_val_loaders, covstat_val_loaders = datasets.setup_loaders(
        args)

    criterion, criterion_val = loss.get_loss(args)
    criterion_aux = loss.get_loss_aux(args)
    net = network.get_net(args, criterion, criterion_aux)

    optim, scheduler = optimizer.get_optimizer(args, net)

    net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = network.warp_network_in_dataparallel(net, args.local_rank)
    epoch = 0
    i = 0

    if args.snapshot:
        epoch, mean_iu = optimizer.load_weights(net, optim, scheduler,
                                                args.snapshot,
                                                args.restore_optimizer)
        if args.restore_optimizer is True:
            iter_per_epoch = len(train_loader)
            i = iter_per_epoch * epoch
        else:
            epoch = 0

    print("#### iteration", i)
    torch.cuda.empty_cache()
    # Main Loop
    # for epoch in range(args.start_epoch, args.max_epoch):

    while i < args.max_iter:
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.ITER = i
        cfg.immutable(True)

        i = train(train_loader, net, optim, epoch, writer, scheduler,
                  args.max_iter)
        train_loader.sampler.set_epoch(epoch + 1)

        if (args.dynamic and args.use_isw and epoch % (args.cov_stat_epoch + 1) == args.cov_stat_epoch) \
           or (args.dynamic is False and args.use_isw and epoch == args.cov_stat_epoch):
            net.module.reset_mask_matrix()
            for trial in range(args.trials):
                # gather covariance statistics over the val loaders
                for dataset, val_loader in covstat_val_loaders.items():
                    validate_for_cov_stat(val_loader,
                                          dataset,
                                          net,
                                          criterion_val,
                                          optim,
                                          scheduler,
                                          epoch,
                                          writer,
                                          i,
                                          save_pth=False)
                    net.module.set_mask_matrix()

        if args.local_rank == 0:
            print("Saving pth file...")
            evaluate_eval(args,
                          net,
                          optim,
                          scheduler,
                          None,
                          None, [],
                          writer,
                          epoch,
                          "None",
                          None,
                          i,
                          save_pth=True)

        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()

        epoch += 1

    # Validation after epochs
    if len(val_loaders) == 1:
        # Run validation only one time - To save models
        for dataset, val_loader in val_loaders.items():
            validate(val_loader, dataset, net, criterion_val, optim, scheduler,
                     epoch, writer, i)
    else:
        if args.local_rank == 0:
            print("Saving pth file...")
            evaluate_eval(args,
                          net,
                          optim,
                          scheduler,
                          None,
                          None, [],
                          writer,
                          epoch,
                          "None",
                          None,
                          i,
                          save_pth=True)

    for dataset, val_loader in extra_val_loaders.items():
        print("Extra validating... This won't save pth file")
        validate(val_loader,
                 dataset,
                 net,
                 criterion_val,
                 optim,
                 scheduler,
                 epoch,
                 writer,
                 i,
                 save_pth=False)
Code Example #26
def main():
    """
    Main Function
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    loc_dist = True if args.gpus > 1 else False
    loc_rank = rank % args.gpus
    args.gpu = loc_rank
    args.local_rank = loc_rank
    if loc_dist:
        device = "cuda:" + str(loc_rank)
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "19500"
        os.environ["NCCL_SOCKET_IFNAME"] = "ib"
        torch.cuda.set_device(device)
        torch.distributed.init_process_group(backend="nccl",
                                             rank=loc_rank,
                                             world_size=args.gpus)
        # torch.cuda.set_device(device)
    elif args.gpus == 1:
        args.gpus = torch.cuda.device_count()
        device = "cuda:0"
        args.local_rank = 0
        torch.cuda.set_device(device)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    #     args.ngpu = torch.cuda.device_count()
    #     args.best_record = {'mean_iu': -1, 'epoch': 0}

    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-heat-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, "
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    net = network.get_net(args, criterion)
    net = net.to(device)
    # args.lr = (1. / args.world_size * (5 * (args.world_size - 1) / 6.)) * 0.0125 * args.world_size
    optim, scheduler = get_optimizer(args, net)

    # the scheduler in this code is only run at the end of each epoch
    # todo: make HeAT an option instead of a whole separate file
    # if args.heat:
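    # DASO wraps the local SGD optimizer for hierarchical data parallelism;
    # as far as I can tell from the HeAT API, it synchronizes within a node
    # every batch and may skip up to `max_global_skips` global (MPI) syncs.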
    dp_optim = ht.optim.DASO(
        local_optimizer=optim,
        total_epochs=args.max_epoch,
        max_global_skips=4,
    )
    # if args.no_cycling:  (guard commented out -- cycling is disabled unconditionally)
    dp_optim.disable_cycling(global_skips=args.batch_skip,
                             batches_to_wait=args.gs)
    # this is where the network is wrapped with DDP (w/ apex) or DP
    htnet = ht.nn.DataParallelMultiGPU(net,
                                       comm=ht.MPI_WORLD,
                                       optimizer=dp_optim)

    if args.summary:
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        # thop's profile() unpacks `inputs` positionally, so pass the input
        # dict as a one-element tuple (forward() here takes a single dict)
        macs, params = profile(net, inputs=({'images': img, 'gts': mask},))
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
        dp_optim.stability.load_dict(checkpoint["skip_stable"])
    if args.restore_net:
        #restore_net(net, checkpoint)
        htnet.load_state_dict(checkpoint["state_dict"])
        #dp_optim.module.load_state_dist(checkpoint["state_dict"])
    # htnet = ht.nn.DataParallelMultiGPU(net, ht.MPI_WORLD, dp_optim)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    # todo: HeAT fixes -- not urgent --
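    # Hypothetical invocations (assuming this script is run as train.py):
    #   python train.py --eval val --dump_assets   # validate, dump all assets
    #   python train.py --eval folder              # dump images, skip metrics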
    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader,
                     net,
                     criterion=criterion_val,
                     optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader,
                 net,
                 criterion=None,
                 optim=None,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    scaler = amp.GradScaler()
    if dp_optim.comm.rank == 0:
        print("scheduler", args.lr_schedule)
    dp_optim.add_scaler(scaler)

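    # Benchmark bookkeeping: metrics are keyed by node count so that runs at
    # different scales can later be merged into a single results CSV (below).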
    nodes = str(dp_optim.comm.size // torch.cuda.device_count())
    cwd = os.getcwd()
    fname = os.path.join(cwd, nodes + "-heat-citys-benchmark")
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
        print0("Output dict:", fname)

    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)

        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass

        ls, bt, btt = train(train_loader, htnet, dp_optim, epoch, scaler)
        dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)

        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, htnet, criterion_val, dp_optim,
                                epoch)
        if args.lr_schedule == "plateau":
            if dp_optim.comm.rank == 0:
                print("loss", ls, 'best:',
                      scheduler.best * (1. - scheduler.threshold),
                      scheduler.num_bad_epochs)
            scheduler.step(ls)  # steps on the train loss; the val loss (vls) is an alternative
        else:
            scheduler.step()

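        # only rank 0 writes the checkpoint; storing DASO's "skip_stable"
        # state means a resumed run keeps the same skip schedule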
        if args.rank == 0:
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": htnet.state_dict(),
                "optimizer": optim.state_dict(),
                "skip_stable": dp_optim.stability.get_dict()
            })

        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)

        if args.rank == 0:
            save_obj(out_dict, fname)

    if args.rank == 0:
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None,
                               "display.max_columns", None):
            # print the full table without pandas truncation
            print(df)
        if args.benchmarking:
            try:
                # index_col=0 restores the saved "epochs" index so the
                # column-wise concat aligns on epochs
                fulldf = pd.read_csv(cwd + "/heat-bench-results.csv",
                                     index_col=0)
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/heat-bench-results.csv")
Code Example #27
                    default='./save',
                    help='path to save your results')
args = parser.parse_args()
assert_and_infer_cfg(args, train_mode=False)
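# cudnn.benchmark is disabled, presumably because demo images may vary in
# size; benchmark mode only pays off for fixed input shapes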
cudnn.benchmark = False
torch.cuda.empty_cache()

# setup logger
date_str = str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))
log_dir = os.path.join(args.save_dir, "log")
os.makedirs(log_dir, exist_ok=True)
save_log('log', log_dir, date_str, rank=0)

# get net
args.dataset_cls = cityscapes
net = get_net(args, criterion=None)
net = torch.nn.DataParallel(net).cuda()
logging.info('Net built.')
net, _ = restore_snapshot(net,
                          optimizer=None,
                          snapshot=args.snapshot,
                          restore_optimizer_bool=False)
net.eval()
logging.info('Net restored.')

# get data
data_dir = args.demo_folder
images = os.listdir(data_dir)
if not images:
    logging.info('There are no images in directory %s. Check the data path.' %
                 data_dir)