Example #1
import os

import torch
import torch.nn as nn
import torch.optim as optim

# VGG16_FCN32s, VGG16_FCN8s, UNet, _run_train and _run_eval are
# defined elsewhere in this project.


def train(args, train_dataloader, valid_dataloader):

    if str(args.model).lower() == 'fcn32s':
        model = VGG16_FCN32s(n_classes=7)
    elif str(args.model).lower() == 'fcn8s':
        model = VGG16_FCN8s(n_classes=7)
    else:
        model = UNet(n_channels=3, n_classes=7)
    # model = nn.DataParallel(model, device_ids=[0, 1])  # optional multi-GPU
    model.to(args.device)

    # class-weighted cross-entropy (7 classes; weights counter class imbalance)
    weight = torch.tensor([0.79, 0.14, 1.0, 0.73, 2.74, 1.04, 1.0])
    criterion = nn.CrossEntropyLoss(weight).to(args.device)

    # optim
    optimizer = optim.SGD(model.parameters(),
                          lr=1e-3,
                          momentum=0.9,
                          weight_decay=5e-4)
    if str(args.model).lower() == 'fcn32s':
        milestones = [1, 10, 20, 50]
    elif str(args.model).lower() == 'fcn8s':
        milestones = [1, 10, 20, 60]
    else:
        milestones = [25, 50, 80]
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=0.2)  # decay LR at each milestone

    os.makedirs("./result", exist_ok=True)  # checkpoint directory used below

    best_iou = 0
    for epoch in range(args.epochs):
        print(f"\tEpoch {epoch}")

        loss, acc, iou = _run_train(args, train_dataloader, model, criterion,
                                    optimizer)
        print("\t train loss:{:.5f}, acc:{:.3f}, iou:{:.2f}".format(
            loss, acc, iou))

        loss, acc, iou = _run_eval(args, valid_dataloader, model, criterion)
        print("\t valid loss:{:.5f}, acc:{:.3f}, iou:{:.2f}".format(
            loss, acc, iou))

        train_scheduler.step()  # advance the LR schedule once per epoch

        if epoch in milestones:
            torch.save(model.state_dict(),
                       f"./result/{epoch}_{args.model}.pth")
            print('\t [Info] save weights')
        if epoch > milestones[1] and iou > best_iou:
            best_iou = iou
            torch.save(model.state_dict(), f"./result/best_{args.model}.pth")
            print('\t [Info] save weights')
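
The milestone lists above feed PyTorch's MultiStepLR, which multiplies the learning rate by gamma each time the scheduler's epoch counter passes a milestone. A minimal, self-contained sketch (dummy parameter, illustrative milestones) showing the decay:

import torch
import torch.optim as optim

# The LR is multiplied by gamma=0.2 as the epoch counter passes each
# milestone: 1e-3 -> 2e-4 -> 4e-5 -> 8e-6.
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.SGD(params, lr=1e-3, momentum=0.9)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                           milestones=[1, 10, 20], gamma=0.2)
for epoch in range(21):
    optimizer.step()    # a real training epoch would go here
    scheduler.step()    # advance the schedule once per epoch
    print(epoch, optimizer.param_groups[0]['lr'])
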
Example #2
import os

import tensorboardX as tbx
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# MHP (the dataset), UNet and train_epoch are defined elsewhere in this project.


def main():

    train_dataset = MHP('/root/dataset/LV-MHP-v2/train', n_classes=59)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=12,
                              shuffle=True,
                              num_workers=0)
    model = UNet(n_channels=3, n_classes=59).cuda()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    writer = tbx.SummaryWriter(log_dir="logs")

    os.makedirs('checkpoints', exist_ok=True)  # checkpoint directory used below

    n_epochs = 10000
    for epoch in range(n_epochs):

        train_epoch(train_loader, model, criterion, optimizer, epoch, writer)

        state = {'state_dict': model.state_dict()}
        filename = 'checkpoints/{0:05d}.pth.tar'.format(epoch)
        torch.save(state, filename)
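
Each epoch above writes a checkpoint dict with a single 'state_dict' key. Restoring one later is the reverse operation; a brief sketch (the file name is illustrative):

import torch

checkpoint = torch.load('checkpoints/00042.pth.tar', map_location='cpu')
model = UNet(n_channels=3, n_classes=59)  # same architecture as at save time
model.load_state_dict(checkpoint['state_dict'])
model.eval()  # switch to inference mode
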
Example #3
    writer = SummaryWriter(args.tensorboard)


    # --------------------------- using pre-trained params ---------------------------------- #

    # (1) get param from pre-trained model
    # from unet_3up_ab_toge.unet.unet_model import UNet as UNet_old
    # from unet_3up_ab.unet_model import UNet as UNet_old
    from step2_add_bd_branch.unet.unet_model import UNet as UNet_old
    net_old = UNet_old(n_channels=3, n_classes=1)
    net_old.load_state_dict(torch.load('../step2_add_bd_branch/step2_checkpoints/CP196.pth'))
    net_old_dict = net_old.state_dict()

    # (2) our new model
    net = UNet(n_channels=3, n_classes=1)
    net_dict = net.state_dict()

    # (3) apply pre-trained params to the new model
    net_old_dict = {k: v for k, v in net_old_dict.items() if k in net_dict}
    net_dict.update(net_old_dict)  # overwrite matching entries with pre-trained values
    net.load_state_dict(net_dict)  # load the merged state dict

    if have_gpu and args.gpu:  # have_gpu is defined earlier in the original script
        print('Using GPU!')
        net = net.cuda()

    try:
        train_net(image_dir=args.imagedir,
                  label_dir=args.gt,
                  boundary_dir=args.bd,
                  checkpoint_dir=args.checkpoint,
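
Steps (1)-(3) above are the usual partial-transfer pattern: load the old checkpoint, keep only entries whose names match the new model, and merge them into the new state dict. A self-contained sketch of the same idea with an extra shape check (the two toy models are stand-ins, not the project's networks):

import torch.nn as nn

old = nn.Sequential(nn.Conv2d(3, 8, 3), nn.Conv2d(8, 8, 3))
new = nn.Sequential(nn.Conv2d(3, 8, 3), nn.Conv2d(8, 16, 3))  # layer 1 resized

old_dict, new_dict = old.state_dict(), new.state_dict()
# Keep params that exist in the new model AND match in shape; the shape
# check avoids load errors when a layer's size changed.
compatible = {k: v for k, v in old_dict.items()
              if k in new_dict and v.shape == new_dict[k].shape}
new_dict.update(compatible)
new.load_state_dict(new_dict)
print(sorted(compatible))  # ['0.bias', '0.weight']
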
Example #4
import logging
import os
import sys
from argparse import ArgumentParser
from datetime import datetime

import torch
import torch.backends.cudnn as cudnn
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import (EarlyStopping, LearningRateMonitor,
                                         ModelCheckpoint)
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.plugins import DDPPlugin

# Imports assume PyTorch Lightning 1.x, matching the Trainer API used below.
# UNet here is a LightningModule defined elsewhere in this project.


def main():
    """
    Main training loop.
    """
    parser = ArgumentParser()

    parser = UNet.add_model_specific_args(parser)
    parser = Trainer.add_argparse_args(parser)

    args = parser.parse_args()

    prod = bool(os.getenv("PROD"))  # any non-empty value enables prod mode
    logging.getLogger(__name__).setLevel(logging.INFO)

    if prod:
        logging.info(
            "Training in production mode, disabling all debugging APIs")
        torch.autograd.set_detect_anomaly(False)
        # Note: profile() and emit_nvtx() are context managers; constructing
        # them without a `with` block (as here) has no effect by itself.
        torch.autograd.profiler.profile(enabled=False)
        torch.autograd.profiler.emit_nvtx(enabled=False)
    else:
        logging.info("Training in development mode, debugging APIs active.")
        torch.autograd.set_detect_anomaly(True)
        torch.autograd.profiler.profile(enabled=True,
                                        use_cuda=True,
                                        record_shapes=True,
                                        profile_memory=True)
        torch.autograd.profiler.emit_nvtx(enabled=True, record_shapes=True)

    model = UNet(**vars(args))

    logging.info(
        f"Network:\n"
        f"\t{model.hparams.n_channels} input channels\n"
        f"\t{model.hparams.n_classes} output channels (classes)\n"
        f'\t{"Bilinear" if model.hparams.bilinear else "Transposed conv"} upscaling'
    )

    cudnn.benchmark = True  # autotune cuDNN kernels for fixed input sizes
    cudnn.enabled = True  # enable cuDNN

    early_stop_callback = EarlyStopping(
        monitor="val_loss",
        min_delta=0.00,
        mode="min",
        patience=10 if not os.getenv("EARLY_STOP") else int(
            os.getenv("EARLY_STOP")),
        verbose=True,
    )

    lr_monitor = LearningRateMonitor()

    run_name = "{}_LR{}_BS{}_IS{}".format(
        datetime.now().strftime("%d-%m-%Y-%H-%M-%S"),
        args.lr,
        args.batch_size,
        args.image_size,
    ).replace(".", "_")

    log_folder = ("./logs" if not os.getenv("DIR_ROOT_DIR") else
                  os.getenv("DIR_ROOT_DIR"))
    if not os.path.isdir(log_folder):
        os.mkdir(log_folder)
    logger = TensorBoardLogger(log_folder, name=run_name)

    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath='./checkpoints',
        filename='unet-{epoch:02d}-{val_loss:.2f}',
        save_top_k=3,
        mode='min',
    )

    try:
        trainer = Trainer.from_argparse_args(
            args,
            gpus=-1,
            accelerator="ddp",
            plugins=DDPPlugin(find_unused_parameters=False),
            precision=16,
            auto_lr_find="learning_rate"
            if float(os.getenv("LRN_RATE") or 0.0) == 0.0 else False,
            logger=logger,
            callbacks=[early_stop_callback, lr_monitor, checkpoint_callback],
            accumulate_grad_batches=1 if not os.getenv("ACC_GRAD") else int(
                os.getenv("ACC_GRAD")),
            gradient_clip_val=0.0 if not os.getenv("GRAD_CLIP") else float(
                os.getenv("GRAD_CLIP")),
            max_epochs=100 if not os.getenv("EPOCHS") else int(
                os.getenv("EPOCHS")),
            val_check_interval=0.1 if not os.getenv("VAL_INT_PER") else float(
                os.getenv("VAL_INT_PER")),
            default_root_dir=os.getcwd()
            if not os.getenv("DIR_ROOT_DIR") else os.getenv("DIR_ROOT_DIR"),
            fast_dev_run=os.getenv("FAST_DEV_RUN") == "True",
        )
        if float(os.getenv("LRN_RATE") or 0.0) == 0.0:
            trainer.tune(model)
        trainer.fit(model)
        trainer.test(model)
    except KeyboardInterrupt:
        torch.save(model.state_dict(), "INTERRUPTED.pth")
        logging.info("Saved interrupt")
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
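
The Trainer call above repeats the same default-if-unset-else-cast dance for every environment variable, and float(os.getenv(...)) raises if the variable is missing. A small helper (hypothetical, not part of the original script) factors that pattern out:

import os

def env_or(name, default, cast=str):
    """Return cast of the env var if set and non-empty, else default."""
    value = os.getenv(name)
    return default if value is None or value == "" else cast(value)

# Usage mirroring the Trainer arguments above:
max_epochs = env_or("EPOCHS", 100, int)
grad_clip = env_or("GRAD_CLIP", 0.0, float)
acc_grad = env_or("ACC_GRAD", 1, int)
learning_rate = env_or("LRN_RATE", 0.0, float)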