Example #1
0
def train(args, dataloader, model):
    """Train `model` on `dataloader` with Adam and a multi-step LR schedule.

    Args:
        args: namespace providing at least `lr` (learning rate).
        dataloader: yields dicts with 'features' and 'adj' tensors.
        model: module whose forward(features, adj) returns the scalar loss.
    """
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # NOTE(review): `gamma` is the LR decay factor; passing args.lr here looks
    # like a copy-paste mistake (decay is usually e.g. 0.1) -- confirm intent.
    scheduler = MultiStepLR(optimizer, milestones=LR_milestones, gamma=args.lr)

    model.train()
    for epoch in range(5000):
        for batch_idx, data in enumerate(dataloader):
            model.zero_grad()
            # Variable() wrappers dropped: they are no-ops since PyTorch 0.4.
            features = data['features'].float().cuda()
            adj_input = data['adj'].float().cuda()

            loss = model(features, adj_input)
            # .item() prints the scalar value instead of the tensor repr
            print('Epoch: ', epoch, ', Iter: ', batch_idx, ', Loss: ', loss.item())
            loss.backward()

            optimizer.step()
            scheduler.step()
            # Only the first batch of each epoch is used (original behaviour).
            break
Example #2
0
def _log_acc(log, title, values, with_mean=True):
    """Log a titled per-epoch accuracy list and, optionally, its mean."""
    log.info(title)
    log.info(values)
    if with_mean:
        log.info(np.mean(np.array(values)))


def main(log, args=None, arglist=None):
    """Train and evaluate the STN localization model on CUB-200.

    Args:
        log: logger used for all progress/metric output.
        args: pre-parsed namespace; when falsy, `arglist` is parsed instead.
        arglist: raw argument list forwarded to argparse when `args` is falsy.
    """
    global image_size
    help_text = """ Collect the required arguments """
    parser = argparse.ArgumentParser(description=help_text, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-e", "--num_epochs", type=int, help="number of training epochs", default=20)
    parser.add_argument("-bs", "--batch_size", type=int, help="batch size", default=16)
    # BUGFIX: help texts below previously said "batch size" (copy-paste).
    parser.add_argument("-ir", "--image_res", type=int, help="image resolution", default=320)
    parser.add_argument("--plot_every", type=int, help="plot frequency in iterations", default=50)

    if not args:
        args = parser.parse_args(arglist)

    # initialize the model
    model = Net().to(device)
    log.info("Model initialization completed")

    # set the optimizer: the STN branch is trained without weight decay
    optimizer = optim.SGD([{"params": model.stn.parameters(), "weight_decay": 0},
                           {"params": model.features.parameters()},
                           {"params": model.conv_last_10map.parameters()},
                           {"params": model.bn_last_10map.parameters()}
                           ], lr=1e-3, weight_decay=1e-4, momentum=0.9)
    scheduler = MultiStepLR(optimizer, milestones=[20, 30], gamma=0.1)

    # load the image databases
    train_imdb = cub_200("train")
    test_imdb = cub_200("test")
    image_size = args.image_res
    # rescale the ground truth according to image resolution
    log.info("Image resolution: {}".format(args.image_res))
    scale_ground_truth_boxes(train_imdb, "train", args.image_res)
    scale_ground_truth_boxes(test_imdb, "test", args.image_res)

    # Augmentation (random flip) only for training; both use ImageNet stats.
    transform_cub_train = transforms.Compose([
        transforms.Resize((args.image_res, args.image_res)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

    transform_cub_test = transforms.Compose([
        transforms.Resize((args.image_res, args.image_res)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

    # Keep a filtered copy of the test set restricted to the selected scale.
    test_imdb1 = deepcopy(test_imdb)
    test_imdb1, indicator = filter_test_for_scale(test_imdb1)
    indicator = np.where(indicator == 1)[0]
    area_hist = get_area_hist(test_imdb)
    item, count = np.unique(area_hist, return_counts=True)
    log.info("Histogram of areas")
    log.info(item)
    log.info(count)
    log.info("Train size: {}".format(len(train_imdb._image_index)))
    log.info("Test size: {}".format(len(test_imdb._image_index)))
    log.info("Test size after picking the selected scale: {}".format(len(test_imdb1._image_index)))

    for epoch in range(args.num_epochs):
        # NOTE(review): stepping the scheduler before training keeps the
        # original (pre-1.1 PyTorch) call order -- confirm before reordering.
        scheduler.step()
        train_model(model, train_imdb, log, optimizer, epoch+1, args.batch_size, transform_cub_train)
        test_model(model, test_imdb1, log, epoch+1, args.batch_size, area_hist, transform_cub_test, indicator)
        test_model(model, test_imdb, log, epoch+1, args.batch_size, area_hist, transform_cub_test, indicator=None)

    # Final summary: classification accuracy lists plus localization means.
    _log_acc(log, "Training [CLASSIFICATION] accuracies", class_train_acc, with_mean=False)
    _log_acc(log, "\nTrain [LOC] accuracy with max selection", loc_train_acc_max)
    _log_acc(log, "\nTrain [LOC] accuracy with all boxes", loc_train_acc_all)
    _log_acc(log, "\nTrain [LOC] accuracy with max selection and NO TRANSFORM", loc_train_acc_max_NT)
    _log_acc(log, "\nTrain [LOC] accuracy with all boxes and NO TRANSFORM", loc_train_acc_all_NT)

    _log_acc(log, "Test [CLASSIFICATION] accuracies on [FULL SET]", class_test_acc1, with_mean=False)
    _log_acc(log, "\nTest [LOC] accuracy with max selection [FULL SET]", loc_test_acc1_max)
    _log_acc(log, "\nTest [LOC] accuracy with all boxes [FULL SET]", loc_test_acc1_all)
    _log_acc(log, "\nTest [LOC] accuracy with max selection [FULL SET] NO TRANSFORM", loc_test_acc1_max_NT)
    _log_acc(log, "\nTest [LOC] accuracy with all boxes [FULL SET] NO TRANSFORM", loc_test_acc1_all_NT)

    _log_acc(log, "Test [CLASSIFICATION] accuracies on [REDUCED SET]", class_test_acc2, with_mean=False)
    _log_acc(log, "\nTest [LOC] accuracy with max selection [REDUCED SET]", loc_test_acc2_max)
    _log_acc(log, "\nTest [LOC] accuracy with all boxes [REDUCED SET]", loc_test_acc2_all)
    _log_acc(log, "\nTest [LOC] accuracy with max selection [REDUCED SET] NO TRANSFORM", loc_test_acc2_max_NT)
    _log_acc(log, "\nTest [LOC] accuracy with all boxes [REDUCED SET] NO TRANSFORM", loc_test_acc2_all_NT)
Example #3
0
        milestones = [int(v.strip()) for v in milestones.split(",")]
        scheduler = MultiStepLR(optimizer,
                                milestones=milestones,
                                gamma=0.1,
                                last_epoch=last_epoch)
    elif scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer, t_max, last_epoch=last_epoch)
    else:
        logging.fatal(f"Unsupported Scheduler: {scheduler}.")
        # parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, num_epochs):
        scheduler.step()
        train(train_loader,
              net,
              criterion,
              optimizer,
              device=DEVICE,
              debug_steps=debug_steps,
              epoch=epoch)

        if epoch % validation_epochs == 0 or epoch == num_epochs - 1:
            val_loss, val_regression_loss, val_classification_loss = test(
                val_loader, net, criterion, DEVICE)
            logging.info(
                f"Epoch: {epoch}, " + f"Validation Loss: {val_loss:.4f}, " +
                f"Validation Regression Loss {val_regression_loss:.4f}, " +
                f"Validation Classification Loss: {val_classification_loss:.4f}"
Example #4
0
def experiment(args):
    """Train/evaluate a CIFAR model, optionally with a loaded mixup grid.

    Args:
        args: experiment namespace (seed, epochs, save_dir, load_checkpoint,
            do_print, do_classification, num_finetune_epochs, ...).

    Returns:
        (train_loss, accuracy, val_loss, val_acc, test_loss, test_acc)
    """
    if args.do_print:
        print(args)
    # Setup the random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    model, mixup_mat, train_loader, val_loader, test_loader = get_models(args)

    # Set up the logger and the save directories.
    # If we load a mixup grid, save in a different location; otherwise it's
    # a baseline run.
    if args.load_checkpoint:
        args.save_dir = os.path.join(args.save_dir, 'test_aug/')
        # args.save_loc is for model ckpts
        args.save_loc = os.path.join(args.save_dir, get_id(args),
                                     'test_aug_checkpoints/')
    else:
        args.save_dir = os.path.join(args.save_dir, 'test_no_aug/')
        # args.save_loc is for model ckpts
        args.save_loc = os.path.join(args.save_dir, get_id(args),
                                     'test_no_aug_checkpoints/')

    csv_logger, _ = load_logger(args)
    os.makedirs(args.save_loc, exist_ok=True)

    # Standard LR/schedule settings for CIFAR
    optimizer = optim.SGD(model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          nesterov=True,
                          weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer, milestones=[30, 60, 80],
                            gamma=0.1)  # [60, 120, 160]

    def train_loss_func(x, y):
        """Batch loss: mixup cross-entropy when a mixup grid was loaded,
        plain per-sample cross-entropy otherwise. Returns (loss, preds)."""
        x, y = x.cuda(), y.cuda()
        if args.load_checkpoint:
            mixed_x, y_a, y_b, lam = mixup_data(x, y, mixup_mat)
            pred = model(mixed_x)
            xentropy_loss = mixup_criterion(pred, y_a, y_b, lam)
        else:
            pred = model(x)
            xentropy_loss = F.cross_entropy(pred, y, reduction='none')

        final_loss = xentropy_loss.mean()
        return final_loss, pred

    def test(loader):
        """Evaluate `model` on `loader`; returns (avg_loss, accuracy)."""
        model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
        correct, total = 0., 0.
        losses = []
        for images, labels in loader:
            images, labels = images.cuda(), labels.cuda()

            with torch.no_grad():
                pred = model(images)
                # BUGFIX: the loss was computed and appended twice per batch.
                xentropy_loss = F.cross_entropy(pred, labels)
                losses.append(xentropy_loss.item())

            pred = torch.max(pred.data, 1)[1]
            total += labels.size(0)
            correct += (pred == labels).sum().item()

        avg_loss = float(np.mean(losses))
        acc = correct / total
        model.train()
        return avg_loss, acc

    init_time = time.time()
    val_loss, val_acc = test(val_loader)
    test_loss, test_acc = test(test_loader)
    if args.do_print:
        print(f"Initial Val Loss: {val_loss, val_acc}")
        print(f"Initial Test Loss: {test_loss, test_acc}")
    iteration = 0
    for epoch in range(0, args.epochs):
        xentropy_loss_avg = 0.
        total_val_loss, val_loss = 0., 0.
        correct = 0.
        total = 0.
        weight_norm, grad_norm = .0, .0

        if args.do_print:
            progress_bar = tqdm(train_loader)
        else:
            progress_bar = train_loader
        for i, (images, labels) in enumerate(progress_bar):
            if args.do_print:
                progress_bar.set_description('Epoch ' + str(epoch))

            images, labels = images.cuda(), labels.cuda()
            optimizer.zero_grad()
            xentropy_loss, pred = train_loss_func(images, labels)
            xentropy_loss.backward()
            optimizer.step()
            # BUGFIX: removed a redundant second optimizer.zero_grad() here;
            # gradients are cleared at the top of the iteration already.
            xentropy_loss_avg += xentropy_loss.item()

            iteration += 1

            # Calculate running average of accuracy
            if args.do_classification:
                pred = torch.max(pred.data, 1)[1]
                total += labels.size(0)
                correct += (pred == labels.data).sum().item()
                accuracy = correct / total
            else:
                total = 1
                accuracy = 0

            if args.do_print:
                progress_bar.set_postfix(
                    train='%.4f' % (xentropy_loss_avg / (i + 1)),
                    val='%.4f' % (total_val_loss / (i + 1)),
                    acc='%.4f' % accuracy,
                    weight='%.10f' % weight_norm,
                    update='%.10f' % grad_norm)
            if i % 100 == 0:
                val_loss, val_acc = test(val_loader)
                test_loss, test_acc = test(test_loader)
                csv_logger.writerow({
                    'epoch': str(epoch),
                    'train_loss': str(xentropy_loss_avg / (i + 1)),
                    'train_acc': str(accuracy),
                    'val_loss': str(val_loss),
                    'val_acc': str(val_acc),
                    'test_loss': str(test_loss),
                    'test_acc': str(test_acc),
                    'run_time': time.time() - init_time,
                    'iteration': iteration
                })
        # NOTE(review): scheduler.step(epoch) keeps the original explicit-epoch
        # call; a plain step() changes LR timing on newer PyTorch -- confirm.
        scheduler.step(epoch)
        train_loss = xentropy_loss_avg / (i + 1)

        only_print_final_vals = not args.do_print
        if not only_print_final_vals:
            val_loss, val_acc = test(val_loader)
            test_loss, test_acc = test(test_loader)
            tqdm.write(
                'val loss: {:6.4f} | val acc: {:6.4f} | test loss: {:6.4f} | test_acc: {:6.4f}'
                .format(val_loss, val_acc, test_loss, test_acc))

            csv_logger.writerow({
                'epoch': str(epoch),
                'train_loss': str(train_loss),
                'train_acc': str(accuracy),
                'val_loss': str(val_loss),
                'val_acc': str(val_acc),
                'test_loss': str(test_loss),
                'test_acc': str(test_acc),
                'run_time': time.time() - init_time,
                'iteration': iteration
            })
        else:
            if args.do_print:
                # BUGFIX: test() accepts only a loader; the stray
                # do_test_augment kwarg raised TypeError.
                val_loss, val_acc = test(val_loader)
                tqdm.write('val loss: {:6.4f} | val acc: {:6.4f}'.format(
                    val_loss, val_acc))
    val_loss, val_acc = test(val_loader)
    test_loss, test_acc = test(test_loader)
    saver(args.num_finetune_epochs, model, optimizer, args.save_loc)
    return train_loss, accuracy, val_loss, val_acc, test_loss, test_acc
Example #5
0
class Solver(object):
    """Train/evaluate wrapper around AtecModel with TensorBoard logging."""

    def __init__(self, config, reuse=False):
        """Build the model; load saved weights when `reuse`, else load data."""
        self.config = config
        self.reuse = reuse

        self.build_model()
        if reuse:
            # remember to manually load_data by specifying modes=[...]
            self.load_model(self.config.model_path)
        else:
            self.load_data()

    def load_data(self, modes=("train", "valid", "test")):
        """Create the dataloaders listed in `modes`; the others stay None.

        BUGFIX: default changed from a mutable list to a tuple (the list
        default was shared across calls); membership tests are unchanged.
        """
        self.train_dataloader = get_dataloader(
            self.config, mode="train") if "train" in modes else None
        self.valid_dataloader = get_dataloader(
            self.config, mode="valid") if "valid" in modes else None
        self.test_dataloader = get_dataloader(
            self.config, mode="test") if "test" in modes else None

    def build_model(self):
        """Instantiate model, loss, optimizer, LR scheduler and writer."""
        # can add preprocessing logic layer here
        self.model = AtecModel(self.config)

        # training stuff: only encoder + comparator parameters are trained
        self.criterion = nn.CrossEntropyLoss()
        self.trainable_params = list(self.model.encoder.parameters()) + list(
            self.model.comparator.parameters())
        self.optimizer = Adam(self.trainable_params, lr=self.config.lr)
        self.scheduler = MultiStepLR(self.optimizer,
                                     milestones=[10, 20, 30],
                                     gamma=0.1)

        # bookkeeping stuff
        self.writer = SummaryWriter(self.config.log_dir)

    def load_model(self, model_path):
        """Load model weights from `model_path`."""
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, model, model_path):
        """Save only `model`'s parameters (not the module) to `model_path`."""
        torch.save(model.state_dict(), model_path)

    def train(self):
        """Run the full training loop, checkpointing and logging per epoch."""
        for epoch in range(self.config.num_epoch):
            # NOTE(review): stepping before training keeps the original
            # (pre-1.1 PyTorch) scheduler call order.
            self.scheduler.step()
            self.train_step(epoch)

            # save the model per epoch, only save parameters
            if (epoch + 1) % self.config.save_step == 0:
                model_path = os.path.join(self.config.model_dir,
                                          'model-%d.pkl' % (epoch + 1))
                self.save_model(self.model, model_path)

            # log model performance over epochs
            # BUGFIX: evaluate() returns a plain Python float (sum/len), so
            # the previous `valid_acc.data[0]` raised AttributeError; use the
            # floats directly. Also dropped stray double parens on the call.
            valid_acc = self.evaluate(self.valid_dataloader)
            test_acc = self.evaluate(self.test_dataloader)
            self.writer.add_scalars('data/accuracy', {
                'valid': valid_acc,
                'test': test_acc
            }, epoch)

            print('Epoch [%d/%d], valid acc: %.4f, test acc: %.4f' %
                  (epoch + 1, self.config.num_epoch, valid_acc, test_acc))

        self.close_log(self.writer)

    def train_step(self, epoch):
        """Run one epoch of optimization over the training dataloader."""
        total_steps = len(
            self.train_dataloader.dataset) // self.config.batch_size + 1
        for i, (data, labels, indices,
                lengths) in enumerate(self.train_dataloader):
            logits = self.model(data, indices)
            preds = torch.argmax(logits, dim=1).long()
            loss = self.criterion(logits, labels)
            acc = self.metric(preds, labels)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # log loss, could visualize in tensorboard if needed
            # NOTE(review): `.data[0]` is pre-0.4 PyTorch tensor API
            # (`.item()` on modern versions) -- confirm the torch version.
            if (i + 1) % self.config.log_step == 0:
                self.writer.add_scalar('data/loss', loss.data[0],
                                       epoch * total_steps + i)
                self.writer.add_scalar('data/train_acc', acc.data[0],
                                       epoch * total_steps + i)
                print('Epoch [%d/%d], Step[%d/%d], loss: %.4f, acc: %.4f' %
                      (epoch + 1, self.config.num_epoch, i + 1, total_steps,
                       loss.data[0], acc.data[0]))

    def inference(self, data, indices):
        """Return hard class predictions for one batch."""
        logits = self.model(data, indices)
        preds = torch.argmax(logits, dim=1).long()
        return preds

    def evaluate(self, dataloader):
        """Mean per-batch accuracy over `dataloader` (a Python float)."""
        accs = []
        for i, (data, labels, indices, lengths) in enumerate(dataloader):
            preds = self.inference(data, indices)
            acc = self.metric(preds, labels)
            accs.append(acc.data[0])
        return sum(accs) / float(len(accs))

    def metric(self, preds, labels):
        """Element-wise accuracy of `preds` vs `labels` as a 0-dim tensor.

        BUGFIX: removed the dead tp/fp/tn/fn precision/recall/F1 block that
        was computed from hard-coded 1s and never used or returned.
        """
        res = torch.eq(preds, labels)
        acc = torch.sum(res).double() / torch.tensor(res.shape[0]).double()
        return acc

    def close_log(self, writer, log_path="./all_scalars.json"):
        """Flush scalars to JSON and close the TensorBoard writer."""
        # export scalar data to JSON for external processing
        writer.export_scalars_to_json(log_path)
        writer.close()
Example #6
0
def main():
    """Entry point: parse args, build the experiment, run the training loop."""

    parser = ArgumentParser()
    parser.add_argument('--tag',
                        type=str,
                        default='run',
                        help='optional tag to identify the run')
    parser.add_argument('--dataset',
                        choices=['nuscenes', 'argoverse'],
                        default='nuscenes',
                        help='dataset to train on')
    parser.add_argument('--model',
                        choices=['pyramid', 'vpn', 'ved'],
                        default='pyramid',
                        help='model to train')
    parser.add_argument('--experiment',
                        default='test',
                        help='name of experiment config to load')
    parser.add_argument('--resume',
                        default=None,
                        help='path to an experiment to resume')
    parser.add_argument(
        '--options',
        nargs='*',
        default=[],
        help='list of addition config options as key-val pairs')
    args = parser.parse_args()

    # Configuration, experiment directory and tensorboard summary
    config = get_configuration(args)
    logdir = create_experiment(config, args.tag, args.resume)
    summary = SummaryWriter(logdir)

    # Pin the default CUDA device when GPUs are configured
    if len(config.gpus) > 0:
        torch.cuda.set_device(config.gpus[0])

    # Model, loss and data
    model = build_model(config.model, config)
    criterion = build_criterion(config.model, config)
    train_loader, val_loader = build_dataloaders(config.train_dataset, config)

    # Optimiser and learning rate schedule
    optimiser = SGD(model.parameters(),
                    config.learning_rate,
                    weight_decay=config.weight_decay)
    lr_scheduler = MultiStepLR(optimiser, config.lr_milestones, 0.1)

    # Resume from the latest checkpoint if requested
    if args.resume:
        start_epoch, best_iou = load_checkpoint(
            os.path.join(logdir, 'latest.pth'), model, optimiser, lr_scheduler)
    else:
        start_epoch, best_iou = 1, 0

    # Main training loop
    for epoch in range(start_epoch, config.num_epochs + 1):

        print('\n\n=== Beginning epoch {} of {} ==='.format(
            epoch, config.num_epochs))

        # One training pass followed by validation
        train(train_loader, model, criterion, optimiser, summary, config,
              epoch)
        val_iou = evaluate(val_loader, model, criterion, summary, config,
                           epoch)

        # Update learning rate
        lr_scheduler.step()

        # Keep both the best-so-far and the latest checkpoint
        if val_iou > best_iou:
            best_iou = val_iou
            save_checkpoint(os.path.join(logdir, 'best.pth'), model, optimiser,
                            lr_scheduler, epoch, best_iou)

        save_checkpoint(os.path.join(logdir, 'latest.pth'), model, optimiser,
                        lr_scheduler, epoch, best_iou)

    print("\nTraining complete!")
Example #7
0
def main():
    """Set up data, model and optimizer, then run the super-resolution training loop."""

    if opt.show:
        # TensorBoard logging lives under logs/
        if not os.path.exists("logs/"):
            os.makedirs("logs/")

        global writer
        writer = SummaryWriter(log_dir='logs')

    if opt.cuda:
        print("=> Use GPU ID: '{}'".format(opt.gpus))
        os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpus
        if not torch.cuda.is_available():
            raise Exception(
                "No GPU found or Wrong gpu id, please run without --cuda")

    torch.manual_seed(opt.seed)
    if opt.cuda:
        torch.cuda.manual_seed(opt.seed)
    cudnn.benchmark = True

    # Loading datasets
    train_root = ('/media/hdisk/liqiang/hyperSR/train/' + opt.datasetName +
                  '/' + str(opt.upscale_factor) + '/')
    train_set = TrainsetFromFolder(train_root)
    train_loader = DataLoader(dataset=train_set, num_workers=opt.threads,
                              batch_size=opt.batchSize, shuffle=True)
    val_root = ('/media/hdisk/liqiang/hyperSR/test/' + opt.datasetName + '/' +
                str(opt.upscale_factor))
    val_set = ValsetFromFolder(val_root)
    val_loader = DataLoader(dataset=val_set, num_workers=opt.threads,
                            batch_size=1, shuffle=False)

    # Buliding model
    model = MCNet(opt)
    criterion = nn.L1Loss()

    if opt.cuda:
        model = nn.DataParallel(model).cuda()
        criterion = criterion.cuda()
    else:
        model = model.cpu()
    print('# parameters:', sum(param.numel() for param in model.parameters()))

    # Setting Optimizer
    optimizer = optim.Adam(model.parameters(), lr=opt.lr, betas=(0.9, 0.999),
                           eps=1e-08)

    # optionally resuming from a checkpoint
    if opt.resume:
        if not os.path.isfile(opt.resume):
            print("=> no checkpoint found at '{}'".format(opt.resume))
        else:
            print("=> loading checkpoint '{}'".format(opt.resume))
            state = torch.load(opt.resume)
            opt.start_epoch = state['epoch'] + 1
            model.load_state_dict(state['model'])
            optimizer.load_state_dict(state['optimizer'])

    # Setting learning rate: halve at each milestone
    scheduler = MultiStepLR(optimizer,
                            milestones=[35, 70, 105, 140, 175],
                            gamma=0.5,
                            last_epoch=-1)

    # Training
    for epoch in range(opt.start_epoch, opt.nEpochs + 1):
        scheduler.step()
        print("Epoch = {}, lr = {}".format(epoch,
                                           optimizer.param_groups[0]["lr"]))
        train(train_loader, optimizer, model, criterion, epoch)
        val(val_loader, model, epoch)
        save_checkpoint(epoch, model, optimizer)
Example #8
0
def train_(train_set, test_set, lr, depth, mixup_enbale, alpha, model_checkpoint, epochs):
    """Train Net(depth) on `train_set`, evaluating on `test_set` each epoch.

    Optionally applies mixup augmentation; the best final accuracy triggers a
    checkpoint save. Returns (per-epoch train accuracies, per-epoch test
    accuracies).
    """
    torch.manual_seed(1)
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=128, shuffle=False, pin_memory=True, num_workers=2)
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=100, shuffle=False, pin_memory=True, num_workers=2)

    network = Net(depth).to(device)
    optimizer = optim.SGD(network.parameters(), lr=lr, momentum=0.9,
                          nesterov=True, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    scheduler = MultiStepLR(optimizer, milestones=[30, 60, 80], gamma=0.2)

    acc_train = []
    acc_test = []
    acc = 0
    best_acc = 0
    for epoch in range(epochs):
        total_loss = 0
        total_correct = 0
        network.train()
        count_in = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            if mixup_enbale:
                # Mixup path: blend inputs and weight correctness by lambda.
                images, targets_a, targets_b, lam = mixup_data(images, labels, alpha)
                images, targets_a, targets_b = map(
                    Variable, (images, targets_a, targets_b))
                outputs = network(images)
                loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)

                _, predicted = torch.max(outputs.data, 1)
                batch_correct = (
                    lam * predicted.eq(targets_a.data).cpu().sum().float()
                    + (1 - lam) * predicted.eq(targets_b.data).cpu().sum().float())
                total_correct += batch_correct
            else:
                # Plain cross-entropy path.
                outputs = network(images)
                batch_correct = get_num_correct(outputs, labels)
                loss = criterion(outputs, labels)
                total_correct += batch_correct

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("epoch: ", epoch, "total_correct: ", total_correct.item())
        print("training accuracy: ", total_correct.item() / len(train_set))
        acc_train.append(deepcopy(float(total_correct) / len(train_set)))

        with torch.no_grad():
            correct_test = 0
            for images_test, labels_test in test_loader:
                images_test = images_test.to(device)
                labels_test = labels_test.to(device)
                preds_test = network(images_test)
                correct_test += get_num_correct(preds_test, labels_test)

            print("testing accuracy: ", correct_test / len(test_set))
            if epoch == epochs - 1:
                print(correct_test / len(test_set))
                acc = correct_test / len(test_set)
            acc_test.append(deepcopy(float(correct_test) / len(test_set)))

        scheduler.step()
        if best_acc < acc:
            best_acc = acc
            torch.save(network.state_dict(), model_checkpoint)

    return (acc_train, acc_test)
Example #9
0
def train_panet(device, resume=False, dataset_name='voc'):
    """Train the PANet few-shot segmentation model.

    Args:
        device: torch device for the model and batches.
        resume: when True, restore model/optimizer/scheduler state.
        dataset_name: 'voc' or 'ircadb'.

    Returns:
        The trained model.

    Raises:
        ValueError: for an unsupported `dataset_name` (previously this fell
            through to an UnboundLocalError much later).
    """
    if dataset_name not in ('voc', 'ircadb'):
        raise ValueError('Unsupported dataset: {}'.format(dataset_name))

    pre_trained_encoder_path = '../data/vgg16-397923af.pth' if cfg['panet'][
        'use_pretrained'] else None
    model = PANetFewShotSeg(in_channels=cfg[dataset_name]['channels'],
                            pretrained_path=pre_trained_encoder_path,
                            cfg={
                                'align': True
                            },
                            encoder_type=cfg['panet']['backbone']).to(device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=cfg['panet']['lr'],
                                momentum=cfg['panet']['momentum'],
                                weight_decay=cfg['panet']['weight_decay'])
    scheduler = MultiStepLR(optimizer,
                            milestones=cfg['panet']['lr_milestones'],
                            gamma=0.1)
    epoch = 0
    model.train()

    if resume:
        epoch = load_state(cfg[dataset_name]['model_name'], model, optimizer,
                           scheduler)

    # Input resize and episode sampling depend on the chosen dataset.
    if dataset_name == 'voc':
        transforms = Compose([
            Resize(size=cfg['panet']['vgg_inp_size']),
        ])
        train_dataset = get_pascal_few_shot_datasets(
            range(1, 16), cfg['panet']['train_iterations'], cfg['nshot'],
            cfg['nquery'], transforms)
    else:  # 'ircadb' (validated above)
        transforms = Compose([
            Resize(size=cfg['panet']['unet_inp_size']),
        ])
        train_dataset = get_ircadb_few_shot_datasets(
            organs=[
                "bone", "spleen", "leftkidney", "rightkidney", "leftlung",
                "rightlung", "gallbladder"
            ],
            patient_ids=range(1, 16),
            iterations=cfg['panet']['train_iterations'],
            N_shot=cfg['nshot'],
            N_query=cfg['nquery'],
            transforms=transforms)

    trainloader = DataLoader(train_dataset,
                             batch_size=1,
                             shuffle=True,
                             num_workers=1,
                             pin_memory=True,
                             drop_last=True)

    criterion = nn.CrossEntropyLoss(ignore_index=255)

    log_loss = {'loss': 0, 'align_loss': 0}
    for i_iter, (support, query) in enumerate(tqdm(trainloader)):

        # Regroup the support batch into nested per-way lists on `device`.
        support_images = [[]]
        support_fg_mask = [[]]
        support_bg_mask = [[]]
        for i in range(len(support)):
            support_images[0].append(support[i][0].to(device))
            support_fg_mask[0].append(support[i][1].to(device))
            support_bg_mask[0].append(support[i][2].to(device))

        query_images = []
        query_labels = []
        for i in range(len(query)):
            query_images.append(query[i][0].to(device))
            query_labels.append(query[i][1].to(device))

        query_labels = torch.cat(query_labels, dim=0).long().to(device)

        # Forward and Backward
        optimizer.zero_grad()
        query_pred, align_loss = model(support_images, support_fg_mask,
                                       support_bg_mask, query_images)
        query_loss = criterion(query_pred, query_labels)
        loss = query_loss + align_loss * cfg['panet']['align_loss_scalar']
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Log loss
        query_loss = query_loss.detach().data.cpu().numpy()
        align_loss = align_loss.detach().data.cpu().numpy(
        ) if align_loss != 0 else 0
        log_loss['loss'] += query_loss
        log_loss['align_loss'] += align_loss

        # Print running means and snapshot the model (the two identical
        # save-period checks were merged into one).
        if (i_iter + 1) % cfg['panet']['save_period'] == 0:
            loss = log_loss['loss'] / (i_iter + 1)
            align_loss = log_loss['align_loss'] / (i_iter + 1)
            print('\nstep {}: loss: {}, align_loss: {}'.format(
                i_iter + 1, loss, align_loss))
            save_state(cfg[dataset_name]['model_name'], model, optimizer,
                       scheduler, epoch + i_iter + 1)
            print("\nModel Saved On Iteration {} ...".format(epoch + i_iter +
                                                             1))

    return model
class Trainer(TorchTrainer):
    """Trainer for a bounding-box motion-prediction model.

    Reads the model, config, data loader, epoch counter and validation flag
    from the module-level ``context`` object (defined elsewhere in this
    file).  Supports two regression targets: raw coordinate offsets, or
    offsets in a ``BoxCoder`` encoding space (``cfg.use_box_coding``).
    """

    def __init__(self, config, validate_tracktor):
        """Set up loss, optimizer, LR scheduler and box-coding options.

        Args:
            config: passed through to ``TorchTrainer``.
            validate_tracktor: callable ``(model, epoch) -> dict of metrics``
                used for periodic tracker validation in ``epoch()``.

        Raises:
            ValueError: if ``context.cfg.scheduler_type`` is unknown.
        """
        super().__init__(config)
        self.validate_tracktor = validate_tracktor
        # per-element MSE; the mean is taken in self.criterion
        self.mse = nn.MSELoss(reduction='none')
        self.optim = torch.optim.Adam(context.model.parameters(),
                                      lr=context.cfg.lr,
                                      weight_decay=context.cfg.weight_decay)
        if context.cfg.scheduler_type == 'plateau':
            self.sched = ReduceLROnPlateau(self.optim,
                                           verbose=True,
                                           **context.cfg.scheduler_args)
        elif context.cfg.scheduler_type == 'multistep':
            self.sched = MultiStepLR(self.optim, **context.cfg.scheduler_args)
        else:
            raise ValueError(
                f'Unknown scheduler: {context.cfg.scheduler_type}')

        if context.cfg.use_box_coding:
            self.use_box_coding = True
            self.predict_coded_a = context.cfg.predict_coded_a
            self.loss_coded = context.cfg.loss_coded
            self.box_coder = BoxCoder(context.cfg.box_coding_weights)
        else:
            self.use_box_coding = False
            self.loss_coded = False

    def criterion(self, input, target):
        """Return the mean MSE between box tensors flattened to (N, 4)."""
        input = input.view(-1, 4)
        target = target.view(-1, 4)
        # only 'mse' is implemented; fail loudly on other configs
        assert context.cfg.loss == 'mse'
        return self.mse(input, target).mean()

    def criterion_coded(self, prediction, x, target, last_coded):
        """Loss in box-coder space.

        Compares ``prediction`` against the target offset (encoded relative
        to the last observed box of ``x``) minus the last observed encoded
        offset ``last_coded``.
        """
        target_coded = self.box_coder.encode(list(target[:, :, :4]),
                                             list(x[:, [-1], :4]))
        last_coded = last_coded[:, [-1], :4]
        loss = self.criterion(prediction[:, :, :4],
                              torch.stack(target_coded) - last_coded)
        return loss

    def epoch(self):
        """Run one training or validation epoch over ``context.data_loader``.

        Returns:
            dict of epoch metrics: loss / IoU / mIoU, the same metrics for a
            constant-velocity baseline (``*_cva``), per-episode-length losses
            (``loss_1`` .. ``loss_6``), and — every
            ``cfg.tracktor_val_every`` validation epochs — tracker metrics
            from ``self.validate_tracktor``.
        """
        loss_epoch = []
        loss_lengths = [[], [], [], [], [],
                        []]  # losses for different episode lengths
        iou_epoch = []
        miou_epoch = []

        # whole-epoch accumulators, kept on CPU to free GPU memory
        all_input = []
        all_out = []
        all_diffs = []
        all_gt = []
        all_pred_pos = []
        all_prev_pos = []

        for boxes_in, boxes_target, boxes_resized, image_features, image_sizes, lengths, levels in tqdm(
                context.data_loader):
            # move tensors to GPU
            boxes_in = boxes_in.cuda()
            boxes_target = boxes_target.cuda()
            boxes_resized = boxes_resized.cuda()
            # in case we're working with float16 features, only convert to float32 once they're on the gpu
            if isinstance(image_features, list):
                image_features = [
                    feat.cuda().float() for feat in image_features
                ]
            else:
                image_features = image_features.cuda().float()

            # diffs: per-step box offsets in cols 0-3 and a validity flag in
            # col 5 (col 4 stays zero — presumably reserved; confirm)
            diffs = torch.zeros(boxes_in.shape[0],
                                context.cfg.model_args['input_length'],
                                6).cuda()
            if self.use_box_coding:
                encoded = self.box_coder.encode(list(boxes_in[:, 1:, :4]),
                                                list(boxes_in[:, :-1, :4]))
                diffs[:, :, :4] = torch.stack(encoded, dim=0)
            else:
                # raises error if model and dataset lengths do not match
                diffs[:, :, :4] = boxes_in[:, 1:, :4] - boxes_in[:, :-1, :4]
            diffs[:, :, 5] = 1.
            # zero whole steps whose source box was flagged invalid
            diffs[(boxes_in[:, :, 5] == 0.)[:, :-1]] = 0.

            if not context.validate:
                self.optim.zero_grad()
                # apply teacher forcing with probability cfg.teacher_forcing
                do_tf = context.cfg.teacher_forcing > 0 and np.random.uniform(
                ) < context.cfg.teacher_forcing
                out = context.model(diffs, boxes_target, boxes_resized,
                                    image_features, image_sizes, lengths,
                                    do_tf)
            else:
                out = context.model.predict(diffs, boxes_resized,
                                            image_features, image_sizes,
                                            lengths, boxes_target.shape[1])

            # everything below assumes a single predicted step
            assert out.shape[1] == 1
            last_input = boxes_in[:, -1, :].unsqueeze(1)

            if self.use_box_coding:
                if self.predict_coded_a:
                    # out is the acceleration in encoding space
                    last_offset = diffs[:, [-1], :4]
                    pred_offset = last_offset + out[:, :, :4]
                    pred_pos = self.box_coder.decode(list(pred_offset),
                                                     list(last_input))
                else:
                    # out is the absolute encoded offset
                    pred_pos = self.box_coder.decode(list(out[:, :, :4]),
                                                     list(last_input))
            else:
                # constant-velocity extrapolation plus predicted residual
                pred_pos = last_input[:, :, :4] + diffs[:, [-1], :
                                                        4] + out[:, :, :4]

            # calculate loss
            if self.use_box_coding:
                last_coded = diffs[:, [-1], :4]
                loss = self.criterion_coded(out, boxes_in, boxes_target,
                                            last_coded)
            else:
                loss = self.criterion(pred_pos, boxes_target[:, :, :4])

            loss_epoch.append(loss.detach().cpu())

            # DIFFERENT LENGTH ANALYSIS
            for i, loss_list in zip(range(2, 8), loss_lengths):
                mask = lengths == i
                if mask.any():
                    if self.use_box_coding:
                        loss_part = self.criterion_coded(
                            out[mask], boxes_in[mask], boxes_target[mask],
                            diffs[mask])
                    else:
                        loss_part = self.criterion(
                            pred_pos[mask], boxes_target[:, :, :4][mask])
                    loss_list.append(loss_part.detach().cpu())

            # no optimizer step in epoch 0 — presumably a baseline/warm-up
            # epoch; confirm against the training script
            if not context.validate and context.epoch > 0:
                loss.backward()
                # nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                self.optim.step()

            all_input.append(boxes_in.detach().cpu())
            all_out.append(out.detach().cpu())
            all_diffs.append(diffs.detach().cpu())
            all_gt.append(boxes_target[:, :, :4].detach().cpu())
            all_pred_pos.append(pred_pos.detach().cpu())
            all_prev_pos.append(
                torch.cat([
                    last_input[:, :, :4].detach().cpu(),
                    boxes_target[:, :-1, :4].detach().cpu()
                ],
                          dim=1))

            # evaluate iou
            iou = jaccard(
                pred_pos.view(-1, 4).detach(),
                boxes_target[:, :, :4].view(-1, 4).detach())
            iou = iou[~torch.isnan(iou)]
            iou_epoch.append(iou)
            # mIoU here is the fraction of boxes with IoU above 0.7
            miou_epoch.append((iou > 0.7).sum().float() / len(iou))

        all_input = torch.cat(all_input)
        all_out = torch.cat(all_out)
        all_diffs = torch.cat(all_diffs)
        all_gt = torch.cat(all_gt)
        all_pred_pos = torch.cat(all_pred_pos)
        all_prev_pos = torch.cat(all_prev_pos)

        assert all_prev_pos.shape[1] == all_gt.shape[1] == all_pred_pos.shape[
            1] == 1
        eval_df = evaluate_classes(all_prev_pos.squeeze(1), all_gt.squeeze(1),
                                   all_pred_pos.squeeze(1))['df']

        # calculate cva performance for current epoch
        diff = all_input[:, 1:, :4] - all_input[:, :-1, :4]
        # masked mean velocity over the valid (flag == 1) input steps
        m = (all_input[:, :, 5] == 1.)[:, :-1].unsqueeze(1).float()
        v_mean = torch.bmm(m, diff) / m.sum(dim=2).unsqueeze(2)
        # set NaNs to zero (https://discuss.pytorch.org/t/how-to-set-nan-in-tensor-to-0/3918/4)
        v_mean[v_mean != v_mean] = 0.
        v_mean = v_mean.squeeze(1)

        pred_cva = all_input[:, -1, :4] + v_mean
        # keep only geometrically valid boxes (non-negative width/height)
        val_mask = ((pred_cva[:, 2] - pred_cva[:, 0]) >= 0) & (
            (pred_cva[:, 3] - pred_cva[:, 1]) >= 0)

        iou_cva = jaccard(pred_cva, all_gt.squeeze(1)).mean()
        miou_cva = (jaccard(pred_cva, all_gt.squeeze(1)) >
                    0.7).sum().float() / len(pred_cva)

        if self.use_box_coding:
            offset_cva = self.box_coder.encode(
                list(pred_cva[val_mask].unsqueeze(1)[:, :, :4]),
                list(all_input[val_mask][:, [-1], :4]))
            coded_cva = torch.stack(offset_cva) - all_diffs[val_mask][:,
                                                                      [-1], :4]
            loss_cva = self.criterion_coded(coded_cva, all_input[val_mask],
                                            all_gt[val_mask],
                                            all_diffs[val_mask])
        else:
            loss_cva = self.criterion(pred_cva, all_gt.squeeze(1))

        if context.validate:
            # recompute epoch metrics over the full concatenated epoch
            if self.use_box_coding:
                loss_epoch = self.criterion_coded(all_out, all_input, all_gt,
                                                  all_diffs)
            else:
                loss_epoch = self.criterion(
                    all_pred_pos.squeeze(1)[:, :4], all_gt.squeeze(1)).mean()

            iou = jaccard(all_pred_pos.squeeze(1)[:, :4], all_gt.squeeze(1))
            iou_epoch = iou.mean()
            miou_epoch = ((iou > 0.7).sum().float() / len(iou))
            with open(context.log_path / f'{context.epoch}_df_val.txt',
                      'w') as fh:
                fh.write(eval_df.to_string())

            # LR schedulers are stepped only on validation epochs
            if context.cfg.scheduler_type == 'plateau':
                self.sched.step(loss_epoch, epoch=context.epoch)
            elif context.cfg.scheduler_type == 'multistep':
                self.sched.step(epoch=context.epoch)

        else:
            # training epoch: aggregate the per-batch metrics
            loss_epoch = torch.tensor(loss_epoch).mean()
            iou_epoch = torch.cat(iou_epoch).mean()
            miou_epoch = torch.stack(miou_epoch).float().mean()
            with open(context.log_path / f'{context.epoch}_df_train.txt',
                      'w') as fh:
                fh.write(eval_df.to_string())

        metrics = {
            'loss': loss_epoch,
            'iou': iou_epoch,
            'miou': miou_epoch,
            'iou_cva': iou_cva,
            'miou_cva': miou_cva,
            'loss_cva': loss_cva,
            'loss_1': torch.tensor(loss_lengths[0]).mean(),
            'loss_2': torch.tensor(loss_lengths[1]).mean(),
            'loss_3': torch.tensor(loss_lengths[2]).mean(),
            'loss_4': torch.tensor(loss_lengths[3]).mean(),
            'loss_5': torch.tensor(loss_lengths[4]).mean(),
            'loss_6': torch.tensor(loss_lengths[5]).mean()
        }

        # periodically run the (expensive) tracker validation
        if context.epoch % context.cfg.tracktor_val_every == 0 and context.validate:
            with torch.no_grad():
                metrics = {
                    **metrics,
                    **self.validate_tracktor(context.model, context.epoch)
                }

        return metrics
Example #11
0
def train(args):
    """Train the face-to-sketch GAN (generator ``SketchNet`` vs. ``DNet``).

    Expected ``args`` attributes: seed, train_data, batch_size, Gnorm,
    Dnorm, vgg19_weight, gpus, resume, save_weight_path, epochs, lr,
    train_style, flayers, topk, weight (3 loss weights), meanshift.

    Side effects: writes ``log.txt`` and per-epoch G/D checkpoints into
    ``args.save_weight_path``; mutates ``args.epochs`` in place.

    Raises:
        ValueError: if ``args.train_style`` is not 'cufs' or 'cufsf'.
    """
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True
    # seed every RNG source for reproducibility
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # -------------------- Load data ----------------------------------
    transform = transforms.Compose([
        Rescale((224, 224)),
        ColorJitter(0.5, 0.5, 0.5, 0.3, 0.5),
        ToTensor(),
    ])
    dataset = FaceDataset(args.train_data, True, transform=transform)
    data_loader = DataLoader(dataset,
                             shuffle=True,
                             batch_size=args.batch_size,
                             drop_last=True,
                             num_workers=4)

    # ----------------- Define networks ---------------------------------
    Gnet = SketchNet(in_channels=3, out_channels=1, norm_type=args.Gnorm)
    Dnet = DNet(norm_type=args.Dnorm)
    vgg19_model = vgg19(args.vgg19_weight)

    gpu_ids = [int(x) for x in args.gpus.split(',')]
    if len(gpu_ids) > 0:
        Gnet.cuda()
        Dnet.cuda()
        Gnet = nn.DataParallel(Gnet, device_ids=gpu_ids)
        Dnet = nn.DataParallel(Dnet, device_ids=gpu_ids)
        vgg19_model = nn.DataParallel(vgg19_model, device_ids=gpu_ids)

    Gnet.train()
    Dnet.train()

    if args.resume:
        # resume from the newest checkpoint pair ('...G.pth' / '...D.pth');
        # [:-5] strips the 'G.pth'/'D.pth' suffix to get the common prefix
        weights = glob(os.path.join(args.save_weight_path, '*-*.pth'))
        weight_path = sorted(weights)[-1][:-5]
        Gnet.load_state_dict(torch.load(weight_path + 'G.pth'))
        Dnet.load_state_dict(torch.load(weight_path + 'D.pth'))

    # ---------------- set optimizer and learning rate ---------------------
    # normalize the epoch count to dataset size; train at least 4 epochs
    args.epochs = np.ceil(args.epochs * 1000 / len(dataset))
    args.epochs = max(int(args.epochs), 4)
    # decay LR by 10x at 1/4 and 2/4 of training
    ms = [int(1. / 4 * args.epochs), int(2. / 4 * args.epochs)]

    optim_G = Adam(Gnet.parameters(), args.lr)
    optim_D = Adam(Dnet.parameters(), args.lr)
    scheduler_G = MultiStepLR(optim_G, milestones=ms, gamma=0.1)
    scheduler_D = MultiStepLR(optim_D, milestones=ms, gamma=0.1)
    mse_crit = nn.MSELoss()

    # ---------------------- Define reference styles and feature loss layers ----------
    if args.train_style == 'cufs':
        ref_style_dataset = ['CUHK_student', 'AR', 'XM2VTS']
        ref_feature = './data/cufs_feature_dataset.pth'
        ref_img_list = './data/cufs_reference_img_list.txt'
    elif args.train_style == 'cufsf':
        ref_style_dataset = ['CUFSF']
        ref_feature = './data/cufsf_feature_dataset.pth'
        ref_img_list = './data/cufsf_reference_img_list.txt'
    else:
        # explicit error instead of `assert 1 == 0` (asserts vanish under -O)
        raise ValueError('Train style {} not supported.'.format(
            args.train_style))

    vgg_feature_layers = ['r11', 'r21', 'r31', 'r41', 'r51']
    feature_loss_layers = list(
        itertools.compress(vgg_feature_layers, args.flayers))

    log = logger.Logger(args.save_weight_path)

    for e in range(args.epochs):
        sample_count = 0
        for batch_idx, batch_data in enumerate(data_loader):
            # ---------------- Load data -------------------
            start = time()
            train_img, train_img_org = [
                utils.tensorToVar(x) for x in batch_data
            ]
            topk_sketch_img, topk_photo_img = search_dataset.find_photo_sketch_batch(
                train_img_org,
                ref_feature,
                ref_img_list,
                vgg19_model,
                dataset_filter=ref_style_dataset,
                topk=args.topk)
            random_real_sketch = search_dataset.get_real_sketch_batch(
                train_img.size(0),
                ref_img_list,
                dataset_filter=ref_style_dataset)
            end = time()
            data_time = end - start
            sample_count += train_img.size(0)

            # ---------------- Model forward -------------------
            start = time()
            fake_sketch = Gnet(train_img)
            fake_score = Dnet(fake_sketch)
            real_score = Dnet(random_real_sketch)

            real_label = torch.ones_like(fake_score)
            fake_label = torch.zeros_like(fake_score)

            # ----------------- Calculate loss and backward -------------------
            train_img_org_vgg = img_process.subtract_mean_batch(
                train_img_org, 'face')
            topk_sketch_img_vgg = img_process.subtract_mean_batch(
                topk_sketch_img, 'sketch')
            topk_photo_img_vgg = img_process.subtract_mean_batch(
                topk_photo_img, 'face')
            fake_sketch_vgg = img_process.subtract_mean_batch(
                fake_sketch.expand_as(train_img_org), 'sketch', args.meanshift)

            style_loss = loss.feature_mrf_loss_func(
                fake_sketch_vgg,
                topk_sketch_img_vgg,
                vgg19_model,
                feature_loss_layers, [train_img_org_vgg, topk_photo_img_vgg],
                topk=args.topk)

            tv_loss = loss.total_variation(fake_sketch)

            # GAN Loss (LSGAN-style MSE against real/fake labels)
            adv_loss = mse_crit(fake_score, real_label) * args.weight[1]
            tv_loss = tv_loss * args.weight[2]
            loss_G = style_loss * args.weight[0] + adv_loss + tv_loss
            loss_D = 0.5 * mse_crit(fake_score, fake_label) + 0.5 * mse_crit(
                real_score, real_label)

            # Update parameters: D first (retain_graph so G can reuse the
            # graph through fake_score), then G
            optim_D.zero_grad()
            loss_D.backward(retain_graph=True)
            optim_D.step()

            optim_G.zero_grad()
            loss_G.backward()
            optim_G.step()

            end = time()
            train_time = end - start

            # ----------------- Print result and log the output -------------------
            # .item() replaces the pre-PyTorch-0.4 `.data[0]` accessor,
            # which raises on 0-dim tensors in modern versions
            log.iterLogUpdate(loss_G.item())
            if batch_idx % 100 == 0:
                log.draw_loss_curve()

            msg = "{:%Y-%m-%d %H:%M:%S}\tEpoch [{:03d}/{:03d}]\tBatch [{:03d}/{:03d}]\tData: {:.2f}  Train: {:.2f}\tLoss: G-{:.4f}, Adv-{:.4f}, tv-{:.4f}, D-{:.4f}".format(
                datetime.now(), e, args.epochs, sample_count, len(dataset),
                data_time, train_time,
                *[x.item() for x in [loss_G, adv_loss, tv_loss, loss_D]])
            print(msg)
            # `with` guarantees the handle is closed even if write() fails
            with open(os.path.join(args.save_weight_path, 'log.txt'),
                      'a+') as log_file:
                log_file.write(msg + '\n')

        # step the LR schedulers AFTER the epoch's optimizer updates
        # (required ordering since PyTorch 1.1; previously stepped at the
        # start of the epoch)
        scheduler_G.step()
        scheduler_D.step()

        # save CPU copies of both networks every epoch
        save_weight_name = "epochs-{:03d}-".format(e)
        G_cpu_model = copy.deepcopy(Gnet).cpu()
        D_cpu_model = copy.deepcopy(Dnet).cpu()
        torch.save(
            G_cpu_model.state_dict(),
            os.path.join(args.save_weight_path, save_weight_name + 'G.pth'))
        torch.save(
            D_cpu_model.state_dict(),
            os.path.join(args.save_weight_path, save_weight_name + 'D.pth'))
Example #12
0
class Experiment:
    """CartoonGAN-style experiment: warm-up, adversarial training, validation.

    All hyper-parameters come from a JSON config file; datasets, models,
    losses, optimizers and schedulers are built in ``__init__``.
    """

    def __init__(self, config_file="config.json"):
        """Load the config and build datasets, loaders, models and optims.

        Args:
            config_file: path to the JSON configuration file.

        Raises:
            Exception: if ``config_file`` does not exist.
        """
        # read config.json file
        if os.path.isfile(config_file):
            with open(config_file) as json_file:
                config = json.load(json_file)
                self.config = config
        else:
            raise Exception("file does not exist: %s" % config_file)
        # read in
        root = config["dataset"]["root"]
        self.root = os.path.abspath(root)
        self.num_epoch = config["num_epoch"]
        self.warmup_epoch = config["train"]["G_warming"]
        self.batch_size = config["dataset"]["batch_size"]
        self.G_path = config["model"]["G_path"]
        self.D_path = config["model"]["D_path"]
        # edge promoting: one-off preprocessing, skipped when output exists
        if not os.path.isdir(os.path.join(self.root, "edge_smoothed")):
            src_dir = os.path.join(self.root, "violet", "train")
            target_dir = os.path.join(self.root, "edge_smoothed")
            utils.edge_promoting(src_dir, target_dir)
        else:
            print("edge-promoting already done %s" %
                  os.path.join(self.root, "edge_smoothed"))
        # initialize dataset
        train_real_dataset = MyDataset(self.root, style="real", mode="train")
        # NOTE(review): mode="" looks odd next to "train"/"valid"/"test" —
        # confirm the edge_smoothed images really live at the style root
        train_anim_dataset = MyDataset(self.root,
                                       style="edge_smoothed",
                                       mode="")

        val_real_dataset = MyDataset(self.root, style="real", mode="valid")
        val_anim_dataset = MyDataset(self.root, style="violet", mode="valid")
        test_dataset = MyDataset(self.root, style="real", mode="test")
        self.train_real_loader = DataLoader(train_real_dataset,
                                            batch_size=self.batch_size,
                                            shuffle=True,
                                            num_workers=12)
        self.train_anim_loader = DataLoader(train_anim_dataset,
                                            batch_size=self.batch_size,
                                            shuffle=True,
                                            num_workers=12)
        self.val_real_loader = DataLoader(val_real_dataset,
                                          batch_size=self.batch_size,
                                          shuffle=True,
                                          num_workers=12)
        self.val_anim_loader = DataLoader(val_anim_dataset,
                                          batch_size=self.batch_size,
                                          shuffle=True,
                                          num_workers=12)

        # BUGFIX: was left as the Ellipsis placeholder (`...`) while
        # test_dataset went unused; wire it up (no shuffle for determinism)
        self.test_loader = DataLoader(test_dataset,
                                      batch_size=self.batch_size,
                                      shuffle=False,
                                      num_workers=12)

        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        print("Using device: ", self.device)

        # initialize Discriminator and Generator
        self.D = model.discriminator()
        self.D.to(self.device)

        self.G = model.generator()
        self.G.to(self.device)

        # initialize vgg19 pretrained model (frozen feature extractor)
        self.vgg19 = torchvision.models.vgg19(pretrained=True)
        self.vgg19.to(self.device)
        self.vgg19.eval()

        # initialize optimizer
        self.D_optimizer = optim.Adam(self.D.parameters(),
                                      config["optim"]["D_lr"],
                                      betas=(0.5, 0.99))
        self.G_optimizer = optim.Adam(self.G.parameters(),
                                      config["optim"]["G_lr"],
                                      betas=(0.5, 0.99))

        # initialize loss function
        self.BCE_loss = nn.BCELoss().to(self.device)
        self.L1_loss = nn.L1Loss().to(self.device)
        # weight of the VGG content loss relative to the adversarial loss
        self.content_loss_lambda = 10

        # initialize scheduler
        self.D_scheduler = MultiStepLR(self.D_optimizer,
                                       config["optim"]["D_step"],
                                       config["optim"]["D_gamma"])
        self.G_scheduler = MultiStepLR(self.G_optimizer,
                                       config["optim"]["G_step"],
                                       config["optim"]["G_gamma"])

    def _train(self, e):
        """
        Train D and G for one epoch of adversarial + content training.

        Args:
            e: current epoch index (for logging only).

        Returns:
            tuple of (average D loss, average G loss, average content loss).
        """
        # put model to training mode
        self.D.train()
        self.G.train()

        # arrays to store the losses
        D_losses = []
        G_losses = []
        Content_losses = []

        for i, data in enumerate(
                zip(self.train_real_loader, self.train_anim_loader)):
            src, anim = data[0], data[1]

            # anim batches are side-by-side pairs: [original | edge-smoothed]
            origin_anim = anim[:, :, :, :256]
            edge_smooth_anim = anim[:, :, :, 256:]

            src = src.to(self.device)
            edge_smooth_anim, origin_anim = edge_smooth_anim.to(
                self.device), origin_anim.to(self.device)

            # train discriminator...

            # discriminate real anime image
            D_real = self.D(origin_anim)
            D_real_loss = self.BCE_loss(
                D_real, torch.ones_like(D_real, device=self.device))

            # discriminate generated/fake anime image
            fake_anim = self.G(src)
            D_fake = self.D(fake_anim)
            D_fake_loss = self.BCE_loss(
                D_fake, torch.zeros_like(D_fake, device=self.device))

            # discriminate real anime image without clear edges
            D_edge = self.D(edge_smooth_anim)
            D_edge_loss = self.BCE_loss(
                D_edge, torch.zeros_like(D_edge, device=self.device))

            D_loss = D_real_loss + D_fake_loss + D_edge_loss
            self.D_optimizer.zero_grad()
            D_loss.backward()
            self.D_optimizer.step()

            # train generator...

            # generated/fake anime image (fresh forward pass after D update)
            fake_anim = self.G(src)
            D_fake = self.D(fake_anim)
            # generator wants D to output "real" for its fakes
            D_fake_loss = self.BCE_loss(
                D_fake, torch.ones_like(D_fake, device=self.device))

            # content loss (L1) on VGG features; inputs mapped [-1,1] -> [0,1]
            src_feature = self.vgg19((src + 1) / 2)
            G_feature = self.vgg19((fake_anim + 1) / 2)
            Content_loss = self.content_loss_lambda * self.L1_loss(
                G_feature, src_feature.detach())

            G_loss = D_fake_loss + Content_loss
            self.G_optimizer.zero_grad()
            G_loss.backward()
            self.G_optimizer.step()

            print(
                "Epoch: %s, Index: %s, Discriminator loss: %.3f, Generator loss: %.3f, Content loss: %.3f"
                % (e, i, D_loss.item(), G_loss.item(), Content_loss.item()))
            D_losses.append(D_loss.item())
            G_losses.append(G_loss.item())
            Content_losses.append(Content_loss.item())

        average_D_loss = np.mean(D_losses)
        average_G_loss = np.mean(G_losses)
        average_content_loss = np.mean(Content_losses)

        print()
        print(
            "Average: Epoch: %s, Discriminator loss: %.3f, Generator loss: %.3f, Content loss: %.3f"
            % (e, average_D_loss, average_G_loss, average_content_loss))
        print()

        # step once per epoch, after the optimizer updates
        self.G_scheduler.step()
        self.D_scheduler.step()
        return average_D_loss, average_G_loss, average_content_loss

    def _train_warming(self, e):
        """
        Warm up the generator for one epoch with content loss only.

        Args:
            e: current epoch index (for logging only).

        Returns:
            average content loss over the epoch.
        """
        # put generator to training mode
        self.G.train()

        # arrays to store the losses
        Content_losses = []

        for i, src in enumerate(self.train_real_loader):
            src = src.to(self.device)

            # train generator

            # generated/fake anime image
            fake_anim = self.G(src)

            # content loss (L1) on VGG features; inputs mapped [-1,1] -> [0,1]
            src_feature = self.vgg19((src + 1) / 2)
            G_feature = self.vgg19((fake_anim + 1) / 2)
            Content_loss = self.content_loss_lambda * self.L1_loss(
                G_feature, src_feature.detach())

            self.G_optimizer.zero_grad()
            Content_loss.backward()
            self.G_optimizer.step()

            print("Epoch: %s, Index: %s, Content loss: %.3f" %
                  (e, i, Content_loss.item()))
            Content_losses.append(Content_loss.item())

        average_content_loss = np.mean(Content_losses)
        print()
        print("Epoch: %s, Average content loss: %.3f" %
              (e, average_content_loss))
        print()
        return average_content_loss

    def _valid(self, e, pretrain=False):  # use e for image names
        """Save up to 7 side-by-side (input | generated) validation images."""
        save_path = os.path.join(self.config["valid"]["save_path"])
        with torch.no_grad():
            self.G.eval()
            for i, src in enumerate(self.val_real_loader):
                src = src.to(self.device)
                generated_img = self.G(src)
                # concatenate input and output along width, map to [0, 1]
                result = torch.cat((src[0], generated_img[0]), 2)
                result = (result.cpu().numpy().transpose(1, 2, 0) + 1) / 2

                # idiomatic truth test instead of `== True` / `== False`
                if pretrain:
                    filename = "pretrain_%s_%s.png" % (e, i)
                else:
                    filename = "during_train_%s_%s.png" % (e, i)
                path = os.path.join(save_path, filename)

                plt.imsave(path, result)
                if i == 6:
                    break

    def run(self):
        """Run warm-up then full training; return the per-epoch loss lists."""
        warm_up_content_losses = []  # store the average loss at each epoch
        training_D_losses = []
        training_G_losses = []
        training_content_losses = []

        print("start warming up")
        for e in range(self.warmup_epoch):
            curr_content_loss = self._train_warming(e)
            warm_up_content_losses.append(curr_content_loss)
            self._valid(e, True)

        print("start training and validating")
        for e in range(self.num_epoch):
            curr_D_loss, curr_G_loss, curr_content_loss = self._train(e)
            training_D_losses.append(curr_D_loss)
            training_G_losses.append(curr_G_loss)
            training_content_losses.append(curr_content_loss)
            self._valid(e, False)

        return warm_up_content_losses, training_D_losses, training_G_losses, training_content_losses

    def _save_model(self, epoch, D_state, G_state, D_optim_state,
                    G_optim_state):
        """Save D and G checkpoints (with optimizer state) to the
        configured paths."""
        # single-argument os.path.join was a no-op; use the paths directly
        torch.save(
            {
                "epoch": epoch,
                "D_state": D_state,
                "D_optim_state": D_optim_state
            }, self.D_path)
        torch.save(
            {
                "epoch": epoch,
                "G_state": G_state,
                "G_optim_state": G_optim_state
            }, self.G_path)

    def _test(self):
        """Placeholder — evaluation on self.test_loader not implemented."""
        return
def train_model(model, args):
    """Train the multi-digit sequence classifier on an LMDB dataset.

    Args:
        model: network returning (pred_length, pred_digit1..pred_digit5).
        args: needs train_lmdb_path, val_lmdb_path, batch_size,
            learning_rate, epochs, weights_path.

    Returns:
        (training_losses, training_accuracies, validation_accuracies,
        validation_losses): dicts keyed by epoch index.

    Side effects: saves the model weights to ``args.weights_path`` whenever
    validation accuracy improves.
    """
    # Image normalization
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [109.9, 109.7, 113.8]],
        std=[x / 255.0 for x in [50.1, 50.6, 50.8]])

    # Compose transformations to be applied on image
    train_transform = transforms.Compose([
        transforms.Resize((54, 54)),
        transforms.ToTensor(),
        normalize,
    ])

    # Define train data set and data loader
    train_dataset = Dataset(args.train_lmdb_path, train_transform)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    # Initialize loss, optimizer, scheduler
    loss_object = Custom_loss()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                nesterov=True,
                                weight_decay=0.0005)

    scheduler = MultiStepLR(optimizer, milestones=[80, 120], gamma=0.1)

    best_accuracy = 0
    training_accuracies = dict()
    training_losses = dict()
    validation_accuracies = dict()
    validation_losses = dict()

    # Start training
    for epoch in range(args.epochs):
        total_epoch_loss = 0
        total_epoch_accuracy = 0
        print(f'Starting training for epoch {epoch}')
        for i, data in enumerate(train_loader):
            print(f'Starting training for iteration {i}')
            images, lengths, labels = data
            if torch.cuda.is_available():
                images = images.cuda()
                lengths = lengths.cuda()
                labels = labels.cuda()
                model.cuda()
                loss_object = loss_object.cuda()

            pred_length, pred_digit1, pred_digit2, pred_digit3, pred_digit4, pred_digit5 = model(
                images)

            loss = loss_object.loss(pred_length, pred_digit1, pred_digit2,
                                    pred_digit3, pred_digit4, pred_digit5,
                                    lengths, labels) / args.batch_size
            # BUGFIX: accumulate a plain float — summing the tensor itself
            # keeps every batch's autograd graph alive (memory growth)
            total_epoch_loss += loss.item()

            total_epoch_accuracy += calculate_accuracy(
                pred_length, pred_digit1, pred_digit2, pred_digit3,
                pred_digit4, pred_digit5, lengths, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        training_accuracies[epoch] = total_epoch_accuracy / len(train_loader)
        training_losses[epoch] = total_epoch_loss / len(train_loader)

        validation_loss, validation_accuracy = validation(
            model, args.val_lmdb_path, args.batch_size, loss_object)

        validation_losses[epoch] = validation_loss
        validation_accuracies[epoch] = validation_accuracy

        if validation_accuracy > best_accuracy:
            # BUGFIX: track the running best (it was never updated, so the
            # model was re-saved on every epoch with accuracy > 0)
            best_accuracy = validation_accuracy
            # BUGFIX: `args.weights_path + '/' + epoch + '.pt'` raised
            # TypeError (str + int); format the epoch into the path instead
            torch.save(model.state_dict(),
                       f'{args.weights_path}/{epoch}.pt')

        # step once per epoch; the explicit `epoch` argument is deprecated
        scheduler.step()
    return training_losses, training_accuracies, validation_accuracies, validation_losses
Example #14
0
def main():
    """Train the BRN deraining network with a negative-SSIM loss.

    Relies on module-level globals: `opt` (parsed CLI options), `Dataset` /
    `newDataset`, `BRN`, `pytorch_ssim`, `findLastCheckpoint`, `batch_PSNR`,
    and `utils` (torchvision-style `make_grid`).  Side effects: writes
    TensorBoard event files and model checkpoints into `opt.save_path`.
    """
    if not os.path.isdir(opt.save_path):
        os.makedirs(opt.save_path)
    # Load dataset
    print('Loading dataset ...\n')
    # Data paths whose name contains 'Light' or 'Heavy' select the
    # alternative dataset implementation; everything else uses Dataset.
    if (opt.data_path.find('Light') != -1
            or opt.data_path.find('Heavy') != -1):
        dataset_train = newDataset(data_path=opt.data_path)
    else:
        dataset_train = Dataset(data_path=opt.data_path)

    # dataset_val = Dataset(train=False)
    loader_train = DataLoader(dataset=dataset_train,
                              num_workers=4,
                              batch_size=opt.batchSize,
                              shuffle=True)
    print("# of training samples: %d\n" % int(len(dataset_train)))
    # Build model

    net = BRN(recurrent_iter=opt.inter_iter, use_GPU=opt.use_GPU)
    net = nn.DataParallel(net)
    #print_network(net)

    #criterion = nn.MSELoss(size_average=False)
    # SSIM is a similarity score to be maximized; the training loss below
    # is therefore its negation.
    criterion = pytorch_ssim.SSIM()

    # Move to GPU

    model = net.cuda()
    criterion.cuda()
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    scheduler = MultiStepLR(optimizer, milestones=[30, 50, 80],
                            gamma=0.2)  # learning rates
    #scheduler = MultiStepLR(optimizer, milestones=[120, 140], gamma=0.2)
    # training
    writer = SummaryWriter(opt.save_path)
    step = 0

    initial_epoch = findLastCheckpoint(
        save_dir=opt.save_path)  # load the last model in matconvnet style
    if initial_epoch > 0:
        print('resuming by loading epoch %03d' % initial_epoch)
        model.load_state_dict(
            torch.load(
                os.path.join(opt.save_path,
                             'net_epoch%d.pth' % initial_epoch)))

    for epoch in range(initial_epoch, opt.epochs):

        # NOTE(review): step(epoch) at the top of the epoch is the legacy
        # (pre-1.1) PyTorch scheduler idiom; kept as-is so the effective
        # learning-rate schedule is unchanged.
        scheduler.step(epoch)
        # set learning rate
        for param_group in optimizer.param_groups:
            # param_group["lr"] = current_lr
            print('learning rate %f' % param_group["lr"])
        # train
        for i, (input, target) in enumerate(loader_train, 0):
            # training step
            model.train()
            model.zero_grad()
            optimizer.zero_grad()

            # rain = input - target
            input_train, target_train = Variable(input.cuda()), Variable(
                target.cuda())

            out_train, _, _, _ = model(input_train)

            pixel_loss = criterion(target_train, out_train)
            #mse = criterion(input_train1 - target_train, r)

            # Minimizing negative SSIM maximizes structural similarity.
            loss = (-pixel_loss)  #+ mse
            loss.backward()

            optimizer.step()
            # results: re-run the forward pass in eval mode to report PSNR
            model.eval()
            with torch.no_grad():
                out_train, _, _, _ = model(input_train)
                out_train = torch.clamp(out_train, 0., 1.)
                #out_r_train = torch.clamp(out_r_train, 0., 1.)
                psnr_train = batch_PSNR(out_train, target_train, 1.)
            #psnr_train_r = batch_PSNR(out_r_train, rain_train, 1.)
            print(
                "[epoch %d][%d/%d] loss: %.4f, PSNR_train: %.4f" %
                (epoch + 1, i + 1, len(loader_train), loss.item(), psnr_train))
            # if you are using older version of PyTorch, you may need to change loss.item() to loss.data[0]

            if step % 10 == 0:
                # Log the scalar values
                writer.add_scalar('loss', loss.item(), step)
                writer.add_scalar('PSNR on training data', psnr_train, step)
                # writer.add_scalar('loss_r', loss_r.item(), step)
                #writer.add_scalar('PSNR_r on training data', psnr_train_r, step)
            step += 1
        ## the end of each epoch

        model.eval()

        with torch.no_grad():
            # log image grids built from the LAST training batch of the epoch
            # (input_train/target_train are loop-carried from the loop above)
            out_train, _, _, _ = model(input_train)
            out_train = torch.clamp(out_train, 0., 1.)
            #out_r_train = torch.clamp(out_r_train, 0., 1.)
            Img = utils.make_grid(target_train.data,
                                  nrow=8,
                                  normalize=True,
                                  scale_each=True)
            Imgn = utils.make_grid(input_train.data,
                                   nrow=8,
                                   normalize=True,
                                   scale_each=True)
            Irecon = utils.make_grid(out_train.data,
                                     nrow=8,
                                     normalize=True,
                                     scale_each=True)
        #rainstreak = utils.make_grid(out_r_train.data, nrow=8, normalize=True, scale_each=True)
        writer.add_image('clean image', Img, epoch)
        writer.add_image('noisy image', Imgn, epoch)
        writer.add_image('reconstructed image', Irecon, epoch)
        #writer.add_image('estimated rain image', rainstreak, epoch)
        # save model
        torch.save(model.state_dict(),
                   os.path.join(opt.save_path, 'net_latest.pth'))

        if epoch % opt.save_freq == 0:
            torch.save(
                model.state_dict(),
                os.path.join(opt.save_path, 'net_epoch%d.pth' % (epoch + 1)))
def main():
    """Entry point for PAC-Bayes contrastive training on the AUSLAN data.

    Parses CLI arguments, seeds all RNGs for determinism, builds contrastive
    data loaders and a ``StochasticMLP``, trains for ``args.epoch`` epochs
    with the chosen optimizer under a ``MultiStepLR`` schedule, and finally
    saves model weights and/or a JSON learning history.

    When ``args.criterion == 'loss'`` the run is driven by validation loss
    with independent early stopping for the stochastic and deterministic
    evaluations; otherwise the per-epoch training objective is recorded.

    Fix: ``logger.warn`` (deprecated alias) replaced with ``logger.warning``.
    """
    logger = get_logger()

    parser = common_parser(pac_bayes=True)
    args = parser.parse_args()
    check_args(args, pac_bayes=True)

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    is_criterion_val_loss = args.criterion == 'loss'

    # Reproducibility: fixed seeds plus deterministic cuDNN kernels.
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    rnd = np.random.RandomState(args.seed)
    iid = not args.non_iid

    device = torch.device('cuda' if use_cuda else 'cpu')

    contrastive_loss = ContrastiveLoss(loss_name=args.loss, device=device)

    # `num_blocks_per_class` is ignored due to non-iid setting.
    train_loader, val_loader = get_contrastive_data_loaders(
        rnd=rnd,
        data_name='auslan',
        validation_ratio=args.validation_ratio,
        mini_batch_size=args.batch_size,
        num_blocks_per_class=45 *
        24,  # this value is ignored when iid is False.
        block_size=args.block_size,
        neg_size=args.neg_size,
        root=args.root,
        include_test=False,
        iid=iid)

    num_training_samples = len(train_loader.dataset)
    if val_loader is None:
        num_val_samples = 0
    else:
        num_val_samples = len(val_loader.dataset)
        if args.criterion == 'pb':
            # `Logger.warn` is a deprecated alias; use `Logger.warning`.
            logger.warning(
                'You can pass 0. to `validation-ratio` argument. It could make performance better.'
            )

    logger.info('# training samples: {} # val samples: {}\n'.format(
        num_training_samples, num_val_samples))
    logger.info(
        'PAC-Bayes parameters: b: {}, c: {}, δ: {}, prior log std: {}\n'.
        format(args.b, args.c, args.delta, args.prior_log_std))

    model = StochasticMLP(
        num_training_samples=num_training_samples,
        rnd=rnd,
        num_last_units=args.dim_h,
        catoni_lambda=args.catoni_lambda,
        b=args.b,
        c=args.c,
        delta=args.delta,
        prior_log_std=args.prior_log_std,
    ).to(device)

    optimizer_name = args.optim.lower()

    if optimizer_name == 'adam':
        optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(params=model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum)
    elif optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(params=model.parameters(), lr=args.lr)
    else:
        raise ValueError(
            'Optimizer must be adam, sgd, or rmsprop. Not {}'.format(
                optimizer_name))

    logger.info('optimizer: {}\n'.format(optimizer_name))

    scheduler = MultiStepLR(optimizer,
                            milestones=args.schedule,
                            gamma=args.gamma)

    if is_criterion_val_loss:
        # Track the stochastic and deterministic evaluations independently,
        # each with its own early-stopping state and history.
        early_stoppings = {
            'stochastic': EarlyStopping(mode='min', patience=args.patience),
            'deterministic': EarlyStopping(mode='min', patience=args.patience),
        }

        learning_histories = {
            'stochastic': {
                'val_loss': []
            },
            'deterministic': {
                'val_loss': []
            }
        }
    else:
        learning_history = {args.criterion: []}

    save_name = 'lr-{}_{}_{}_{}'.format(args.lr, optimizer_name,
                                        args.criterion, args.output_model_name)
    if is_criterion_val_loss:
        save_names = dict()

        save_names['stochastic'] = 'lr-{}_{}_{}_stochastic_{}'.format(
            args.lr, optimizer_name, args.criterion, args.output_model_name)
        save_names['deterministic'] = 'lr-{}_{}_{}_deterministic_{}'.format(
            args.lr, optimizer_name, args.criterion, args.output_model_name)

    # In the non-iid setting T is the dependency block size passed to
    # `train`; in the iid setting it is unused (0.).
    if iid:
        T = 0.
    else:
        T = args.block_size

    for epoch in range(1, args.epoch + 1):
        average_objective = train(args, model, device, train_loader, optimizer,
                                  epoch, contrastive_loss, T, logger)
        scheduler.step()

        # calculate criterion value for early-stopping
        if is_criterion_val_loss:
            delete_keys = []
            for eval_type, early_stopping in early_stoppings.items():
                is_deterministic = eval_type == 'deterministic'

                val_loss = validation_loss(args,
                                           model,
                                           device,
                                           val_loader,
                                           contrastive_loss,
                                           logger,
                                           args.num_snn,
                                           deterministic=is_deterministic)

                learning_histories[eval_type]['val_loss'].append(val_loss)

                # check early_stopping
                is_stopped = early_stopping.is_stopped_and_save(
                    val_loss, model, save_name=save_names[eval_type])

                if is_stopped:
                    delete_keys.append(eval_type)
                    learning_histories[eval_type][
                        'lowest_val_loss'] = early_stopping.best

            # Deletion is deferred so the dict is not mutated while iterating.
            for delete_key in delete_keys:
                logger.info('Remove {} evaluation\n'.format(delete_key))
                del early_stoppings[delete_key]

            # if early stopping dict becomes empty, stop the training
            if not early_stoppings:
                break

        else:
            learning_history[args.criterion].append(average_objective)

    # save learning history to json
    if is_criterion_val_loss:
        # store the lowest validation loss for evaluations that never
        # triggered early stopping (stopped ones recorded theirs above)
        for eval_type, early_stopping in early_stoppings.items():
            field_name = 'lowest_val_loss'
            learning_histories[eval_type][field_name] = early_stopping.best

        for eval_type, fname in save_names.items():
            json_fname = fname.replace('.pt', '.json')
            with open(json_fname, 'w') as log_file:
                json.dump(learning_histories[eval_type], log_file)

    else:
        torch.save(model.state_dict(), save_name)
        json_fname = save_name.replace('.pt', '.json')
        with open(json_fname, 'w') as log_file:
            json.dump(learning_history, log_file)
Example #16
0
class ExperimentBuilder(nn.Module):
    """Drives training, validation and testing of a denoising network.

    Handles device placement, optimiser/scheduler construction, experiment
    directory creation, checkpointing (best-by-train-loss and 'latest'),
    per-epoch CSV statistics, and resuming from saved state.
    """

    def __init__(self,
                 network_model,
                 experiment_name,
                 num_epochs,
                 train_data,
                 val_data,
                 test_data,
                 use_gpu,
                 continue_from_epoch=-1,
                 scheduler=None,
                 optimiser=None,
                 sched_params=None,
                 optim_params=None,
                 pretrained_weights_locations=None):
        """
        Initializes an ExperimentBuilder object. Such an object takes care of running training and evaluation of a deep net
        on a given dataset. It also takes care of saving per epoch models and automatically inferring the best val model
        to be used for evaluating the test set metrics.
        :param network_model: A pytorch nn.Module which implements a network architecture.
        :param experiment_name: The name of the experiment. This is used mainly for keeping track of the experiment and creating and directory structure that will be used to save logs, model parameters and other.
        :param num_epochs: Total number of epochs to run the experiment
        :param train_data: An object of the DataProvider type. Contains the training set.
        :param val_data: An object of the DataProvider type. Contains the val set.
        :param test_data: An object of the DataProvider type. Contains the test set.
        :param use_gpu: A boolean indicating whether to use a GPU or not.
        :param continue_from_epoch: An int indicating whether we'll start from scratch (-1), resume the 'latest' checkpoint (-2), or reload a previously saved model of epoch 'continue_from_epoch' and continue training from there.
        :param scheduler: Name of the LR scheduler to build: 'ERF', 'Step', 'Cos', or None/other for no scheduler.
        :param optimiser: Name of the optimiser: 'Adam' (also used when None) or 'SGD'.
        :param sched_params: Dict of scheduler hyper-parameters; keys read here: 'lr_max', 'lr_min', 'erf_alpha', 'erf_beta'.
        :param optim_params: Dict of optimiser hyper-parameters; keys read here: 'weight_decay', 'momentum', 'nesterov'.
        :param pretrained_weights_locations: Directory containing a 'train_model_best' checkpoint to preload, or None.
        """
        super(ExperimentBuilder, self).__init__()

        self.experiment_name = experiment_name
        self.model = network_model
        # self.model.reset_parameters()
        self.device = torch.cuda.current_device()

        if torch.cuda.device_count() > 1 and use_gpu:
            self.device = torch.cuda.current_device()
            self.model.to(self.device)
            self.model = nn.DataParallel(module=self.model)
            print('Use Multi GPU', self.device)
        elif torch.cuda.device_count() == 1 and use_gpu:
            self.device = torch.cuda.current_device()
            self.model.to(
                self.device)  # sends the model from the cpu to the gpu
            print('Use GPU', self.device)
        else:
            print("use CPU")
            self.device = torch.device('cpu')  # sets the device to be CPU
            print(self.device)

        # re-initialize network parameters
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data

        # NOTE: self.parameters() includes the wrapped model's parameters
        # because ExperimentBuilder itself is an nn.Module holding self.model.
        if optimiser is None or optimiser == 'Adam':
            self.optimizer = Adam(self.parameters(),
                                  amsgrad=False,
                                  weight_decay=optim_params['weight_decay'],
                                  lr=sched_params['lr_max'])
        elif optimiser == 'SGD':
            self.optimizer = SGD(self.parameters(),
                                 lr=sched_params['lr_max'],
                                 momentum=optim_params['momentum'],
                                 nesterov=optim_params['nesterov'],
                                 weight_decay=optim_params['weight_decay'])

        if scheduler == 'ERF':
            self.scheduler = ERF(self.optimizer,
                                 min_lr=sched_params['lr_min'],
                                 alpha=sched_params['erf_alpha'],
                                 beta=sched_params['erf_beta'],
                                 epochs=num_epochs)
        elif scheduler == 'Step':
            self.scheduler = MultiStepLR(self.optimizer,
                                         milestones=[30, 60],
                                         gamma=0.1)
        elif scheduler == 'Cos':
            self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
                self.optimizer, T_max=num_epochs, eta_min=0.00001)
        else:
            self.scheduler = None

        # Count layers/parameters purely for reporting purposes.
        print('System learnable parameters')
        num_conv_layers = 0
        num_linear_layers = 0
        total_num_parameters = 0
        for name, value in self.named_parameters():
            print(name, value.shape)
            if all(item in name for item in ['conv', 'weight']):
                num_conv_layers += 1
            if all(item in name for item in ['linear', 'weight']):
                num_linear_layers += 1
            total_num_parameters += np.prod(value.shape)

        print('Total number of parameters', total_num_parameters)
        print('Total number of conv layers', num_conv_layers)
        print('Total number of linear layers', num_linear_layers)

        # Generate the directory names
        self.experiment_folder = os.path.abspath(experiment_name)
        self.experiment_logs = os.path.abspath(
            os.path.join(self.experiment_folder, "result_outputs"))
        self.experiment_saved_models = os.path.abspath(
            os.path.join(self.experiment_folder, "saved_models"))
        print(self.experiment_folder, self.experiment_logs)
        # Set best models to be at 0 since we are just starting
        self.best_val_model_idx = 0
        self.best_val_model_acc = 0.
        self.best_train_loss = math.inf

        if not os.path.exists(self.experiment_folder
                              ):  # If experiment directory does not exist
            os.mkdir(self.experiment_folder)  # create the experiment directory

        if not os.path.exists(self.experiment_logs):
            os.mkdir(
                self.experiment_logs)  # create the experiment log directory

        if not os.path.exists(self.experiment_saved_models):
            os.mkdir(self.experiment_saved_models
                     )  # create the experiment saved models directory

        self.num_epochs = num_epochs
        self.criterion = nn.MSELoss().to(
            self.device)  # send the loss computation to the GPU
        if continue_from_epoch == -2:
            try:
                self.best_val_model_idx, self.best_val_model_acc, self.state = self.load_model(
                    model_save_dir=self.experiment_saved_models,
                    model_save_name="train_model",
                    model_idx='latest'
                )  # reload existing model from epoch and return best val model index
                # and the best val acc of that model
                self.starting_epoch = self.state['current_epoch_idx']
            # NOTE(review): bare except deliberately treats ANY load failure
            # as "no checkpoint available" and starts from scratch.
            except:
                print(
                    "Model objects cannot be found, initializing a new model and starting from scratch"
                )
                self.starting_epoch = 0
                self.state = dict()

        elif continue_from_epoch != -1:  # if continue from epoch is not -1 then
            self.best_val_model_idx, self.best_val_model_acc, self.state = self.load_model(
                model_save_dir=self.experiment_saved_models,
                model_save_name="train_model",
                model_idx=continue_from_epoch
            )  # reload existing model from epoch and return best val model index
            # and the best val acc of that model
            self.starting_epoch = self.state['current_epoch_idx']
        else:
            self.starting_epoch = 0
            self.state = dict()

        if pretrained_weights_locations is not None:
            self.load_pre_trained_model(
                model_save_dir=pretrained_weights_locations,
                model_save_name="train_model",
                model_idx='best')

    def load_pre_trained_model(self, model_save_dir, model_save_name,
                               model_idx):
        """Load pretrained weights saved under `model_save_dir` into this system.

        :param model_save_dir: Directory containing the checkpoint.
        :param model_save_name: Checkpoint base name (joined with model_idx).
        :param model_idx: Index/suffix of the checkpoint to load (e.g. 'best').
        """
        state = torch.load(f=os.path.join(
            model_save_dir, "{}_{}".format(model_save_name, str(model_idx))))

        self.load_state_dict(state_dict=state['network'])

    def get_num_parameters(self):
        """Return the total number of learnable parameters in the system."""
        total_num_params = 0
        for param in self.parameters():
            total_num_params += np.prod(param.shape)

        return total_num_params

    def run_train_iter(self, image, image_with_noise):
        """
        Runs one denoising training iteration: forwards the noisy image, computes MSE
        against the clean image, and updates the network parameters.
        :param image: The clean target images. A tensor of shape batch_size, channels, height, width
        :param image_with_noise: The corrupted input images fed to the model; same shape as `image`.
        :return: a tuple (loss as a numpy scalar, 0) — accuracy is not computed for this reconstruction task
        """
        self.train(
        )  # sets model to training mode (in case batch normalization or other methods have different procedures for training and evaluation)

        image = image.to(self.device)
        image_with_noise = image_with_noise.to(self.device)

        out = self.model.forward(
            image_with_noise)  # forward the data in the model

        loss = self.criterion(input=out, target=image)  # compute loss

        self.optimizer.zero_grad(
        )  # set all weight grads from previous training iters to 0
        loss.backward(
        )  # backpropagate to compute gradients for current iter loss

        self.optimizer.step()  # update network parameters
        _, predicted = torch.max(out.data, 1)  # get argmax of predictions
        # accuracy = np.mean(list(predicted.eq(y.data).cpu()))  # compute accuracy

        # Sanity guard: abort if any trainable non-bias tensor has collapsed
        # to (near) zero magnitude.
        for n, p in self.model.named_parameters():
            if (p.requires_grad) and ("bias" not in n):
                if p.abs().max() < 10**(-30):
                    raise Exception('Weights smaller than 10e-30')

        return loss.data.detach().cpu().numpy(), 0

    def run_evaluation_iter(self, x, y):
        """
        Receives the inputs and targets for the model and runs a classification-style evaluation iteration.
        :param x: The inputs to the model. A numpy array of shape batch_size, channels, height, width
        :param y: The targets for the model. A numpy array of shape batch_size, num_classes
        :return: the loss, accuracy, macro F1, macro precision and macro recall for this batch
        """
        self.eval()  # sets the system to validation mode
        if len(y.shape) > 1:
            y = np.argmax(
                y, axis=1
            )  # convert one hot encoded labels to single integer labels
        if type(x) is np.ndarray:
            x, y = torch.Tensor(x).float(
            ).to(device=self.device), torch.Tensor(y).long().to(
                device=self.device
            )  # convert data to pytorch tensors and send to the computation device

        x = x.to(self.device)
        y = y.to(self.device)
        out = self.model.forward(x)  # forward the data in the model
        loss = F.cross_entropy(out, y)  # compute loss
        _, predicted = torch.max(out.data, 1)  # get argmax of predictions
        accuracy = np.mean(list(predicted.eq(
            y.data).cpu()))  # compute accuracy

        y_cpu = y.data.cpu()
        predicted_cpu = predicted.cpu()

        # Macro averaging weights every class equally regardless of support.
        f1 = f1_score(y_cpu, predicted_cpu, average='macro')
        precision = precision_score(y_cpu, predicted_cpu, average='macro')
        recall = recall_score(y_cpu, predicted_cpu, average='macro')

        return loss.data.detach().cpu().numpy(
        ), accuracy, f1, precision, recall

    def save_model(self, model_save_dir, model_save_name, model_idx, state):
        """
        Save the network parameter state and current best val epoch idx and best val accuracy.
        :param model_save_name: Name to use to save model without the epoch index
        :param model_idx: The index to save the model with.
        :param best_validation_model_idx: The index of the best validation model to be stored for future use.
        :param best_validation_model_acc: The best validation accuracy to be stored for use at test time.
        :param model_save_dir: The directory to store the state at.
        :param state: The dictionary containing the system state.

        """
        state['network'] = self.state_dict(
        )  # save network parameter and other variables.
        torch.save(
            state,
            f=os.path.join(model_save_dir, "{}_{}".format(
                model_save_name,
                str(model_idx))))  # save state at prespecified filepath

    def run_training_epoch(self, current_epoch_losses):
        """Run one full pass over the training data, appending per-iteration
        loss/accuracy into `current_epoch_losses` and returning it."""
        with tqdm.tqdm(total=len(self.train_data), file=sys.stdout
                       ) as pbar_train:  # create a progress bar for training
            for idx, (image, image_with_noise) in enumerate(
                    self.train_data):  # get data batches
                loss, accuracy = self.run_train_iter(
                    image=image, image_with_noise=image_with_noise
                )  # take a training iter step
                current_epoch_losses["train_loss"].append(
                    loss)  # add current iter loss to the train loss list
                current_epoch_losses["train_acc"].append(
                    accuracy)  # add current iter acc to the train acc list
                pbar_train.update(1)
                pbar_train.set_description(
                    "loss: {:.4f}, accuracy: {:.4f}".format(loss, accuracy))

        return current_epoch_losses

    def run_validation_epoch(self, current_epoch_losses):
        """Run one full pass over the validation data, appending per-batch
        metrics into `current_epoch_losses` and returning it."""

        with tqdm.tqdm(total=len(self.val_data), file=sys.stdout
                       ) as pbar_val:  # create a progress bar for validation
            for x, y in self.val_data:  # get data batches
                loss, accuracy, f1, precision, recall = self.run_evaluation_iter(
                    x=x, y=y)  # run a validation iter

                current_epoch_losses["val_loss"].append(
                    loss)  # add current iter loss to val loss list.
                current_epoch_losses["val_acc"].append(
                    accuracy)  # add current iter acc to val acc lst.
                current_epoch_losses["val_f1"].append(f1)
                current_epoch_losses["val_precision"].append(precision)
                current_epoch_losses["val_recall"].append(recall)

                pbar_val.update(1)  # add 1 step to the progress bar
                pbar_val.set_description(
                    "loss: {:.4f}, accuracy: {:.4f}".format(loss, accuracy))

        return current_epoch_losses

    def run_testing_epoch(self, current_epoch_losses):
        """Run one full pass over the test data, appending per-batch metrics
        into `current_epoch_losses` and returning it."""

        with tqdm.tqdm(total=len(self.test_data),
                       file=sys.stdout) as pbar_test:  # ini a progress bar
            for x, y in self.test_data:  # sample batch
                # compute loss and accuracy by running an evaluation step
                loss, accuracy, f1, precision, recall = self.run_evaluation_iter(
                    x=x, y=y)

                current_epoch_losses["test_loss"].append(
                    loss)  # save test loss
                current_epoch_losses["test_acc"].append(
                    accuracy)  # save test accuracy
                current_epoch_losses["test_f1"].append(f1)
                current_epoch_losses["test_precision"].append(precision)
                current_epoch_losses["test_recall"].append(recall)

                pbar_test.update(1)  # update progress bar status
                pbar_test.set_description(
                    "loss: {:.4f}, accuracy: {:.4f}".format(
                        loss, accuracy))  # update progress bar string output
        return current_epoch_losses

    def load_model(self, model_save_dir, model_save_name, model_idx):
        """
        Load the network parameter state and the best val model idx and best val acc to be compared with the future val accuracies, in order to choose the best val model
        :param model_save_dir: The directory to store the state at.
        :param model_save_name: Name to use to save model without the epoch index
        :param model_idx: The index to save the model with.
        :return: best val idx and best val model acc, also it loads the network state into the system state without returning it
        """
        state = torch.load(f=os.path.join(
            model_save_dir, "{}_{}".format(model_save_name, str(model_idx))))
        self.load_state_dict(state_dict=state['network'])
        return state['best_val_model_idx'], state['best_val_model_acc'], state

    def run_experiment(self):
        """
        Runs experiment train and evaluation iterations, saving the model and best val model and val model accuracy after each epoch
        :return: The summary current_epoch_losses from starting epoch to total_epochs.
        """
        total_losses = {
            "train_acc": [],
            "train_loss": [],
            "val_acc": [],
            "val_loss": [],
            "val_f1": [],
            "val_precision": [],
            "val_recall": [],
            "curr_epoch": []
        }  # initialize a dict to keep the per-epoch metrics

        for i, epoch_idx in enumerate(
                range(self.starting_epoch, self.num_epochs)):
            epoch_start_time = time.time()
            current_epoch_losses = {
                "train_acc": [],
                "train_loss": [],
                "val_acc": [],
                "val_loss": [],
                "val_f1": [],
                "val_precision": [],
                "val_recall": []
            }

            current_epoch_losses = self.run_training_epoch(
                current_epoch_losses)

            if self.scheduler is not None:
                self.scheduler.step()

            train_loss_average = np.mean(current_epoch_losses['train_loss'])

            # Checkpoint on best TRAINING loss (no validation pass is run here).
            if train_loss_average < self.best_train_loss:
                print(f'Saving Best Model')
                self.best_train_loss = train_loss_average
                self.save_model(
                    model_save_dir=self.experiment_saved_models,
                    # save model and best val idx and best val acc, using the model dir, model name and model idx
                    model_save_name="train_model",
                    model_idx='best',
                    state=self.state)

            for key, value in current_epoch_losses.items():
                total_losses[key].append(np.mean(value))
                # get mean of all metrics of current epoch metrics dict,
                # to get them ready for storage and output on the terminal.

            total_losses['curr_epoch'].append(epoch_idx)
            save_statistics(experiment_log_dir=self.experiment_logs,
                            filename='summary.csv',
                            stats_dict=total_losses,
                            current_epoch=i,
                            continue_from_mode=True if
                            (self.starting_epoch != 0 or i > 0) else
                            False)  # save statistics to stats file.

            # load_statistics(experiment_log_dir=self.experiment_logs, filename='summary.csv') # How to load a csv file if you need to

            out_string = "_".join([
                "{}_{:.4f}".format(key, np.mean(value))
                for key, value in current_epoch_losses.items()
            ])
            # create a string to use to report our epoch metrics
            epoch_elapsed_time = time.time(
            ) - epoch_start_time  # calculate time taken for epoch
            epoch_elapsed_time = "{:.4f}".format(epoch_elapsed_time)
            print("Epoch {}:".format(epoch_idx), out_string, "epoch time",
                  epoch_elapsed_time, "seconds")
            self.state['current_epoch_idx'] = epoch_idx

        return total_losses
    ax1.plot(np.arange(epoch + 1), loss[0], '-y', label='ste-model loss')
    ax1.plot(np.arange(epoch + 1), loss[1], '-r', label='discriminator loss')
    ax2.plot(np.arange(epoch + 1), acc[0], '-g', label='real_acc')
    ax2.plot(np.arange(epoch + 1), acc[1], '-b', label='wm_acc')

    ax1.set_xlabel('Epoch(' + ",".join(str(l)
                                       for l in args.hyper_parameters) + ')')
    ax1.set_ylabel('Train Loss')
    ax2.set_ylabel('Accuracy (%)')

    ax1.set_ylim(0, 5)
    ax2.set_ylim(0, 100)

    ax1.legend(loc=1)
    ax2.legend(loc=2)
    if train:
        plt.savefig(args.save_path + 'results_train_' + GPU + '.png')
    else:
        plt.savefig(args.save_path + 'results_test_' + GPU + '.png')
    plt.close()


# Top-level training driver: one train/test cycle per epoch.
for epoch in range(args.num_epochs):
    train(epoch)
    val_hloss, val_disloss, val_dnnloss, acc, wm_acc, wm_inut_acc = test(epoch)
    # schedulerH/schedulerD are stepped with a validation metric (presumably
    # ReduceLROnPlateau — TODO confirm); schedulerN steps unconditionally.
    schedulerH.step(val_hloss)
    schedulerD.step(val_disloss)
    schedulerN.step()
# NOTE(review): `acc`, `wm_acc`, `wm_inut_acc` are loop-carried from the last
# epoch; this line raises NameError if args.num_epochs == 0.
print(acc, wm_acc, wm_inut_acc)
def main(args: argparse.Namespace):
    """Train (or, with ``args.phase == 'test'``, only evaluate) a keypoint
    detection model on a source domain while measuring accuracy on a target
    domain.

    Builds train/val loaders for both domains, optionally resumes from a
    checkpoint, trains for ``args.epochs`` epochs, saves a checkpoint every
    epoch, and keeps a copy of the checkpoint with the best target accuracy
    under the name ``'best'``.

    Args:
        args: parsed command-line namespace. Fields read here include log,
            phase, seed, rotation, image_size, resize_scale, heatmap_size,
            source/target (+roots), batch_size, workers, arch, lr, lr_step,
            lr_factor, resume, epochs and debug.
    """
    logger = CompleteLogger(args.log, args.phase)
    print(args)

    if args.seed is not None:
        # Seeding implies deterministic cuDNN, which trades speed for
        # reproducibility (hence the warning).
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    cudnn.benchmark = True

    # Data loading code
    # ImageNet channel statistics.
    normalize = T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    train_transform = T.Compose([
        T.RandomRotation(args.rotation),
        T.RandomResizedCrop(size=args.image_size, scale=args.resize_scale),
        T.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25),
        # NOTE(review): torchvision's GaussianBlur requires a kernel_size
        # argument; T is presumably a project-local transforms module with a
        # default — confirm, otherwise this raises at construction time.
        T.GaussianBlur(),
        T.ToTensor(), normalize
    ])
    val_transform = T.Compose(
        [T.Resize(args.image_size),
         T.ToTensor(), normalize])
    image_size = (args.image_size, args.image_size)
    heatmap_size = (args.heatmap_size, args.heatmap_size)
    # Dataset classes are looked up by name from the project's datasets module.
    source_dataset = datasets.__dict__[args.source]
    train_source_dataset = source_dataset(root=args.source_root,
                                          transforms=train_transform,
                                          image_size=image_size,
                                          heatmap_size=heatmap_size)
    train_source_loader = DataLoader(train_source_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.workers,
                                     pin_memory=True,
                                     drop_last=True)
    val_source_dataset = source_dataset(root=args.source_root,
                                        split='test',
                                        transforms=val_transform,
                                        image_size=image_size,
                                        heatmap_size=heatmap_size)
    val_source_loader = DataLoader(val_source_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   pin_memory=True)

    target_dataset = datasets.__dict__[args.target]
    train_target_dataset = target_dataset(root=args.target_root,
                                          transforms=train_transform,
                                          image_size=image_size,
                                          heatmap_size=heatmap_size)
    train_target_loader = DataLoader(train_target_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.workers,
                                     pin_memory=True,
                                     drop_last=True)
    val_target_dataset = target_dataset(root=args.target_root,
                                        split='test',
                                        transforms=val_transform,
                                        image_size=image_size,
                                        heatmap_size=heatmap_size)
    val_target_loader = DataLoader(val_target_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   pin_memory=True)

    print("Source train:", len(train_source_loader))
    print("Target train:", len(train_target_loader))
    print("Source test:", len(val_source_loader))
    print("Target test:", len(val_target_loader))

    # Wrap the training loaders so `train` can draw a fixed number of
    # iterations per epoch regardless of dataset length.
    train_source_iter = ForeverDataIterator(train_source_loader)
    train_target_iter = ForeverDataIterator(train_target_loader)

    # create model (architecture looked up by name, sized to the source
    # dataset's keypoint count)
    model = models.__dict__[args.arch](
        num_keypoints=train_source_dataset.num_keypoints).to(device)
    criterion = JointsMSELoss()

    # define optimizer and lr scheduler
    optimizer = Adam(model.get_parameters(lr=args.lr))
    lr_scheduler = MultiStepLR(optimizer, args.lr_step, args.lr_factor)

    # optionally resume from a checkpoint
    start_epoch = 0
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        start_epoch = checkpoint['epoch'] + 1

    # define visualization function (undoes the normalization above so the
    # saved images are viewable)
    tensor_to_image = Compose([
        Denormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ToPILImage()
    ])

    def visualize(image, keypoint2d, name):
        """
        Args:
            image (tensor): image in shape 3 x H x W
            keypoint2d (tensor): keypoints in shape K x 2
            name: name of the saving image
        """
        train_source_dataset.visualize(
            tensor_to_image(image), keypoint2d,
            logger.get_image_path("{}.jpg".format(name)))

    if args.phase == 'test':
        # evaluate on validation set and return without training
        source_val_acc = validate(val_source_loader, model, criterion, None,
                                  args)
        target_val_acc = validate(val_target_loader, model, criterion,
                                  visualize, args)
        print("Source: {:4.3f} Target: {:4.3f}".format(source_val_acc['all'],
                                                       target_val_acc['all']))
        for name, acc in target_val_acc.items():
            print("{}: {:4.3f}".format(name, acc))
        return

    # start training
    best_acc = 0
    for epoch in range(start_epoch, args.epochs):
        logger.set_epoch(epoch)
        # NOTE(review): stepping the scheduler before training follows the
        # pre-1.1 PyTorch convention; recent PyTorch expects step() after the
        # epoch's optimizer steps — confirm this ordering is intended.
        lr_scheduler.step()

        # train for one epoch
        train(train_source_iter, train_target_iter, model, criterion,
              optimizer, epoch, visualize if args.debug else None, args)

        # evaluate on validation set
        source_val_acc = validate(val_source_loader, model, criterion, None,
                                  args)
        target_val_acc = validate(val_target_loader, model, criterion,
                                  visualize if args.debug else None, args)

        # remember best acc and save checkpoint; the per-epoch checkpoint is
        # always written, and copied to 'best' when target accuracy improves.
        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args
            }, logger.get_checkpoint_path(epoch))
        if target_val_acc['all'] > best_acc:
            shutil.copy(logger.get_checkpoint_path(epoch),
                        logger.get_checkpoint_path('best'))
            best_acc = target_val_acc['all']
        print("Source: {:4.3f} Target: {:4.3f} Target(best): {:4.3f}".format(
            source_val_acc['all'], target_val_acc['all'], best_acc))
        for name, acc in target_val_acc.items():
            print("{}: {:4.3f}".format(name, acc))

    logger.close()
Example #19
0
def main_worker(args):
    """Train an ImageNet-style classifier with ``nn.DataParallel``.

    Creates the model (optionally from pretrained weights), builds the
    standard ImageNet train/val pipelines, trains for
    ``args.epochs - args.start_epoch`` epochs with SGD + MultiStepLR, logs to
    a text logger and TensorBoard, and checkpoints every epoch. Updates the
    module-level ``best_acc1`` / ``best_acc1_index`` trackers.
    """
    global best_acc1
    global best_acc1_index
    # os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

    # All outputs (settings, logs, TensorBoard events) go under one
    # architecture-suffixed directory.
    args.outpath = args.outpath + '_' + args.arch
    output_process(args.outpath)
    write_settings(args)
    logger = get_logger(args.outpath, 'DataParallel')
    writer = SummaryWriter(args.outpath)
    logger.info(args)

    # create model (architecture looked up by name in the models module)
    if args.pretrained:
        logger.info("=> using pre-trained model: {}".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        logger.info('=> creating model: {}'.format(args.arch))
        model = models.__dict__[args.arch]()

    model = nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # Only the step scheduler is supported; anything else aborts.
    if args.lr_scheduler == 'steplr':
        lr_scheduler = MultiStepLR(optimizer, milestones=args.step, gamma=args.gamma)
        logger.info('lr_scheduler: SGD MultiStepLR !!!')
    else:
        assert False, logger.info("invalid lr_scheduler={}".format(args.lr_scheduler))
    # logger.info('lr_scheduler={}'.format(lr_scheduler))

    # dataloader (standard ImageNet augmentation and normalization)
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(traindir, transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True,
                              num_workers=args.workers, pin_memory=True)

    val_dataset = datasets.ImageFolder(valdir, transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ]))
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False,
                              num_workers=args.workers, pin_memory=True)

    # Evaluation-only shortcut: run validation once and exit.
    if args.evaluate:
        validate(val_loader, model, criterion, args, logger, writer, epoch=-1)
        return 0

    total_start = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        epoch_start = time.time()
        # NOTE(review): passing `epoch` to step() is the deprecated pre-1.1
        # PyTorch API, and stepping before training follows the old
        # convention — confirm this matches the installed torch version.
        lr_scheduler.step(epoch)

        # train for every epoch
        train(train_loader, model, criterion, optimizer, epoch, args, logger, writer)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, logger, writer, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        if is_best:
            best_acc1_index = epoch
            best_acc1 = acc1

        epoch_end = time.time()
        logger.info('||==> Epoch=[{:d}/{:d}]\tbest_acc1={:.4f}\tbest_acc1_index={}\ttime_cost={:.4f}s'
                    .format(epoch, args.epochs, best_acc1, best_acc1_index, epoch_end - epoch_start))

        # save model (unwrap the DataParallel module so the checkpoint loads
        # without a DataParallel wrapper)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.module.state_dict(),
                'best_acc1': best_acc1,
            }, is_best, args.outpath)

    total_end = time.time()
    logger.info('||==> total_time_cost={:.4f}s'.format(total_end - total_start))
    writer.close()
Example #20
0
def main():
    """Train an image classifier on an ImageFolder dataset, optionally with
    AugMix consistency regularization.

    Parses the module-level ``parser``, builds an AugMix-wrapped training
    loader over train+val data, trains ``args.epochs`` epochs of SGD with a
    MultiStepLR schedule, writes per-epoch TensorBoard scalars, and saves a
    checkpoint after every epoch.

    Raises:
        ValueError: if ``args.model_name`` names an unknown architecture.
            (Previously the model was left as ``None`` and training crashed
            later with an opaque ``AttributeError``.)
    """
    args = parser.parse_args()
    model_name = args.name
    writer = SummaryWriter(pathlib.Path(args.eventdir) / model_name)

    # Discover the dataset on disk.
    data_dir = pathlib.Path(args.datadir)
    image_count = len(list(data_dir.glob('**/*.JPEG')))
    print('Discovered {} images'.format(image_count))

    # Create the training data generator.
    batch_size = args.batch_size
    num_epochs = args.epochs
    # Tiny-ImageNet channel statistics.
    mean = [0.4802, 0.4481, 0.3975]
    std = [0.2296, 0.2263, 0.2255]

    # Geometric augmentation runs before AugMix; tensor conversion and
    # normalization run afterwards, inside AugMixDataset.
    train_transform = transforms.Compose(
      [transforms.RandomHorizontalFlip()])
    preprocess = transforms.Compose(
      [transforms.ToTensor(),
       transforms.Normalize(mean, std)])

    train_set = torchvision.datasets.ImageFolder(data_dir / 'train', train_transform)
    val_set = Val_Dataset(train_transform, train_set.class_to_idx)
    full_train_set = torch.utils.data.ConcatDataset([train_set, val_set])

    am_train_set = AugMixDataset(full_train_set, preprocess,
                                 augmentations.augmentations_all)
    train_loader = torch.utils.data.DataLoader(am_train_set, batch_size=batch_size,
                                               shuffle=True, num_workers=4,
                                               pin_memory=True)

    lr = args.lr
    mom = args.momentum
    wd = args.weight_decay

    # Model factory: fail fast on an unknown name.
    if args.model_name == 'vgg16_slim':
        model = vgg_slim.vgg16_slim().cuda()
    elif args.model_name == 'vgg16':
        model = vgg_slim.vgg16().cuda()
    elif args.model_name == 'efficientnet-b0':
        model = EfficientNet.from_name('efficientnet-b0').cuda()
    elif args.model_name == 'resnet-18':
        model = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=False).cuda()
    else:
        raise ValueError("Unknown model name: {}".format(args.model_name))

    # Named `optimizer` (not `optim`) to avoid shadowing the torch.optim
    # module name used elsewhere in this file.
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=mom, weight_decay=wd)

    # !!!!!!!!!!!! TODO: put the right milestones in here: !!!!!!!!!!!!
    epoch_milestones = []  # Sets the epochs at which the LR will decay by a factor of 0.1

    sched = MultiStepLR(optimizer, epoch_milestones)

    print(get_n_params(model))
    criterion = nn.CrossEntropyLoss()
    for i in range(num_epochs):
        model.train()

        train_total, train_correct = 0, 0
        running_loss = 0.0
        for idx, (inputs, targets) in enumerate(train_loader):
            # AugMix batches arrive as a tuple [clean, aug1, aug2]; remember
            # the clean split size so the logits can be separated again.
            split_s = inputs[0].size(0)
            if args.augmix_off:
                inputs = inputs[0].cuda()
            else:
                inputs = torch.cat(inputs, 0).cuda()
            targets = targets.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            if args.augmix_off:
                logits_clean = outputs
            else:
                logits_clean, logits_aug1, logits_aug2 = torch.split(outputs, split_s)
            # Supervised loss is computed on the clean images only.
            loss = criterion(logits_clean, targets)
            if not args.augmix_off:
                # Jensen-Shannon consistency between the clean and the two
                # augmented views (AugMix objective).
                p_clean = F.softmax(logits_clean, dim=1)
                p_aug1 = F.softmax(logits_aug1, dim=1)
                p_aug2 = F.softmax(logits_aug2, dim=1)
                p_mixture = torch.clamp((p_clean + p_aug1 + p_aug2) / 3., 1e-7, 1).log()
                loss += 12 * (F.kl_div(p_mixture, p_clean, reduction='batchmean') +
                              F.kl_div(p_mixture, p_aug1, reduction='batchmean') +
                              F.kl_div(p_mixture, p_aug2, reduction='batchmean')) / 3
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
            # Training accuracy is tracked on the clean logits.
            _, predicted = logits_clean.max(1)
            train_total += targets.size(0)
            train_correct += predicted.eq(targets).sum().item()
            print("\r", end='')
            print(f'training {100 * idx / len(train_loader):.2f}%: {train_correct / train_total:.3f}', end='')

        # Checkpoint after every epoch.
        torch.save({
            'net': model.state_dict(),
        }, args.savedir + '/' + model_name + '_e' + str(i) + '.pt')

        writer.add_scalar('Train Accuracy', float(train_correct) / float(train_total), i)
        writer.add_scalar('Train Loss', running_loss, i)

        sched.step()

    writer.close()
Example #21
0
                    print('Validation loss of last epoch: %f' % (Validation_loss[-1]))

                recon_loss_sum, kl_loss_sum = 0, 0

            qsms = (qsms.to(device, dtype=torch.float) + trans) * scale
            masks = masks.to(device, dtype=torch.float)
            qsms = qsms * masks

            recon_loss, kl_loss = vae_train(model=vae3d, optimizer=optimizer, x=qsms, mask=masks)
            recon_loss_sum += recon_loss
            kl_loss_sum += kl_loss
            gen_iterations += 1

            time.sleep(1)

        scheduler.step(epoch)

        # validation phase
        vae3d.eval()
        loss_total = 0
        idx = 0
        with torch.no_grad():  # to solve memory exploration issue
            for idx, (rdfs, masks, weights, qsms) in enumerate(valLoader):
                idx += 1
                qsms = (qsms.to(device, dtype=torch.float) + trans) * scale
                masks = masks.to(device, dtype=torch.float)
                qsms = qsms * masks
                
                x_mu, x_var, z_mu, z_logvar = vae3d(qsms)
                x_factor = torch.prod(torch.tensor(x_mu.size()))
                z_factor = torch.prod(torch.tensor(z_mu.size()))    
Example #22
0
def main_worker(gpu, ngpus_per_node, cfg):
    """Per-process worker that trains an image-captioning pipeline (CNN
    backbone + linear/bn projection + RNN decoder) on COCO captions.

    Supports single-GPU and DistributedDataParallel execution depending on
    ``cfg.distributed``. Optionally resumes the backbone from a checkpoint,
    then runs ``train_ic`` for each epoch with a MultiStepLR schedule.

    Args:
        gpu: GPU index for this worker (may be None).
        ngpus_per_node: number of GPUs per node (unused in the visible body).
        cfg: configuration object; fields read include distributed, dist_url,
            dist_backend, world_size, rank, task, vocab_path, crop_size,
            image_dir, caption_path, batch_size, num_workers, model, lr,
            loading, checkpoint, config and num_epochs.
    """
    if cfg.gpu is not None:
        print("Use GPU: {} for training".format(cfg.gpu))

    if cfg.distributed:
        print('init distributing process')
        # "env://" init reads RANK from the environment when not given.
        if cfg.dist_url == "env://" and cfg.rank == -1:
            cfg.rank = int(os.environ["RANK"])
        dist.init_process_group(backend=cfg.dist_backend,
                                init_method=cfg.dist_url,
                                world_size=cfg.world_size,
                                rank=cfg.rank)

    # Data
    print('==> Preparing data..')
    # Load vocabulary wrapper for image caption
    with open(cfg.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # Image preprocessing, normalization for the pretrained resnet.
    # NOTE(review): if cfg.task matches none of the three branches below,
    # `transform` is never defined and the CocoDataset call raises
    # NameError — confirm cfg.task is validated upstream.
    # cifar cls, use resized 36x36 image
    if cfg.task == 'cifar_cls':
        transform = transforms.Compose([
            transforms.RandomCrop(cfg.crop_size, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    # imagenet cls, 224x224
    # same as MoCo v1's aug: the same as InstDisc https://arxiv.org/abs/1805.01978
    if cfg.task == 'imagenet_cls':
        transform = transforms.Compose([
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    # coco det, 1333x800
    # same as MoCo v1's aug: the same as InstDisc https://arxiv.org/abs/1805.01978
    if cfg.task == 'coco_det':
        transform = transforms.Compose([
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    # COCO caption dataset
    coco = CocoDataset(root=cfg.image_dir,
                       json=cfg.caption_path,
                       vocab=vocab,
                       transform=transform)
    #Build data loader for image caption training
    if cfg.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(coco)
    else:
        train_sampler = None

    # Data loader for COCO dataset
    # This will return (images, captions, lengths) for each iteration.
    # images: a tensor of shape (batch_size, 3, 224, 224).
    # captions: a tensor of shape (batch_size, padded_length).
    # lengths: a list indicating valid length for each caption. length is (batch_size).
    data_loader = torch.utils.data.DataLoader(dataset=coco,
                                              batch_size=cfg.batch_size,
                                              shuffle=(train_sampler is None),
                                              num_workers=cfg.num_workers,
                                              collate_fn=collate_fn,
                                              pin_memory=True,
                                              sampler=train_sampler)

    # Build the Decoder models
    decoder = DecoderRNN(cfg.model['embed_size'], cfg.model['hidden_size'],
                         len(vocab), cfg.model['num_layers'])

    # Backbone selection: each branch pairs a CNN with a projection layer
    # sized to that CNN's feature width, plus a batch norm.
    # NOTE(review): like `transform` above, an unmatched cfg.model['net']
    # leaves linear_ic/bn_ic/net undefined — confirm validated upstream.
    if cfg.model['net'] == 'densenet121':
        linear_ic = nn.Linear(1024, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = DenseNet121()

    if cfg.model['net'] == 'densenet169':
        linear_ic = nn.Linear(4096, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = DenseNet169()

    if cfg.model['net'] == 'resnet34':
        linear_ic = nn.Linear(512, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = ResNet34()

    if cfg.model['net'] == 'resnet50':
        linear_ic = nn.Linear(2048, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = ResNet50()

    if cfg.model['net'] == 'resnet101':
        linear_ic = nn.Linear(2048, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = ResNet101()

    print('cfg.distributed:', cfg.distributed)
    if cfg.distributed:
        linear_ic.cuda()
        bn_ic.cuda()
        net.cuda()
        decoder.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        linear_ic = torch.nn.parallel.DistributedDataParallel(linear_ic)
        bn_ic = torch.nn.parallel.DistributedDataParallel(bn_ic)
        net = torch.nn.parallel.DistributedDataParallel(net)
        decoder = torch.nn.parallel.DistributedDataParallel(decoder)
    else:
        # NOTE(review): `device` is not defined in this function — presumably
        # a module-level global; confirm it matches cfg.gpu.
        torch.cuda.set_device(device)
        linear_ic.cuda(cfg.gpu)
        bn_ic.cuda(cfg.gpu)
        net.cuda(cfg.gpu)
        decoder.cuda(cfg.gpu)

    criterion = nn.CrossEntropyLoss()
    # Optimizer for image classificaation
    # optimizer = optim.Adam(list(net.parameters()), lr=cfg.lr)

    # One Adam optimizer jointly updates the backbone, projection, decoder
    # and batch-norm parameters.
    optimizer_ic = optim.Adam(
        list(net.parameters()) + list(linear_ic.parameters()) +
        list(decoder.parameters()) + list(bn_ic.parameters()),
        lr=cfg.lr)  #0.0001
    scheduler = MultiStepLR(optimizer_ic, milestones=[60, 120, 160], gamma=0.1)

    if cfg.loading:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        # assert os.path.isdir(cfg.checkpoint), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(cfg.checkpoint)
        net.load_state_dict(checkpoint)
        # best_acc = checkpoint['acc']
        # Start epoch is parsed from the checkpoint filename
        # (e.g. ".../name-<epoch>-...").
        start_epoch = int(cfg.checkpoint.split('/')[-1].split('-')[1])
    else:
        start_epoch = 0

    #scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_ic, T_max=200)
    # TensorBoard log directory is derived from the config file name.
    log_dir = 'log/' + cfg.config.split('/')[1][:-3]
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    writer = SummaryWriter(log_dir=log_dir)
    #start training
    for epoch in range(start_epoch, cfg.num_epochs):
        if cfg.distributed:
            # Reshuffle the distributed sampler each epoch.
            train_sampler.set_epoch(epoch)
        net = train_ic(epoch,
                       cfg,
                       net=net,
                       decoder=decoder,
                       linear=linear_ic,
                       bn=bn_ic,
                       optimizer_ic=optimizer_ic,
                       criterion=criterion,
                       data_loader=data_loader,
                       writer=writer)
        scheduler.step()
Example #23
0
def train(model,
          train_dataset,
          test_dataset=None,
          model_dir='models',
          lr=1e-04,
          lr_decay=.1,
          lr_decay_epochs=None,
          weight_decay=1e-04,
          gamma1=1.,
          gamma2=1.,
          gamma3=10.,
          batch_size=32,
          test_size=256,
          epochs=5,
          eval_log_interval=30,
          loss_log_interval=30,
          weight_log_interval=500,
          checkpoint_interval=500,
          resume_best=False,
          resume_latest=False,
          cuda=False):
    """Train a split-regularized residual network with cross-entropy plus
    overlap/uniform/split regularization losses, streaming metrics and
    weight visualizations to a visdom server.

    NOTE(review): this function uses the pre-0.4 PyTorch API (`Variable`,
    `tensor.data[0]`) and so only runs on legacy torch versions — confirm
    before restyling or porting.

    Args:
        model: network exposing ``reg_loss()``, ``residual_block_groups``,
            ``split_sizes``, ``fc`` and ``name`` (used as the visdom env).
        train_dataset / test_dataset: datasets; test falls back to train for
            checkpoint-time validation when absent.
        model_dir: directory for checkpoints.
        lr, lr_decay, lr_decay_epochs, weight_decay: Adam + MultiStepLR setup.
        gamma1, gamma2, gamma3: weights for the overlap, split and uniform
            regularization terms respectively.
        batch_size, test_size, epochs: loop sizing.
        *_log_interval, checkpoint_interval: iteration periods for visdom
            logging and checkpointing.
        resume_best / resume_latest: resume from a saved checkpoint.
        cuda: move data to GPU when True.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=lr,
                           weight_decay=weight_decay)
    scheduler = MultiStepLR(optimizer, lr_decay_epochs, gamma=lr_decay)

    # prepare the model and statistics.
    model.train()
    epoch_start = 1
    best_precision = 0

    # load checkpoint if needed.
    if resume_latest or resume_best:
        epoch_start, best_precision = utils.load_checkpoint(model,
                                                            model_dir,
                                                            best=resume_best)

    for epoch in range(epoch_start, epochs + 1):
        # adjust learning rate if needed. (Legacy API: step(epoch) was valid
        # before PyTorch 1.1; epochs here are 1-based, hence epoch - 1.)
        scheduler.step(epoch - 1)

        # prepare a data stream for the epoch.
        data_loader = utils.get_data_loader(train_dataset,
                                            batch_size,
                                            cuda=cuda)
        data_stream = tqdm(enumerate(data_loader, 1))

        for batch_index, (data, labels) in data_stream:
            # where are we?
            data_size = len(data)
            dataset_size = len(data_loader.dataset)
            dataset_batches = len(data_loader)
            # Global iteration counter across epochs (used for log periods).
            iteration = ((epoch - 1) *
                         (len(data_loader.dataset) // batch_size) +
                         batch_index + 1)

            # clear the gradients.
            optimizer.zero_grad()

            # run the network.
            x = Variable(data).cuda() if cuda else Variable(data)
            labels = Variable(labels).cuda() if cuda else Variable(labels)
            scores = model(x)
            _, predicted = scores.max(1)
            # `.data[0]` extracts a Python scalar on pre-0.4 torch.
            precision = (labels == predicted).sum().data[0] / data_size

            # update the network: cross entropy plus the three weighted
            # regularization terms from the model.
            cross_entropy_loss = criterion(scores, labels)
            overlap_loss, uniform_loss, split_loss = model.reg_loss()
            overlap_loss *= gamma1
            uniform_loss *= gamma3
            split_loss *= gamma2
            reg_loss = overlap_loss + uniform_loss + split_loss

            total_loss = cross_entropy_loss + reg_loss
            total_loss.backward(retain_graph=True)
            optimizer.step()

            # update & display statistics.
            data_stream.set_description(
                ('epoch: {epoch}/{epochs} | '
                 'it: {iteration} | '
                 'progress: [{trained}/{total}] ({progress:.0f}%) | '
                 'prec: {prec:.3} | '
                 'loss => '
                 'ce: {ce_loss:.4} / '
                 'reg: {reg_loss:.4} / '
                 'total: {total_loss:.4}').format(
                     epoch=epoch,
                     epochs=epochs,
                     iteration=iteration,
                     trained=(batch_index + 1) * batch_size,
                     total=dataset_size,
                     progress=(100. * (batch_index + 1) / dataset_batches),
                     prec=precision,
                     ce_loss=(cross_entropy_loss.data[0] / data_size),
                     reg_loss=(reg_loss.data[0] / data_size),
                     total_loss=(total_loss.data[0] / data_size),
                 ))

            # Send test precision to the visdom server.
            if iteration % eval_log_interval == 0:
                visual.visualize_scalar(utils.validate(model,
                                                       test_dataset,
                                                       test_size=test_size,
                                                       cuda=cuda,
                                                       verbose=False),
                                        'precision',
                                        iteration,
                                        env=model.name)

            # Send losses to the visdom server.
            if iteration % loss_log_interval == 0:
                reg_losses_and_names = ([
                    overlap_loss.data / data_size,
                    uniform_loss.data / data_size,
                    split_loss.data / data_size,
                    reg_loss.data / data_size,
                ], ['overlap', 'uniform', 'split', 'total'])

                visual.visualize_scalar(overlap_loss.data / data_size,
                                        'overlap loss',
                                        iteration,
                                        env=model.name)
                visual.visualize_scalar(uniform_loss.data / data_size,
                                        'uniform loss',
                                        iteration,
                                        env=model.name)
                visual.visualize_scalar(split_loss.data / data_size,
                                        'split loss',
                                        iteration,
                                        env=model.name)
                visual.visualize_scalars(*reg_losses_and_names,
                                         'regulaization losses',
                                         iteration,
                                         env=model.name)

                model_losses_and_names = ([
                    cross_entropy_loss.data / data_size,
                    reg_loss.data / data_size,
                    total_loss.data / data_size,
                ], ['cross entropy', 'regularization', 'total'])

                visual.visualize_scalar(cross_entropy_loss.data / data_size,
                                        'cross entropy loss',
                                        iteration,
                                        env=model.name)

                visual.visualize_scalar(reg_loss.data / data_size,
                                        'regularization loss',
                                        iteration,
                                        env=model.name)

                visual.visualize_scalars(*model_losses_and_names,
                                         'model losses',
                                         iteration,
                                         env=model.name)

            if iteration % weight_log_interval == 0:
                # Send visualized weights to the visdom server.
                # Only blocks in the last (len(split_sizes) - 1) groups carry
                # split indicators; earlier groups are filtered out.
                weights = [
                    (w.data, p, q)
                    for i, g in enumerate(model.residual_block_groups)
                    for b in g.residual_blocks for w, p, q in (
                        (b.w1, b.p(), b.r()),
                        (b.w2, b.r(), b.q()),
                        (b.w3, b.p(), b.q()),
                    )
                    if i + 1 > (len(model.residual_block_groups) -
                                (len(model.split_sizes) - 1)) and w is not None
                ] + [(model.fc.linear.weight.data, model.fc.p(), model.fc.q())]

                names = [
                    'g{i}-b{j}-w{k}'.format(i=i + 1, j=j + 1, k=k + 1)
                    for i, g in enumerate(model.residual_block_groups)
                    for j, b in enumerate(g.residual_blocks)
                    for k, w in enumerate((b.w1, b.w2, b.w3))
                    if i + 1 > (len(model.residual_block_groups) -
                                (len(model.split_sizes) - 1)) and w is not None
                ] + ['fc-w']

                for (w, p, q), name in zip(weights, names):
                    visual.visualize_kernel(
                        splits.block_diagonalize_kernel(w, p, q),
                        name,
                        label='epoch{}-{}'.format(epoch, batch_index + 1),
                        update_window_without_label=True,
                        env=model.name,
                    )

                # Send visualized split indicators to the visdom server.
                indicators = [
                    q.data for i, g in enumerate(model.residual_block_groups)
                    for j, b in enumerate(g.residual_blocks)
                    for k, q in enumerate((b.p(), b.r())) if q is not None
                ] + [model.fc.p().data, model.fc.q().data]

                names = [
                    'g{i}-b{j}-{indicator}'.format(
                        i=i + 1, j=j + 1, indicator=ind)
                    for i, g in enumerate(model.residual_block_groups)
                    for j, b in enumerate(g.residual_blocks)
                    for ind, q in zip(('p', 'r'), (b.p(), b.r()))
                    if q is not None
                ] + ['fc-p', 'fc-q']

                for q, name in zip(indicators, names):
                    # Stretch the split indicators before visualization.
                    q_diagonalized = splits.block_diagonalize_indacator(q)
                    q_diagonalized_expanded = q_diagonalized\
                        .view(*q.size(), 1)\
                        .repeat(1, 20, 1)\
                        .view(-1, q.size()[1])

                    visual.visualize_kernel(q_diagonalized_expanded,
                                            name,
                                            label='epoch{}-{}'.format(
                                                epoch, batch_index + 1),
                                            update_window_without_label=True,
                                            env=model.name,
                                            w=100,
                                            h=100)

            if iteration % checkpoint_interval == 0:
                # notify that we've reached to a new checkpoint.
                print()
                print()
                print('#############')
                print('# checkpoint!')
                print('#############')
                print()

                # test the model.
                model_precision = utils.validate(model,
                                                 test_dataset or train_dataset,
                                                 test_size=test_size,
                                                 cuda=cuda,
                                                 verbose=True)

                # update best precision if needed.
                is_best = model_precision > best_precision
                best_precision = max(model_precision, best_precision)

                # save the checkpoint.
                utils.save_checkpoint(model,
                                      model_dir,
                                      epoch,
                                      model_precision,
                                      best=is_best)
                print()
Example #24
0
                img_mode='NHWC')
            val_loss, val_accuracy1, = TrainMethod.test(model,
                                                        val,
                                                        512 // args.batch_size,
                                                        criterion,
                                                        device,
                                                        dtype,
                                                        img_mode='NHWC')

            val_writer.add_scalar('epoch_accuracy', val_accuracy1, epoch)
            val_writer.add_scalar('epoch_loss', val_loss, epoch)
            train_writer.add_scalar('epoch_loss', train_loss, epoch)
            train_writer.add_scalar('epoch_accuracy', train_accuracy1, epoch)

            if isinstance(scheduler, ReduceLROnPlateau):
                scheduler.step(val_loss)
            elif isinstance(scheduler, MultiStepLR):
                scheduler.step()

            if val_accuracy1 > best_test:
                best_test = val_accuracy1
            # TODO:没输入进去?
            # logger.info('epoch:{}, val_loss:{:.5f}, val_accuracy:{:.5f}'.format(epoch+1, val_loss, val_accuracy1))
            save_checkpoint(
                is_best=False,
                filepath=args.save,
                filename='{}-epoch{}-val_loss{:.4f}.pth'.format(
                    args.model_name, epoch, val_loss),
                state={
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
Example #25
0
def main():
    """Train the 3D face reconstruction / landmark model.

    Freezes the stem of the backbone ResNet, then optimizes the remaining
    parameters with RMSprop + MultiStepLR.  Each iteration regresses 3DMM
    coefficients, renders the face with pytorch3d, and minimizes a weighted
    sum of landmark, coefficient-regularization, skin-mask and image-level
    (photometric) losses.  Every 100 iterations the running averages are
    printed and debug renders are dumped; every 500 iterations a weight
    snapshot is saved.

    Relies on module-level globals: ``device``, ``cameras``, ``renderer``,
    ``mean``/``std``, ``batch_size``, ``image_size``, ``CROP_SIZE``,
    ``MAX_EPOCH``, ``LEARNING_RATE``, ``MILESTONES``, ``SUFFIX`` and
    ``WRITE_SNAPSHOT_PATH``.
    """
    setup_information()
    landmarks_model = _model_init()
    train_loader = get_dataset()

    # Freeze the backbone stem (conv1 / bn1 / layer1); only deeper layers
    # and the regression head are trained.
    for p in landmarks_model.resnet.conv1.parameters():
        p.requires_grad = False
    for p in landmarks_model.resnet.bn1.parameters():
        p.requires_grad = False
    for p in landmarks_model.resnet.layer1.parameters():
        p.requires_grad = False

    # Collect only the parameters that remain trainable.
    params = [p for p in landmarks_model.parameters() if p.requires_grad]
    optimizer = optim.RMSprop(params, lr=LEARNING_RATE)
    scheduler = MultiStepLR(optimizer, milestones=MILESTONES, gamma=0.2)

    # Running averages for logging; reset every 100 iterations below.
    landmark_avg_loss = AverageMeter('landmark_avg_loss', ':.4e')
    coeff_avg_loss = AverageMeter('coeff_avg_loss', ':.4e')
    pixel_avg_loss = AverageMeter('pixel_avg_loss', ':.4e')
    skin_mask_avg_loss = AverageMeter('skin_mask_loss', ':.4e')
    avg_loss = AverageMeter('loss', ':.4e')

    # Directory for periodic debug image dumps (created once, up front).
    target_images_dir = "debug_images_dir"
    if not os.path.exists(target_images_dir):
        os.makedirs(target_images_dir)

    for epoch in range(MAX_EPOCH):
        # NOTE: stepping the scheduler at the start of the epoch (before any
        # optimizer.step()) is the pre-PyTorch-1.1 convention; kept as-is so
        # the effective LR schedule is unchanged.
        scheduler.step()
        count = 0
        for i_batch, _sample_batched in enumerate(train_loader):
            _inputs = _sample_batched[0]  # batch x 3 x 224 x 224, bgr, [0, 255]
            _bgr_images = _sample_batched[1].float()  # batch x 224 x 224 x 3, bgr, [0, 255]
            _mask_images = _sample_batched[2].ge(200)  # binary face/skin mask
            _gt_pts = _sample_batched[3]  # ground-truth landmark points

            # Normalize the network input and rescale landmarks from the
            # crop resolution to the render resolution.
            _inputs = _inputs / 255.
            _inputs = _inputs.sub_(mean[:, None, None]).div_(std[:, None,
                                                                 None])
            _gt_pts = _gt_pts / CROP_SIZE * image_size
            _gt_pts = _gt_pts.view(batch_size, -1, 3)

            _inputs = _inputs.to(device)
            _bgr_images = _bgr_images.to(device)
            _mask_images = _mask_images.to(device)
            _gt_pts = _gt_pts.to(device)
            _render_images = _bgr_images.clone()

            coeff = landmarks_model(_inputs)

            # ----- 3DMM decoding and rendering -----
            id_coeff, ex_coeff, tex_coeff, angles, gamma, translation = landmarks_model.Split_coeff(
                coeff)
            coeff_loss = landmarks_model.get_coeff_loss(
                id_coeff, ex_coeff, tex_coeff)

            face_shape = landmarks_model.Shape_formation(
                id_coeff, ex_coeff, batch_size)
            face_norm = landmarks_model.Compute_norm(face_shape, batch_size)
            rotation = landmarks_model.Compute_rotation_matrix(angles)
            face_shape = torch.matmul(face_shape, rotation)  # rotate vertices
            face_shape = face_shape + translation.view(-1, 1, 3).repeat(
                1,
                face_shape.size()[1], 1)
            norm_r = torch.matmul(face_norm, rotation)
            face_texture = landmarks_model.Texture_formation(
                tex_coeff, batch_size)
            face_color, _ = landmarks_model.Illumination_layer(
                face_texture, norm_r, gamma)

            # Convert to pytorch3d's Textures format.
            face_color = Textures(verts_rgb=face_color.to(device))
            skin_mask_color = face_texture[:, landmarks_model.skinmask, :]
            skin_mask_loss = landmarks_model.get_skin_mask_loss(
                skin_mask_color)

            mesh = Meshes(face_shape.to(device), landmarks_model.face_index,
                          face_color)

            # ----- landmark loss -----
            transformed_face_shape = cameras.transform_points(face_shape)
            landmarks = transformed_face_shape[:, landmarks_model.facemodel.
                                               keypoints, :]
            landmarks = ((landmarks + 1) * image_size - 1) / 2.
            # Both x and y must be flipped to match image coordinates.
            landmarks[:, :, :2] = image_size - landmarks[:, :, :2]
            landmark_loss = landmarks_model.get_landmark_loss(
                _gt_pts[:, :, :2], landmarks[:, :, :2])

            # ----- photometric (image-level) loss on rendered pixels -----
            images = renderer(mesh)
            images = images[:, :, :, :3]  # drop the alpha channel
            images = images[:, :, :, [2, 1, 0]]  # rgb to bgr
            index = (images > 0)  # composite only pixels the renderer covered
            _render_images[index] = images[index]

            image_leve_loss = landmarks_model.get_image_level_loss(
                _render_images, _bgr_images, _mask_images)

            # Weighted total loss (coeff and skin-mask terms weighted 1.0).
            landmark_loss = 0.5 * landmark_loss
            image_leve_loss = 0.1 * image_leve_loss
            loss = image_leve_loss + coeff_loss + skin_mask_loss + landmark_loss

            avg_loss.update(loss.detach().item())
            landmark_avg_loss.update(landmark_loss.detach().item())
            skin_mask_avg_loss.update(skin_mask_loss.detach().item())
            coeff_avg_loss.update(coeff_loss.detach().item())
            pixel_avg_loss.update(image_leve_loss.detach().item())

            if count % 100 == 0:
                print('Iter: [%d, %5d]' % (epoch, i_batch))
                print(' Iter: [%d, %5d]' % (epoch, i_batch) +
                      ' landmark_loss' + ': %.3e' % landmark_avg_loss.avg)
                print(' Iter: [%d, %5d]' % (epoch, i_batch) + ' coeff_loss' +
                      ': %.3e' % coeff_avg_loss.avg)
                print(' Iter: [%d, %5d]' % (epoch, i_batch) +
                      ' skin_mask_loss' + ': %.3e' % skin_mask_avg_loss.avg)
                print(' Iter: [%d, %5d]' % (epoch, i_batch) +
                      ' image_leve_loss' + ': %.3e' % pixel_avg_loss.avg)
                print(' Iter: [%d, %5d]' % (epoch, i_batch) + ' loss' +
                      ': %.3e' % avg_loss.avg)
                print('\n')
                # Dump side-by-side (render | input) debug images.
                _render_images = _render_images.cpu().detach().numpy()
                _bgr_images = _bgr_images.cpu().detach().numpy()
                for i in range(batch_size):
                    a_image = _render_images[i]
                    b_image = _bgr_images[i]
                    c_image = np.concatenate((a_image, b_image), axis=1)
                    a_target_image_path = os.path.join(target_images_dir,
                                                       str(i) + '.jpg')
                    cv2.imwrite(a_target_image_path, c_image)

                # Reset the running meters so each 100-iteration window
                # reports a local average.  BUG FIX: the old code overwrote
                # the loss tensor variable `skin_mask_loss` instead of
                # resetting `skin_mask_avg_loss`, and never reset
                # `coeff_avg_loss`, so those two printed averages spanned
                # the whole run.
                landmark_avg_loss = AverageMeter('landmark_avg_loss', ':.4e')
                skin_mask_avg_loss = AverageMeter('skin_mask_loss', ':.4e')
                coeff_avg_loss = AverageMeter('coeff_avg_loss', ':.4e')
                pixel_avg_loss = AverageMeter('pixel_avg_loss', ':.4e')
                avg_loss = AverageMeter('loss', ':.4e')

            if count % 500 == 0:
                # Periodic weight snapshot.
                a_save_name = "_".join([
                    SUFFIX, 'iter', 'epoch',
                    '%d' % epoch, 'i_batch',
                    '%d' % i_batch
                ]) + '.pth'
                a_save_path = os.path.join(WRITE_SNAPSHOT_PATH, a_save_name)
                torch.save(landmarks_model.state_dict(), a_save_path)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            count += 1
Example #26
0
def main():
    """Build loaders, model and optimizer, run the full train/evaluate
    loop with best + periodic checkpointing, and return the trained model
    together with the best test accuracy observed."""
    print("***** Running training *****")
    print(f"  Task = {args.dataset}")
    print(f"  Num Epochs = {args.epochs}")
    print(f"  Total train batch size = {args.train_batch}")

    trainset, testset, transform_train, transform_test, num_classes = init_data(
    )
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.train_batch,
                                              shuffle=True,
                                              num_workers=args.workers,
                                              drop_last=True,
                                              pin_memory=True)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=args.test_batch,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             drop_last=False,
                                             pin_memory=True)

    model = gen_model(args.model, args.depth, args.widen_factor, num_classes,
                      '', False, 0.1).cuda()
    print('Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # Split parameters: batch-norm weights and all biases are exempt from
    # weight decay; everything else decays.
    wd_params, non_wd_params = [], []
    for name, param in model.named_parameters():
        bucket = non_wd_params if ('bn' in name or 'bias' in name) else wd_params
        bucket.append(param)
    param_list = [
        {'params': wd_params, 'weight_decay': args.weight_decay},
        {'params': non_wd_params, 'weight_decay': 0},
    ]
    optimizer = optim.SGD(param_list,
                          lr=args.lr,
                          momentum=args.momentum,
                          nesterov=args.nesterov)
    scheduler = MultiStepLR(optimizer, args.decay_step, args.lr_decay)

    # Train from scratch unless resuming weights from a checkpoint.
    best_acc = 0
    start_epoch = 0
    title = args.task_name
    log_names = [
        'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.', 'Train Top5',
        'Valid Top5'
    ]
    if args.resume:
        # Only model weights are checkpointed (no optimizer/scheduler state).
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        model.load_state_dict(torch.load(args.resume))
        logger = Logger(os.path.join(args.save_path, 'log.txt'),
                        title=title,
                        resume=True)
    else:
        logger = Logger(os.path.join(args.save_path, 'log.txt'), title=title)
        logger.set_names(log_names)

    for epoch in range(args.epochs):
        lr = optimizer.param_groups[0]['lr']
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, lr))
        train_loss, train_acc, train_top5 = train(args, trainloader, model,
                                                  optimizer)
        test_loss, test_acc, test_top5 = test(testloader, model)

        log_vals = [
            train_loss, test_loss, train_acc, test_acc, train_top5, test_top5
        ]
        logger.append(log_vals)

        # Snapshot on new best accuracy, plus a periodic/final snapshot.
        if test_acc > best_acc:
            torch.save(
                model.state_dict(),
                os.path.join(args.save_path, f'{args.net_name}_best.pth'))
        if epoch % args.num_save_epoch == 0 or epoch == args.epochs - 1:
            torch.save(
                model.state_dict(),
                os.path.join(args.save_path, f'{args.net_name}_{epoch}.pth'))

        scheduler.step()
        best_acc = max(test_acc, best_acc)
        log_str = f"Epoch: {epoch}," + "".join(
            f"{k}: {v}," for k, v in zip(log_names, log_vals))
        print(log_str)

    print('Best test acc:', best_acc)
    return model, best_acc
Example #27
0
def main():
    """RetinaNet training driver: parse options, build the dataset and
    loaders (VOC or COCO), model/loss/optimizer/scheduler, optionally
    resume from a checkpoint, then alternate train and test each epoch."""

    # 1. argument parsing
    opts = parse(sys.argv[1:])
    print(opts)

    # 3. visdom server for live visualisation
    vis = visdom.Visdom(port=opts.port)

    # 4. datasets (VOC -> 20 classes, COCO -> 80 classes)
    train_set = test_set = None
    if opts.data_type == 'voc':
        train_set = VOC_Dataset(root=opts.data_root, split='train', resize=opts.resize)
        test_set = VOC_Dataset(root=opts.data_root, split='test', resize=opts.resize)
        opts.num_classes = 20
    elif opts.data_type == 'coco':
        train_set = COCO_Dataset(root=opts.data_root, set_name='train2017', split='train', resize=opts.resize)
        test_set = COCO_Dataset(root=opts.data_root, set_name='val2017', split='test', resize=opts.resize)
        opts.num_classes = 80

    # 5. data loaders
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=opts.batch_size,
                                               collate_fn=train_set.collate_fn,
                                               shuffle=True,
                                               num_workers=4,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=1,
                                              collate_fn=test_set.collate_fn,
                                              shuffle=False,
                                              num_workers=2,
                                              pin_memory=True)

    # 6. data-parallel network and its anchor coder
    model = RetinaNet(num_classes=opts.num_classes).to(device)
    model = torch.nn.DataParallel(module=model, device_ids=device_ids)
    coder = RETINA_Coder(opts=opts)  # holds the center anchors

    # 7. focal loss built on the coder
    criterion = Focal_Loss(coder=coder)

    # 8. optimizer
    optimizer = torch.optim.SGD(params=model.parameters(),
                                lr=opts.lr,
                                momentum=opts.momentum,
                                weight_decay=opts.weight_decay)

    # 9. LR schedule: decay 10x at epochs 30 and 45
    scheduler = MultiStepLR(optimizer=optimizer, milestones=[30, 45], gamma=0.1)

    # 10. resume: load the checkpoint saved for epoch (start_epoch - 1)
    if opts.start_epoch != 0:
        checkpoint = torch.load(os.path.join(opts.save_path, opts.save_file_name) + '.{}.pth.tar'
                                .format(opts.start_epoch - 1), map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])          # model weights
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])  # optimizer state
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])  # scheduler state
        print('\nLoaded checkpoint from epoch %d.\n' % (int(opts.start_epoch) - 1))
    else:
        print('\nNo check point to resume.. train from scratch.\n')

    # 11/12. alternate train and test, stepping the LR schedule per epoch
    for epoch in range(opts.start_epoch, opts.epoch):
        train(epoch=epoch,
              vis=vis,
              train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              scheduler=scheduler,
              opts=opts)

        test(epoch=epoch,
             vis=vis,
             test_loader=test_loader,
             model=model,
             criterion=criterion,
             coder=coder,
             opts=opts)

        scheduler.step()
Example #28
0
def train_validate(learningRate, patience, momentum=0.5):
    """Train LetterCNN with SGD + MultiStepLR, validating after every epoch.

    Args:
        learningRate: initial SGD learning rate.
        patience: epochs without validation-loss improvement before the
            early-stopping callback halts training.
        momentum: SGD momentum (default 0.5).

    Uses module-level globals ``train_loader``, ``validation_loader`` and
    ``epoches``.  Restores the best early-stopping checkpoint at the end
    and plots train/validation accuracy and loss curves.
    """
    model = LetterCNN()
    # Weight initialization.
    model.apply(Init_weights)

    # Per-epoch training statistics.
    avg_loss_per_epoch_list = []  # mean loss over all batches of each epoch
    loss_temp = []                # per-batch losses within the current epoch
    loss_list = []                # last batch's loss of each epoch

    accuracy_per_epoch_list = []  # mean accuracy over all batches of each epoch
    accuracy_temp = []            # per-batch accuracies within current epoch
    accuracy_list = []            # last batch's accuracy of each epoch

    # Per-epoch validation statistics.
    val_loss_list = []            # per-batch val losses (current epoch)
    val_accuracy_temp = []        # per-batch val accuracies (current epoch)
    val_accuracy_list = []        # per-epoch mean validation accuracy
    val_avg_loss = []             # per-epoch mean validation loss

    iterations = len(train_loader)  # steps/batches per epoch

    # Class-weighted cross-entropy to counter class imbalance.
    lossCriterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(
        [1.4, 1.4, 0.8, 0.8, 1, 0.9, 0.9, 1]))
    optimizer = optim.SGD(model.parameters(),
                          lr=learningRate,
                          momentum=momentum)

    scheduler = MultiStepLR(optimizer, milestones=[6, 15, 20, 30], gamma=0.7)

    # Early stopping checkpoints the best model to 'checkpoint.pt'.
    early_stopping = EarlyStopping(patience=patience, verbose=True)

    for epoch in range(epoches):
        # ---- training ----
        model.train()
        for iteration, (images, labels) in enumerate(train_loader):
            outputs = model(images)

            loss = lossCriterion(outputs, labels)
            loss_temp.append(loss.item())
            loss1 = loss.item()

            optimizer.zero_grad()  # avoid gradient accumulation
            loss.backward()
            optimizer.step()

            total = labels.size(0)  # labels in this batch
            probabilities = F.softmax(outputs, dim=1).data
            _, predicted = torch.max(probabilities, 1)  # per-row argmax
            correct = (predicted == labels).sum().item()

            accuracy_temp.append(correct / total)
            acc = correct / total

        accuracy_list.append(acc)  # last batch's accuracy of this epoch
        accuracy_per_epoch_list.append(np.average(accuracy_temp))
        loss_list.append(loss1)    # last batch's loss of this epoch
        avg_loss_per_epoch_list.append(np.average(loss_temp))

        print('Trainmodel Epoch[{}/{}],   Loss:{:.4f},   Accuracy:{:.2f}%'.
              format(epoch + 1, epoches, loss1, acc * 100))

        # ---- validation ----
        model.eval()
        with torch.no_grad():  # no gradients needed while evaluating
            for j, (images, labels) in enumerate(validation_loader):
                outputs = model(images)
                _, predicted = torch.max(F.softmax(outputs, dim=1), 1)
                correct_val = (predicted == labels).sum().item()
                total_val = labels.size(0)
                val_accuracy_temp.append(correct_val / total_val)
                val_loss = lossCriterion(outputs, labels)
                val_loss_list.append(val_loss.item())

        val_avg_loss.append(np.average(val_loss_list))
        # BUG FIX: record the epoch-mean validation accuracy.  The old code
        # appended one entry per validation batch and then indexed the list
        # with [epoch], printing/plotting the wrong values.
        val_accuracy_list.append(np.average(val_accuracy_temp))

        # BUG FIX: was `accuray_temp = []` (typo), so training batch
        # accuracies accumulated across every epoch.
        accuracy_temp = []
        loss_temp = []
        val_accuracy_temp = []

        print('Validation Epoch[{}/{}]:,  Loss:{:.4f},   Accuracy:{:.2f}%\n'.
              format(epoch + 1, epoches, val_avg_loss[epoch],
                     val_accuracy_list[epoch] * 100))

        # Early stopping on the epoch-average validation loss; it makes a
        # checkpoint of the current model when the loss decreases.
        early_stopping(np.average(val_loss_list), model)
        if early_stopping.early_stop:
            print("Early stopping")
            break
        val_loss_list = []

        scheduler.step()

    # Restore the best checkpoint saved by early stopping.
    model.load_state_dict(torch.load('checkpoint.pt'))

    # ---- plot accuracy and loss curves ----
    fig = plt.figure(num=2, figsize=(15, 8), dpi=80)
    ax1 = fig.add_subplot(2, 1, 1)
    ax2 = fig.add_subplot(2, 1, 2)

    ax1.plot(range(len(accuracy_list)),
             accuracy_list,
             color='g',
             label='Train_Accuracy')
    ax1.plot(range(len(val_accuracy_list)),
             val_accuracy_list,
             color='r',
             label='Validation_Accuracy')

    ax2.plot(range(len(loss_list)),
             avg_loss_per_epoch_list,
             color='g',
             label='Train_Loss')
    ax2.plot(range(len(val_avg_loss)),
             val_avg_loss,
             color='r',
             label='validation_Loss')

    ax1.set_xlabel('Epochs')
    ax2.set_xlabel('Epochs')

    ax1.set_ylabel('Accuracy')
    ax2.set_ylabel('Loss')

    ax1.set_title('Accuracy')
    ax2.set_title('Loss')

    ax1.legend()
    ax2.legend()

    plt.show()
Example #29
0
def main():
    """Train and evaluate the SGN skeleton-action model.

    Builds the model, label-smoothing loss and Adam + MultiStepLR schedule,
    trains with checkpointing on the monitored metric when args.train == 1,
    then evaluates the best checkpoint on the test split."""

    args.num_classes = get_num_classes(args.dataset)
    model = SGN(args.num_classes, args.dataset, args.seg, args)

    n_params = get_n_params(model)
    print(model)
    print('The number of parameters: ', n_params)
    print('The modes is:', args.network)

    if torch.cuda.is_available():
        print('It is using GPU!')
        model = model.cuda()

    criterion = LabelSmoothingLoss(args.num_classes, smoothing=0.1).cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)

    # Monitor configuration: maximise val accuracy or minimise val loss.
    if args.monitor == 'val_acc':
        mode, monitor_op, best, str_op = 'max', np.greater, -np.Inf, 'improve'
    elif args.monitor == 'val_loss':
        mode, monitor_op, best, str_op = 'min', np.less, np.Inf, 'reduce'

    scheduler = MultiStepLR(optimizer, milestones=[60, 90, 110], gamma=0.1)

    # Data loading.
    ntu_loaders = NTUDataLoaders(args.dataset, args.case, seg=args.seg)
    train_loader = ntu_loaders.get_train_loader(args.batch_size, args.workers)
    val_loader = ntu_loaders.get_val_loader(args.batch_size, args.workers)
    train_size = ntu_loaders.get_train_size()
    val_size = ntu_loaders.get_val_size()
    test_loader = ntu_loaders.get_test_loader(32, args.workers)

    print('Train on %d samples, validate on %d samples' %
          (train_size, val_size))

    # Output locations for checkpoints, logs and predictions.
    best_epoch = 0
    output_dir = make_dir(args.dataset)
    save_path = os.path.join(output_dir, args.network)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    checkpoint = osp.join(save_path, '%s_best.pth' % args.case)
    csv_file = osp.join(save_path, '%s_log.csv' % args.case)
    lable_path = osp.join(save_path, '%s_lable.txt' % args.case)
    pred_path = osp.join(save_path, '%s_score.npy' % args.case)
    earlystop_cnt = 0
    log_res = list()

    # Training phase.
    if args.train == 1:
        for epoch in range(args.start_epoch, args.max_epochs):

            print(epoch, optimizer.param_groups[0]['lr'])

            t_start = time.time()
            train_loss, train_acc = train(train_loader, model, criterion,
                                          optimizer, epoch)
            val_loss, val_acc = validate(val_loader, model, criterion)
            log_res += [[train_loss, train_acc.cpu().numpy(),
                         val_loss, val_acc.cpu().numpy()]]

            print(
                'Epoch-{:<3d} {:.1f}s\t'
                'Train: loss {:.4f}\taccu {:.4f}\tValid: loss {:.4f}\taccu {:.4f}'
                .format(epoch + 1,
                        time.time() - t_start, train_loss, train_acc, val_loss,
                        val_acc))

            # Compare the monitored metric (moved to CPU first).
            current = (val_loss if mode == 'min' else val_acc).cpu()

            if monitor_op(current, best):
                print('Epoch %d: %s %sd from %.4f to %.4f, '
                      'saving model to %s' % (epoch + 1, args.monitor, str_op,
                                              best, current, checkpoint))
                best = current
                best_epoch = epoch + 1
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'best': best,
                        'monitor': args.monitor,
                        'optimizer': optimizer.state_dict(),
                    }, checkpoint)
                earlystop_cnt = 0
            else:
                print('Epoch %d: %s did not %s' %
                      (epoch + 1, args.monitor, str_op))
                earlystop_cnt += 1

            scheduler.step()

        print('Best %s: %.4f from epoch-%d' % (args.monitor, best, best_epoch))
        with open(csv_file, 'w') as fw:
            cw = csv.writer(fw)
            cw.writerow(['loss', 'acc', 'val_loss', 'val_acc'])
            cw.writerows(log_res)
        print('Save train and validation log into into %s' % csv_file)

    # Test phase: fresh model, evaluated from the best checkpoint.
    args.train = 0
    model = SGN(args.num_classes, args.dataset, args.seg, args)
    model = model.cuda()
    test(test_loader, model, checkpoint, lable_path, pred_path)
Example #30
0
class Trainer(object):
    def __init__(self,
                 model_name,
                 model,
                 lr,
                 train_on_gpu=False,
                 fp16=False,
                 loss_scaling=False):
        """Set up the trainer: device placement, optional FP16 conversion,
        an SGD optimizer and a MultiStepLR schedule.

        Args:
            model_name: label used only for logging.
            model: the network to train.
            lr: base learning rate for SGD.
            train_on_gpu: move the model to CUDA (required for FP16).
            fp16: train with half-precision weights (needs CuDNN).
            loss_scaling: scale losses by a fixed factor so small FP16
                gradients stay representable; only used with fp16.
        """
        self.model = model
        self.lr = lr
        self.model_name = model_name
        self.train_on_gpu = train_on_gpu
        self.loss_scaling = loss_scaling
        # FP16 only makes sense on GPU with CuDNN; otherwise force FP32
        # and disable loss scaling.
        if train_on_gpu and torch.backends.cudnn.enabled:
            self.fp16_mode = fp16
        else:
            self.fp16_mode = False
            self.loss_scaling = False
            print("CuDNN backend not available. Can't train with FP16.")

        self.best_acc = 0          # best accuracy seen so far
        self.best_epoch = 0        # epoch at which best_acc was reached
        self._LOSS_SCALE = 128.0   # fixed loss-scale factor (2**7)

        if self.train_on_gpu:
            self.model = self.model.cuda()

        # In FP16 mode keep a master FP32 copy of the weights; the optimizer
        # updates the FP32 copy (see prep_param_list).
        if self.fp16_mode:
            self.model = self.network_to_half(self.model)
            self.model_params, self.master_params = self.prep_param_list(
                self.model)

        # Declare optimizer (over master FP32 params when in FP16 mode).
        if not hasattr(self, 'optimizer'):
            if self.fp16_mode:
                self.optimizer = optim.SGD(
                    self.master_params, self.lr, momentum=0.9, weight_decay=5e-4)
            else:
                self.optimizer = optim.SGD(
                    self.model.parameters(),
                    self.lr,
                    momentum=0.9,
                    weight_decay=5e-4)
        self.scheduler = MultiStepLR(
            self.optimizer, milestones=[10, 20, 50, 100, 180], gamma=0.1)
        #if self.train_on_gpu:
        #    self.model = nn.DataParallel(self.model)

        print('\n Model: {} | Training on GPU: {} | Mixed Precision: {} |'
              'Loss Scaling: {}'.format(self.model_name, self.train_on_gpu,
                                        self.fp16_mode, self.loss_scaling))

    def prep_param_list(self, model):
        """Build the two parameter sets used for mixed-precision training.

        Returns ``(model_params, master_params)``: the model's trainable
        (FP16) parameters and detached FP32 clones of them.  Gradient
        updates happen on the FP32 masters (values out of FP16 range are
        representable there) and are later copied back to the model.
        """
        trainable = [p for p in model.parameters() if p.requires_grad]

        fp32_masters = []
        for p in trainable:
            master = p.detach().clone().float()
            master.requires_grad = True
            fp32_masters.append(master)

        return trainable, fp32_masters

    def master_params_to_model_params(self, model_params, master_params):
        """Copy the updated FP32 master weights back into the FP16 model
        weights (in place, pairwise)."""
        for dst, src in zip(model_params, master_params):
            dst.data.copy_(src.data)

    def model_grads_to_master_grads(self, model_params, master_params):
        """Copy gradients computed on the (FP16) model params onto the FP32
        master params, lazily allocating each master ``.grad`` buffer on
        first use."""
        for src, dst in zip(model_params, master_params):
            if dst.grad is None:
                dst.grad = Variable(dst.data.new(*dst.data.size()))
            dst.grad.data.copy_(src.grad.data)

    def BN_convert_float(self, module):
        '''
        Companion to network_to_half: after a model has been ``.half()``-ed,
        walk it recursively and restore every BatchNorm layer to single
        precision (BN parameters/statistics need FP32).  A plain
        ``.apply`` cannot be used because it would also touch parameters
        and buffers, not just modules of the right type.  Returns the
        (mutated) module.
        '''
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module.float()
        for submodule in module.children():
            self.BN_convert_float(submodule)
        return module

    class tofp16(nn.Module):
        """
        Input-casting layer: converts any tensor it receives to FP16.
        Prepended by network_to_half so the half-precision network always
        sees half-precision inputs.
        Model wrapper that implements::
            def forward(self, input):
                return input.half()
        """

        def __init__(self):
            super(Trainer.tofp16, self).__init__()

        def forward(self, input):
            # Cast activations to half precision.
            return input.half()

    def network_to_half(self, network):
        """
        Convert a model to half precision in a batchnorm-safe way: cast the
        whole network to FP16, restore BatchNorm layers to FP32, and prepend
        an input-casting tofp16 layer.
        """
        halved = self.BN_convert_float(network.half())
        return nn.Sequential(self.tofp16(), halved)

    def warmup_learning_rate(self, init_lr, no_of_steps, epoch, len_epoch):
        """Return the warmup learning rate used during the first 5 epochs.

        The base rate is init_lr decayed 10x for every 30 steps, then a
        linear warmup factor is applied.
        NOTE(review): the warmup factor uses ``no_of_steps * len_epoch``,
        which is constant across the warmup epochs — possibly intended to
        be ``epoch * len_epoch``; confirm with the author before changing.
        """
        decay_exponent = no_of_steps // 30
        base_lr = init_lr * (0.1 ** decay_exponent)
        warmup = float(1 + epoch + no_of_steps * len_epoch) / (5. * len_epoch)
        return base_lr * warmup

    def train(self, epoch, no_of_steps, trainloader):
        """Run one training epoch over ``trainloader``.

        Applies LR warmup for the first 5 epochs, then defers to
        ``self.scheduler``. Supports FP16 training with FP32 master weights
        and optional loss scaling.

        Args:
            epoch: zero-based epoch index.
            no_of_steps: total number of training epochs (forwarded to the
                warmup schedule).
            trainloader: iterable of ``(inputs, targets)`` batches.
        """
        self.model.train()

        train_loss, correct, total = 0, 0, 0

        # If epoch less than 5 use warmup, else use scheduler.
        if epoch < 5:
            lr = self.warmup_learning_rate(self.lr, no_of_steps, epoch,
                                           len(trainloader))
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = lr
        elif epoch == 5:
            # Warmup just finished: restore the base learning rate once.
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr

        print('Learning Rate: %g' % (list(
            map(lambda group: group['lr'], self.optimizer.param_groups)))[0])
        # Loss criterion is in FP32.
        criterion = nn.CrossEntropyLoss()

        for idx, (inputs, targets) in enumerate(trainloader):
            if self.train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()
            self.model.zero_grad()
            outputs = self.model(inputs)
            # We calculate the loss in FP32 since reduction ops can be
            # wrong when represented in FP16.
            loss = criterion(outputs, targets)
            if self.loss_scaling:
                # The loss may be too small to be representable in FP16, so
                # scale it by a large power of 2 (self._LOSS_SCALE) before
                # backprop to keep gradients from underflowing.
                loss = loss * self._LOSS_SCALE
            # Calculate the gradients
            loss.backward()
            if self.fp16_mode:
                # Now we move the calculated gradients to the master params
                # so that we can apply the gradient update in FP32.
                self.model_grads_to_master_grads(self.model_params,
                                                 self.master_params)
                if self.loss_scaling:
                    # If we scaled our losses now is a good time to scale it
                    # back since our gradients are in FP32.
                    for params in self.master_params:
                        params.grad.data = params.grad.data / self._LOSS_SCALE
                # Apply weight update in FP32.
                self.optimizer.step()
                # Copy the updated weights back FP16 model weights.
                self.master_params_to_model_params(self.model_params,
                                                   self.master_params)
            else:
                self.optimizer.step()

            # NOTE(review): when loss_scaling is on, `loss` here is still the
            # scaled value, so the running average shown below is scaled by
            # _LOSS_SCALE as well -- confirm whether that is intended.
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += (targets == predicted).sum().item()

            progress_bar(
                idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                (train_loss / (idx + 1), 100. * correct / total, correct,
                 total))

        # Warmup covers epochs 0-4; the LR scheduler takes over from epoch 5.
        if epoch >= 5:
            self.scheduler.step()

    def evaluate(self, epoch, testloader):
        """Evaluate the model on ``testloader`` and checkpoint on improvement.

        Runs a full pass over the test set without gradients, tracks the
        running cross-entropy loss and top-1 accuracy, and calls
        ``save_model`` whenever the accuracy beats ``self.best_acc``.

        Args:
            epoch: zero-based epoch index, forwarded to ``save_model``.
            testloader: iterable of ``(inputs, targets)`` batches.
        """
        self.model.eval()

        test_loss = 0
        correct = 0
        total = 0

        criterion = nn.CrossEntropyLoss()

        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            for idx, (test_x, test_y) in enumerate(testloader):
                if self.train_on_gpu:
                    test_x, test_y = test_x.cuda(), test_y.cuda()
                outputs = self.model(test_x)
                loss = criterion(outputs, test_y)

                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += test_y.size(0)
                correct += (predicted == test_y).sum().item()

                # BUG FIX: previously displayed `loss / (idx + 1)` -- the
                # *last batch's* loss divided by the batch count. Show the
                # running average instead, consistent with train().
                progress_bar(
                    idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                    (test_loss / (idx + 1), 100. * correct / total, correct,
                     total))

        acc = 100.0 * correct / total
        if acc > self.best_acc:
            self.save_model(self.model, self.model_name, acc, epoch)

    def save_model(self, model, model_name, acc, epoch):
        """Persist model weights plus accuracy/epoch metadata to disk and
        record this checkpoint as the best one seen so far."""
        # FP16 runs keep their checkpoints in a separate '<name>_fp16' folder.
        folder = model_name + '_fp16' if self.fp16_mode else model_name
        save_name = os.path.join('weights', folder,
                                 'weights.%03d.%.03f.pt' % (epoch, acc))

        target_dir = os.path.dirname(save_name)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        state = {
            'net': model.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        torch.save(state, save_name)

        print("\nSaved state at %.03f%% accuracy. Prev accuracy: %.03f%%" %
              (acc, self.best_acc))
        self.best_acc = acc
        self.best_epoch = epoch

    def load_model(self, path=None):
        """Load a previously saved checkpoint into ``self.model``.

        Args:
            path: explicit checkpoint path. When ``None``, the path is
                derived from ``self.model_name`` / ``self.best_epoch`` /
                ``self.best_acc`` the same way ``save_model`` builds it.

        Note: this does not check that the checkpoint's precision (FP16 vs
        FP32) matches the current model.
        """
        if path is not None:
            checkpoint_name = path
        elif self.fp16_mode:
            checkpoint_name = os.path.join(
                'weights', self.model_name + '_fp16',
                'weights.%03d.%.03f.pt' % (self.best_epoch, self.best_acc))
        else:
            # BUG FIX: the FP32 branch previously appended '_fp16' as well
            # (copy-paste), so checkpoints saved by save_model() for FP32
            # runs were never found.
            checkpoint_name = os.path.join(
                'weights', self.model_name,
                'weights.%03d.%.03f.pt' % (self.best_epoch, self.best_acc))
        if not os.path.exists(checkpoint_name):
            print("Best model not found")
            return
        checkpoint = torch.load(checkpoint_name)
        self.model.load_state_dict(checkpoint['net'])
        self.best_acc = checkpoint['acc']
        self.best_epoch = checkpoint['epoch']
        print("Loaded Model with accuracy: %.3f%%, from epoch: %d" %
              (checkpoint['acc'], checkpoint['epoch'] + 1))

    def train_and_evaluate(self, traindataloader, testdataloader, no_of_steps):
        """Full training loop: run ``no_of_steps`` epochs, training then
        evaluating on each, with best-accuracy tracking reset up front."""
        self.best_acc = 0.0
        for epoch in range(no_of_steps):
            print('\nEpoch: %d' % (epoch + 1))
            self.train(epoch, no_of_steps, traindataloader)
            self.evaluate(epoch, testdataloader)
# ---- Example #31 ----
def train_and_evaluate(model, train_loader, test_loader, optimizer, criterion,
                       criterion_T, accuracy, model_dir, args):
    """Training driver: train for ``args.num_epochs`` epochs, evaluate after
    each, log to TensorBoard, and checkpoint the latest and best models.

    Args:
        model: the network (ensemble) being trained.
        train_loader, test_loader: data loaders for the two splits.
        optimizer: optimizer stepped inside ``train``; its LR follows a
            ``MultiStepLR`` schedule with milestones ``args.schedule``.
        criterion, criterion_T: losses forwarded to ``train``/``evaluate``.
        accuracy: accuracy metric forwarded to ``train``/``evaluate``.
        model_dir: output directory for logs, metrics and checkpoints.
        args: namespace providing ``schedule``, ``num_epochs`` and ``resume``.
    """
    start_epoch = 0
    best_acc = 0.

    # learning rate schedulers for different models:
    scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=0.1)

    # TensorboardX setup
    writer = SummaryWriter(log_dir=model_dir)  # ensemble
    # writerB = SummaryWriter(logdir = os.path.join(model_dir, 'B')) # ensemble

    # Save best ensemble (True) or mean accTop1 (False) as the model
    # selection criterion.
    choose_E = False

    # Pre-size the per-epoch metric lists so they can be filled by index.
    result_train_metrics = list(range(args.num_epochs))
    result_test_metrics = list(range(args.num_epochs))

    # If training was interrupted, resume from the saved checkpoint.
    if args.resume:
        # Load checkpoint.
        logging.info('Resuming from checkpoint..')
        resumePath = os.path.join(args.resume, 'last.pth')
        assert os.path.isfile(
            resumePath), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(resumePath)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optim_dict'])
        # resume from the last epoch
        start_epoch = checkpoint['epoch']
        # NOTE(review): passing an epoch to scheduler.step() is deprecated in
        # recent PyTorch -- confirm the installed version still accepts it.
        scheduler.step(start_epoch - 1)

        if choose_E:
            best_acc = checkpoint['test_accTop1']
        else:
            best_acc = checkpoint['mean_test_accTop1']
        result_train_metrics = torch.load(
            os.path.join(args.resume, 'train_metrics'))
        result_test_metrics = torch.load(
            os.path.join(args.resume, 'test_metrics'))

    for epoch in range(start_epoch, args.num_epochs):

        # NOTE(review): stepping the scheduler *before* training means a
        # milestone's LR drop takes effect at the start of that epoch; modern
        # PyTorch recommends stepping after the optimizer updates -- confirm
        # this ordering is intended.
        scheduler.step()

        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, args.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train_metrics = train(train_loader, model, optimizer, criterion,
                              criterion_T, accuracy, args)

        writer.add_scalar('Train/Loss', train_metrics['train_loss'], epoch + 1)
        # writer.add_scalar('Train/Loss_True', train_metrics['train_true_loss'], epoch+1)
        # writer.add_scalar('Train/Loss_Group', train_metrics['train_group_loss'], epoch+1)
        writer.add_scalar('Train/AccTop1', train_metrics['train_accTop1'],
                          epoch + 1)

        # Evaluate for one epoch on validation set
        test_metrics = evaluate(test_loader, model, criterion, criterion_T,
                                accuracy, args)

        # Select the accuracy used for best-model tracking.
        if choose_E:
            test_acc = test_metrics['test_accTop1']
        else:
            test_acc = test_metrics['mean_test_accTop1']

        writer.add_scalar('Test/Loss', test_metrics['test_loss'], epoch + 1)
        # writer.add_scalar('Test/Loss_True', test_metrics['test_true_loss'], epoch+1)
        # writer.add_scalar('Test/Loss_Group', test_metrics['test_group_loss'], epoch+1)
        writer.add_scalar('Test/AccTop1', test_metrics['test_accTop1'],
                          epoch + 1)

        result_train_metrics[epoch] = train_metrics
        result_test_metrics[epoch] = test_metrics

        # Save latest train/test metrics
        torch.save(result_train_metrics,
                   os.path.join(model_dir, 'train_metrics'))
        torch.save(result_test_metrics, os.path.join(model_dir,
                                                     'test_metrics'))

        last_path = os.path.join(model_dir, 'last.pth')
        # Save latest model weights, optimizer and accuracy
        torch.save(
            {
                'state_dict': model.state_dict(),
                'epoch': epoch + 1,
                'optim_dict': optimizer.state_dict(),
                'test_accTop1': test_metrics['test_accTop1'],
                'mean_test_accTop1': test_metrics['mean_test_accTop1']
            }, last_path)
        # If best_eval, best_save_path
        is_best = test_acc >= best_acc
        if is_best:
            logging.info("- Found better accuracy")
            best_acc = test_acc
            # Save best metrics in a json file in the model directory
            test_metrics['epoch'] = epoch + 1
            utils.save_dict_to_json(
                test_metrics, os.path.join(model_dir,
                                           "test_best_metrics.json"))

            # Save model and optimizer
            shutil.copyfile(last_path, os.path.join(model_dir, 'best.pth'))
    writer.close()
def main(args):
    """Train a UNet segmentation model from ``args.model``/``args.dataset``
    config files, optionally resuming from a named checkpoint file.

    Each epoch trains, validates, plots the metric history, and saves a
    checkpoint under the configured checkpoint directory.
    """
    model = load_config(args.model)
    dataset = load_config(args.dataset)

    device = torch.device('cuda' if model['common']['cuda'] else 'cpu')

    if model['common']['cuda'] and not torch.cuda.is_available():
        sys.exit('Error: CUDA requested but not available')

    # if args.batch_size < 2:
    #     sys.exit('Error: PSPNet requires more than one image for BatchNorm in Pyramid Pooling')

    os.makedirs(model['common']['checkpoint'], exist_ok=True)

    num_classes = len(dataset['common']['classes'])
    net = UNet(num_classes).to(device)

    if args.resume:
        path = os.path.join(model['common']['checkpoint'], args.resume)

        cuda = model['common']['cuda']

        # Map loaded tensors to GPU or CPU according to the config, not to
        # wherever the checkpoint was originally saved.
        def map_location(storage, _):
            return storage.cuda() if cuda else storage.cpu()

        chkpt = torch.load(path, map_location=map_location)
        net.load_state_dict(chkpt)
        # Checkpoints are named 'checkpoint-{:05d}-of-{:05d}.pth' (see the
        # save below), so characters [11:16] hold the 5-digit epoch number.
        resume_at_epoch = int(args.resume[11:16])
    else:
        resume_at_epoch = 0

    if model['common']['cuda']:
        torch.backends.cudnn.benchmark = True
        net = DataParallel(net)

    optimizer = SGD(net.parameters(), lr=model['opt']['lr'], momentum=model['opt']['momentum'])

    scheduler = MultiStepLR(optimizer, milestones=model['opt']['milestones'], gamma=model['opt']['gamma'])

    weight = torch.Tensor(dataset['weights']['values'])

    # Fast-forward the scheduler so the resumed run continues with the LR it
    # would have had at this epoch.
    for i in range(resume_at_epoch):
        scheduler.step()

    criterion = CrossEntropyLoss2d(weight=weight).to(device)
    # criterion = FocalLoss2d(weight=weight).to(device)

    train_loader, val_loader = get_dataset_loaders(model, dataset)

    num_epochs = model['opt']['epochs']

    history = collections.defaultdict(list)

    for epoch in range(resume_at_epoch, num_epochs):
        print('Epoch: {}/{}'.format(epoch + 1, num_epochs))

        train_hist = train(train_loader, num_classes, device, net, optimizer, scheduler, criterion)
        print('Train loss: {:.4f}, mean IoU: {:.4f}'.format(train_hist['loss'], train_hist['iou']))

        for k, v in train_hist.items():
            history['train ' + k].append(v)

        val_hist = validate(val_loader, num_classes, device, net, criterion)
        print('Validate loss: {:.4f}, mean IoU: {:.4f}'.format(val_hist['loss'], val_hist['iou']))

        for k, v in val_hist.items():
            history['val ' + k].append(v)

        visual = 'history-{:05d}-of-{:05d}.png'.format(epoch + 1, num_epochs)
        plot(os.path.join(model['common']['checkpoint'], visual), history)

        checkpoint = 'checkpoint-{:05d}-of-{:05d}.pth'.format(epoch + 1, num_epochs)
        torch.save(net.state_dict(), os.path.join(model['common']['checkpoint'], checkpoint))