def train_infinite_collect_stats(args, model, device, source_train_loader,
                                 target_train_loader, optimizer,
                                 lambda_mec_loss, target_test_loader):

    source_iter = iter(source_train_loader)
    target_iter = iter(target_train_loader)

    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer,
                                                milestones=[6000],
                                                gamma=0.1)

    for i in range(args.num_iters):
        model.train()

        try:
            source_data, source_y = next(source_iter)
        except StopIteration:
            source_iter = iter(source_train_loader)
            source_data, source_y = next(source_iter)

        try:
            target_data, target_data_dup, _ = next(target_iter)
        except StopIteration:
            target_iter = iter(target_train_loader)
            target_data, target_data_dup, _ = next(target_iter)

        data = torch.cat((source_data, target_data, target_data_dup),
                         dim=0)  # concat the source and target mini-batches
        data, source_y = data.to(device), source_y.to(device)

        optimizer.zero_grad()
        output = model(data)
        source_output, target_output, target_output_dup = torch.split(
            output, split_size_or_sections=output.shape[0] // 3, dim=0)

        mec_criterion = consensus_loss.MinEntropyConsensusLoss(
            num_classes=args.num_classes, device=device)

        cls_loss = F.nll_loss(F.log_softmax(source_output, dim=1), source_y)
        mec_loss = lambda_mec_loss * mec_criterion(target_output,
                                                   target_output_dup)

        loss = cls_loss + mec_loss
        loss.backward()

        optimizer.step()
        exp_lr_scheduler.step()  # advance the per-iteration LR schedule after the optimizer update (PyTorch >= 1.1 order)

        if i % args.log_interval == 0:
            print(
                'Train Iter: [{}/{}]\tClassification Loss: {:.6f} \t MEC Loss: {:.6f}'
                .format(i, args.num_iters, cls_loss.item(), mec_loss.item()))

        if (i + 1) % args.check_acc_step == 0:
            test(args, model, device, target_test_loader)

    print("Training is complete...")
    print(
        "Running a bunch of forward passes to estimate the population statistics of target..."
    )
    eval_pass_collect_stats(args, model, device, target_test_loader)
    print("Finally computing the precision on the test set...")
    test(args, model, device, target_test_loader)
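A minimal, self-contained sketch (independent of the example above) of how the MultiStepLR schedule used here behaves: the learning rate is multiplied by gamma once the number of scheduler steps passes each milestone.

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = SGD([param], lr=0.1)
scheduler = MultiStepLR(optimizer, milestones=[6000], gamma=0.1)

for it in range(6001):
    optimizer.step()   # a no-op here (no gradients), but keeps the recommended call order
    scheduler.step()

print(optimizer.param_groups[0]['lr'])  # 0.01: decayed once after passing the 6000-step milestone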
Example #2
def train_meshnet(opt):

    device = torch.device("cpu" if opt.gpu_idx < 0 else "cuda:%d" %
                          opt.gpu_idx)
    print('Training on {} GPUs'.format(torch.cuda.device_count()))
    print('Training on ' + (
        'cpu' if opt.gpu_idx < 0 else torch.cuda.get_device_name(opt.gpu_idx)))

    # colored console output
    green = lambda x: '\033[92m' + x + '\033[0m'
    blue = lambda x: '\033[94m' + x + '\033[0m'

    log_dirname = os.path.join(opt.logdir, opt.name)
    params_filename = os.path.join(opt.outdir, '%s_params.pth' % opt.name)
    model_filename = os.path.join(opt.outdir, '%s_model.pth' % opt.name)
    desc_filename = os.path.join(opt.outdir, '%s_description.txt' % opt.name)

    if os.path.exists(log_dirname) or os.path.exists(model_filename):
        if opt.name != 'test':
            response = input(
                'A training run named "%s" already exists, overwrite? (y/n) ' %
                opt.name)
            if response == 'y':
                del_log = True
            else:
                return
        else:
            del_log = True

        if del_log:
            if os.path.exists(log_dirname):
                try:
                    shutil.rmtree(log_dirname)
                except OSError:
                    print("Can't delete " + log_dirname)

    # get indices in targets and predictions corresponding to each output
    target_features = []
    output_target_ind = []
    output_pred_ind = []
    output_names = []
    output_loss_weights = dict()
    pred_dim = 0
    for o in opt.outputs:
        if o == 'imp_surf':
            if o not in target_features:
                target_features.append(o)

            output_names.append(o)
            output_target_ind.append(target_features.index(o))
            output_pred_ind.append(pred_dim)
            output_loss_weights[o] = 1.0
            pred_dim += 1
        elif o == 'imp_surf_magnitude':
            if o not in target_features:
                target_features.append(o)

            output_names.append(o)
            output_target_ind.append(target_features.index(o))
            output_pred_ind.append(pred_dim)
            # output_loss_weights[o] = 10.0
            output_loss_weights[o] = 1.0
            pred_dim += 1
        elif o == 'imp_surf_sign':
            if o not in target_features:
                target_features.append(o)

            output_names.append(o)
            output_target_ind.append(target_features.index(o))
            output_pred_ind.append(pred_dim)
            output_loss_weights[o] = 1.0
            pred_dim += 1
        elif o == 'p_index':
            if o not in target_features:
                target_features.append(o)

            output_target_ind.append(target_features.index(o))
        elif o == 'patch_pts_ids':
            if o not in target_features:
                target_features.append(o)

            output_target_ind.append(target_features.index(o))
        else:
            raise ValueError('Unknown output: %s' % o)

    if pred_dim <= 0:
        raise ValueError('Prediction is empty for the given outputs.')

    # create model
    use_query_point = any([
        f in opt.outputs
        for f in ['imp_surf', 'imp_surf_magnitude', 'imp_surf_sign']
    ])
    meshnet = PointsToSurfModel(
        net_size_max=opt.net_size,
        num_points=opt.points_per_patch,
        output_dim=pred_dim,
        use_point_stn=opt.use_point_stn,
        use_feat_stn=opt.use_feat_stn,
        sym_op=opt.sym_op,
        use_query_point=use_query_point,
        sub_sample_size=opt.sub_sample_size,
        do_augmentation=True,
        single_transformer=opt.single_transformer,
        shared_transformer=opt.shared_transformer,
    )

    start_epoch = 0
    if opt.refine != '':
        print(f'Refining weights from {opt.refine}')
        meshnet.cuda(device=device)  # same order as in training
        meshnet = torch.nn.DataParallel(meshnet)
        meshnet.load_state_dict(torch.load(opt.refine))
        try:
            # expecting a file name like 'vanilla_model_50.pth'
            model_file = str(opt.refine)
            last_underscore_pos = model_file.rfind('_')
            last_dot_pos = model_file.rfind('.')
            start_epoch = int(
                model_file[last_underscore_pos + 1:last_dot_pos]) + 1
            print(f'Continuing training from epoch {start_epoch}')
        except ValueError:
            print(
                f'Warning: {opt.refine} has no epoch in the name. The Tensorboard log will continue at '
                f'epoch 0 and might be messed up!')

    if opt.seed < 0:
        opt.seed = random.randint(1, 10000)

    print("Random Seed: %d" % opt.seed)
    random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    # create train and test dataset loaders
    train_dataset = data_loader.PointcloudPatchDataset(
        root=opt.indir,
        shape_list_filename=opt.trainset,
        points_per_patch=opt.points_per_patch,
        patch_features=target_features,
        point_count_std=opt.patch_point_count_std,
        seed=opt.seed,
        identical_epochs=opt.identical_epochs,
        center=opt.patch_center,
        cache_capacity=opt.cache_capacity,
        pre_processed_patches=True,
        sub_sample_size=opt.sub_sample_size,
        num_workers=int(opt.workers),
        patch_radius=opt.patch_radius,
        epsilon=-1,  # not necessary for training
        uniform_subsample=opt.uniform_subsample,
    )
    if opt.training_order == 'random':
        train_datasampler = data_loader.RandomPointcloudPatchSampler(
            train_dataset,
            patches_per_shape=opt.patches_per_shape,
            seed=opt.seed,
            identical_epochs=opt.identical_epochs)
    elif opt.training_order == 'random_shape_consecutive':
        train_datasampler = data_loader.SequentialShapeRandomPointcloudPatchSampler(
            train_dataset,
            patches_per_shape=opt.patches_per_shape,
            seed=opt.seed,
            identical_epochs=opt.identical_epochs)
    else:
        raise ValueError('Unknown training order: %s' % opt.training_order)

    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   sampler=train_datasampler,
                                                   batch_size=opt.batchSize,
                                                   num_workers=int(
                                                       opt.workers))

    test_dataset = data_loader.PointcloudPatchDataset(
        root=opt.indir,
        shape_list_filename=opt.testset,
        points_per_patch=opt.points_per_patch,
        patch_features=target_features,
        point_count_std=opt.patch_point_count_std,
        seed=opt.seed,
        identical_epochs=opt.identical_epochs,
        center=opt.patch_center,
        cache_capacity=opt.cache_capacity,
        pre_processed_patches=True,
        sub_sample_size=opt.sub_sample_size,
        patch_radius=opt.patch_radius,
        num_workers=int(opt.workers),
        epsilon=-1,  # not necessary for training
        uniform_subsample=opt.uniform_subsample,
    )
    if opt.training_order == 'random':
        test_datasampler = data_loader.RandomPointcloudPatchSampler(
            test_dataset,
            patches_per_shape=opt.patches_per_shape,
            seed=opt.seed,
            identical_epochs=opt.identical_epochs)
    elif opt.training_order == 'random_shape_consecutive':
        test_datasampler = data_loader.SequentialShapeRandomPointcloudPatchSampler(
            test_dataset,
            patches_per_shape=opt.patches_per_shape,
            seed=opt.seed,
            identical_epochs=opt.identical_epochs)
    else:
        raise ValueError('Unknown training order: %s' % opt.training_order)

    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  sampler=test_datasampler,
                                                  batch_size=opt.batchSize,
                                                  num_workers=int(opt.workers))

    # keep the exact training shape names for later reference
    opt.train_shapes = train_dataset.shape_names
    opt.test_shapes = test_dataset.shape_names

    print(
        'training set: %d patches (in %d batches) - test set: %d patches (in %d batches)'
        % (len(train_datasampler), len(train_dataloader),
           len(test_datasampler), len(test_dataloader)))

    try:
        os.makedirs(opt.outdir)
    except OSError:
        pass

    train_fraction_done = 0.0

    log_writer = SummaryWriter(log_dirname, comment=opt.name)
    log_writer.add_scalar('LR', opt.lr, 0)

    # milestones in number of optimizer iterations
    optimizer = optim.SGD(meshnet.parameters(),
                          lr=opt.lr,
                          momentum=opt.momentum)

    # SGD changes lr depending on training progress
    # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=0.1)  # constant lr
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=opt.scheduler_steps,
                                         gamma=0.1)

    if opt.refine == '':
        meshnet.cuda(device=device)
        meshnet = torch.nn.DataParallel(meshnet)

    train_num_batch = len(train_dataloader)
    test_num_batch = len(test_dataloader)

    # save parameters
    torch.save(opt, params_filename)

    # save description
    with open(desc_filename, 'w+') as text_file:
        print(opt.desc, file=text_file)

    for epoch in range(start_epoch, opt.nepoch, 1):

        train_enum = enumerate(train_dataloader, 0)

        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(test_dataloader, 0)

        for train_batchind, batch_data_train in train_enum:

            # batch data to GPU
            for key in batch_data_train.keys():
                batch_data_train[key] = batch_data_train[key].cuda(
                    non_blocking=True)

            # set to training mode
            meshnet.train()

            # zero gradients
            optimizer.zero_grad()

            pred_train = meshnet(batch_data_train)

            loss_train = compute_loss(pred=pred_train,
                                      batch_data=batch_data_train,
                                      outputs=opt.outputs,
                                      output_loss_weights=output_loss_weights,
                                      fixed_radius=opt.patch_radius > 0.0)

            loss_total = sum(loss_train)

            # back-propagate through entire network to compute gradients of loss w.r.t. parameters
            loss_total.backward()

            # parameter optimization step
            optimizer.step()

            # update and log lr
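            # scheduler.get_lr() and scheduler.step(epoch) are deprecated in recent PyTorch;
            # scheduler.get_last_lr() and an argument-free scheduler.step() once per epoch
            # are the current equivalents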
            lr_before_update = scheduler.get_lr()
            if isinstance(lr_before_update, list):
                lr_before_update = lr_before_update[0]
            scheduler.step(epoch)
            lr_after_update = scheduler.get_lr()
            if isinstance(lr_after_update, list):
                lr_after_update = lr_after_update[0]
            if lr_before_update != lr_after_update:
                print('LR changed from {} to {} in epoch {}'.format(
                    lr_before_update, lr_after_update, epoch))
            current_step = (
                epoch + train_fraction_done) * train_num_batch * opt.batchSize
            log_writer.add_scalar('LR', lr_after_update, current_step)

            train_fraction_done = (train_batchind + 1) / train_num_batch

            if debug:  # 'debug' is assumed to be a module-level flag
                from source import evaluation
                evaluation.visualize_patch(
                    patch_pts_ps=batch_data_train['patch_pts_ps'][0].cpu(),
                    query_point_ps=batch_data_train['imp_surf_query_point_ps']
                    [0].cpu(),
                    pts_sub_sample_ms=batch_data_train['pts_sub_sample_ms']
                    [0].cpu(),
                    query_point_ms=batch_data_train['imp_surf_query_point_ms']
                    [0].cpu(),
                    file_path='debug/patch_train.off')

            metrics_dict = calc_metrics(outputs=opt.outputs,
                                        pred=pred_train,
                                        gt_data=batch_data_train)

            do_logging(writer=log_writer,
                       log_prefix=green('train'),
                       epoch=epoch,
                       opt=opt,
                       loss=loss_train,
                       batchind=train_batchind,
                       fraction_done=train_fraction_done,
                       num_batch=train_num_batch,
                       train=True,
                       output_names=output_names,
                       metrics_dict=metrics_dict)

            while test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:

                # set to evaluation mode, no auto-diff
                meshnet.eval()

                test_batchind, batch_data_test = next(test_enum)

                # batch data to GPU
                for key in batch_data_test.keys():
                    batch_data_test[key] = batch_data_test[key].cuda(
                        non_blocking=True)

                # forward pass
                with torch.no_grad():
                    pred_test = meshnet(batch_data_test)

                loss_test = compute_loss(
                    pred=pred_test,
                    batch_data=batch_data_test,
                    outputs=opt.outputs,
                    output_loss_weights=output_loss_weights,
                    fixed_radius=opt.patch_radius > 0.0)

                metrics_dict = calc_metrics(outputs=opt.outputs,
                                            pred=pred_test,
                                            gt_data=batch_data_test)

                test_fraction_done = (test_batchind + 1) / test_num_batch

                do_logging(writer=log_writer,
                           log_prefix=blue('test'),
                           epoch=epoch,
                           opt=opt,
                           loss=loss_test,
                           batchind=test_batchind,
                           fraction_done=test_fraction_done,
                           num_batch=train_num_batch,
                           train=False,
                           output_names=output_names,
                           metrics_dict=metrics_dict)

        # end of epoch save model, overwriting the old model
        if epoch % opt.saveinterval == 0 or epoch == opt.nepoch - 1:
            torch.save(meshnet.state_dict(), model_filename)

        # save model in a separate file in epochs 0,5,10,50,100,500,1000, ...
        if epoch % (5 * 10**math.floor(math.log10(max(2, epoch - 1)))
                    ) == 0 or epoch % 100 == 0 or epoch == opt.nepoch - 1:
            torch.save(
                meshnet.state_dict(),
                os.path.join(opt.outdir,
                             '%s_model_%d.pth' % (opt.name, epoch)))

        log_writer.flush()

    log_writer.close()
Example #3
def main():
    torch.manual_seed(args.seed)
    if not args.use_avai_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False

    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_imgreid_dataset(
        root=args.root,
        name=args.dataset,
        split_id=args.split_id,
        cuhk03_labeled=args.cuhk03_labeled,
        cuhk03_classic_split=args.cuhk03_classic_split,
    )

    transform_train = T.Compose([
        T.Random2DTranslation(args.height, args.width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    transform_test = T.Compose([
        T.Resize((args.height, args.width)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    pin_memory = True if use_gpu else False

    trainloader = DataLoader(
        ImageDataset(dataset.train, transform=transform_train),
        batch_size=args.train_batch,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )

    queryloader = DataLoader(
        ImageDataset(dataset.query, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    galleryloader = DataLoader(
        ImageDataset(dataset.gallery, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids,
                              loss={'xent'},
                              use_gpu=use_gpu)
    print("Model size: {:.3f} M".format(count_num_param(model)))

    if args.label_smooth:
        criterion = CrossEntropyLabelSmooth(num_classes=dataset.num_train_pids,
                                            use_gpu=use_gpu)
    else:
        criterion = nn.CrossEntropyLoss()
    optimizer = init_optim(args.optim, model.parameters(), args.lr,
                           args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)

    if args.fixbase_epoch > 0:
        if hasattr(model, 'classifier') and isinstance(model.classifier,
                                                       nn.Module):
            optimizer_tmp = init_optim(args.optim,
                                       model.classifier.parameters(),
                                       args.fixbase_lr, args.weight_decay)
        else:
            print(
                "Warn: model has no attribute 'classifier' and fixbase_epoch is reset to 0"
            )
            args.fixbase_epoch = 0

    if args.load_weights:
        # load pretrained weights but ignore layers that don't match in size
        if check_isfile(args.load_weights):
            checkpoint = torch.load(args.load_weights)
            pretrain_dict = checkpoint['state_dict']
            model_dict = model.state_dict()
            pretrain_dict = {
                k: v
                for k, v in pretrain_dict.items()
                if k in model_dict and model_dict[k].size() == v.size()
            }
            model_dict.update(pretrain_dict)
            model.load_state_dict(model_dict)
            print("Loaded pretrained weights from '{}'".format(
                args.load_weights))

    if args.resume:
        if check_isfile(args.resume):
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['state_dict'])
            args.start_epoch = checkpoint['epoch']
            rank1 = checkpoint['rank1']
            print("Loaded checkpoint from '{}'".format(args.resume))
            print("- start_epoch: {}\n- rank1: {}".format(
                args.start_epoch, rank1))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")
        distmat = test(model,
                       queryloader,
                       galleryloader,
                       use_gpu,
                       return_distmat=True)
        if args.vis_ranked_res:
            visualize_ranked_results(
                distmat,
                dataset,
                save_dir=osp.join(args.save_dir, 'ranked_results'),
                topk=20,
            )
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    if args.fixbase_epoch > 0:
        print(
            "Train classifier for {} epochs while keeping base network frozen".
            format(args.fixbase_epoch))

        for epoch in range(args.fixbase_epoch):
            start_train_time = time.time()
            train(epoch,
                  model,
                  criterion,
                  optimizer_tmp,
                  trainloader,
                  use_gpu,
                  freeze_bn=True)
            train_time += round(time.time() - start_train_time)

        del optimizer_tmp
        print("Now open all layers for training")

    for epoch in range(args.start_epoch, args.max_epoch):
        start_train_time = time.time()
        train(epoch, model, criterion, optimizer, trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        scheduler.step()

        if (epoch + 1) > args.start_eval and args.eval_step > 0 and (
                epoch + 1) % args.eval_step == 0 or (epoch +
                                                     1) == args.max_epoch:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1

            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()

            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
Example #4
    if args.pretrained:
        optimizer = torch.optim.SGD(
            [{'params': filter(lambda p: p.requires_grad, model.parameters()), 'lr': args.lr},
             {'params': filter(lambda p: p.requires_grad, metric.parameters()), 'lr': 10 * args.lr}],
            weight_decay=args.weight_decay, momentum=args.momentum)
    else:
        optimizer = torch.optim.SGD(
            [{'params': filter(lambda p: p.requires_grad, model.parameters()), 'lr': args.lr},
             {'params': filter(lambda p: p.requires_grad, metric.parameters()), 'lr': args.lr}],
            weight_decay=args.weight_decay, momentum=args.momentum)
    print('init_lr={}, weight_decay={}, momentum={}'.format(args.lr, args.weight_decay, args.momentum))

    if args.scheduler == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=args.lr_step, gamma=args.lr_gamma, last_epoch=-1)
    elif args.scheduler == 'multi':
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[150, 225], gamma=args.lr_gamma, last_epoch=-1)

    # savepath
    savepath = os.path.join(args.savepath, args.model_name)

    savepath = savepath + '_' + args.metric

    savepath = savepath + '_' + args.loss + '_' + str(args.inp_size) + '_' + args.scheduler

    if args.seed is not None:
        savepath = savepath + '_s' + str(args.seed)

    if not args.pretrained:
        savepath = savepath + '_' + str(args.eps)

    print('savepath:', savepath)
Example #5
def create_optimizer(optimizer_config, model):
    """Creates optimizer and schedule from configuration

    Parameters
    ----------
    optimizer_config : dict
        Dictionary containing the configuration options for the optimizer.
    model : Model
        The network model.

    Returns
    -------
    optimizer : Optimizer
        The optimizer.
    scheduler : LRScheduler
        The learning rate scheduler.
    """
    if optimizer_config["classifier_lr"] != -1:
        # Separate classifier parameters from all others
        net_params = []
        classifier_params = []
        for k, v in model.named_parameters():
            if k.find("fc") != -1:
                classifier_params.append(v)
            else:
                net_params.append(v)
        params = [
            {"params": net_params},
            {"params": classifier_params, "lr": optimizer_config["classifier_lr"]},
        ]
    else:
        params = model.parameters()

    if optimizer_config["type"] == "SGD":
        optimizer = optim.SGD(params,
                              lr=optimizer_config["learning_rate"],
                              momentum=optimizer_config["momentum"],
                              weight_decay=optimizer_config["weight_decay"],
                              nesterov=optimizer_config["nesterov"])
    elif optimizer_config["type"] == "Adam":
        optimizer = optim.Adam(params,
                               lr=optimizer_config["learning_rate"],
                               weight_decay=optimizer_config["weight_decay"])
    else:
        raise KeyError("unrecognized optimizer {}".format(optimizer_config["type"]))

    if optimizer_config["schedule"]["type"] == "step":
        scheduler = lr_scheduler.StepLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "multistep":
        scheduler = lr_scheduler.MultiStepLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "exponential":
        scheduler = lr_scheduler.ExponentialLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "constant":
        scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 1.0)
    elif optimizer_config["schedule"]["type"] == "linear":
        def linear_lr(it):
            return it * optimizer_config["schedule"]["params"]["alpha"] + optimizer_config["schedule"]["params"]["beta"]

        scheduler = lr_scheduler.LambdaLR(optimizer, linear_lr)
    else:
        raise KeyError("unrecognized lr schedule {}".format(optimizer_config["schedule"]["type"]))

    return optimizer, scheduler
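A hypothetical configuration dictionary illustrating how create_optimizer() above could be called; the keys mirror exactly those the function reads, and the values are placeholders.

example_config = {
    "classifier_lr": -1,   # -1 disables the separate classifier learning rate
    "type": "SGD",
    "learning_rate": 0.01,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "nesterov": True,
    "schedule": {"type": "multistep", "params": {"milestones": [30, 60], "gamma": 0.1}},
}
# optimizer, scheduler = create_optimizer(example_config, model)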
Example #6
def main():
    global args

    torch.manual_seed(args.seed)
    if not args.use_avai_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False
    log_name = 'log_test.txt' if args.evaluate else 'log_train.txt'
    sys.stderr = sys.stdout = Logger(osp.join(args.save_dir, log_name))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU, however, GPU is highly recommended")

    print("Initializing image data manager")
    dm = ImageDataManager(use_gpu, **image_dataset_kwargs(args))
    trainloader, testloader_dict = dm.return_dataloaders()

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dm.num_train_pids,
                              loss={'xent'},
                              use_gpu=use_gpu,
                              args=vars(args))
    print(model)
    print("Model size: {:.3f} M".format(count_num_param(model)))

    criterion = get_criterion(dm.num_train_pids, use_gpu, args)
    regularizer = get_regularizer(vars(args))
    optimizer = init_optimizer(model.parameters(), **optimizer_kwargs(args))
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)

    if args.load_weights and check_isfile(args.load_weights):
        # load pretrained weights but ignore layers that don't match in size
        try:
            checkpoint = torch.load(args.load_weights)
        except Exception as e:
            print(e)
            checkpoint = torch.load(args.load_weights,
                                    map_location={'cuda:0': 'cpu'})

        pretrain_dict = checkpoint['state_dict']
        model_dict = model.state_dict()
        pretrain_dict = {
            k: v
            for k, v in pretrain_dict.items()
            if k in model_dict and model_dict[k].size() == v.size()
        }
        model_dict.update(pretrain_dict)
        model.load_state_dict(model_dict)
        print("Loaded pretrained weights from '{}'".format(args.load_weights))

    if args.resume and check_isfile(args.resume):
        checkpoint = torch.load(args.resume)
        state = model.state_dict()
        state.update(checkpoint['state_dict'])
        model.load_state_dict(state)
        # args.start_epoch = checkpoint['epoch'] + 1
        print("Loaded checkpoint from '{}'".format(args.resume))
        print("- start_epoch: {}\n- rank1: {}".format(args.start_epoch,
                                                      checkpoint['rank1']))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")

        for name in args.target_names:
            print("Evaluating {} ...".format(name))
            queryloader = testloader_dict[name]['query'], testloader_dict[
                name]['query_flip']
            galleryloader = testloader_dict[name]['gallery'], testloader_dict[
                name]['gallery_flip']
            distmat = test(model,
                           queryloader,
                           galleryloader,
                           use_gpu,
                           return_distmat=True)

            if args.visualize_ranks:
                visualize_ranked_results(distmat,
                                         dm.return_testdataset_by_name(name),
                                         save_dir=osp.join(
                                             args.save_dir, 'ranked_results',
                                             name),
                                         topk=20)
        return

    start_time = time.time()
    ranklogger = RankLogger(args.source_names, args.target_names)
    train_time = 0
    print("==> Start training")

    if args.fixbase_epoch > 0:
        oldenv = os.environ.get('sa', '')
        os.environ['sa'] = ''
        print(
            "Train {} for {} epochs while keeping other layers frozen".format(
                args.open_layers, args.fixbase_epoch))
        initial_optim_state = optimizer.state_dict()

        for epoch in range(args.fixbase_epoch):
            start_train_time = time.time()
            train(epoch,
                  model,
                  criterion,
                  regularizer,
                  optimizer,
                  trainloader,
                  use_gpu,
                  fixbase=True)
            train_time += round(time.time() - start_train_time)

        print("Done. All layers are open to train for {} epochs".format(
            args.max_epoch))
        optimizer.load_state_dict(initial_optim_state)
        os.environ['sa'] = oldenv

    max_r1 = 0

    for epoch in range(args.start_epoch, args.max_epoch):
        start_train_time = time.time()
        print(epoch)
        print(criterion)

        train(epoch,
              model,
              criterion,
              regularizer,
              optimizer,
              trainloader,
              use_gpu,
              fixbase=False)
        train_time += round(time.time() - start_train_time)

        if use_gpu:
            state_dict = model.module.state_dict()
        else:
            state_dict = model.state_dict()

        save_checkpoint(
            {
                'state_dict': state_dict,
                'rank1': 0,
                'epoch': epoch,
            }, False,
            osp.join(args.save_dir,
                     'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

        scheduler.step()

        if (epoch + 1) > args.start_eval and args.eval_freq > 0 and (
                epoch + 1) % args.eval_freq == 0 or (epoch +
                                                     1) == args.max_epoch:
            print("==> Test")

            for name in args.target_names:
                print("Evaluating {} ...".format(name))
                queryloader = testloader_dict[name]['query'], testloader_dict[
                    name]['query_flip']
                galleryloader = testloader_dict[name][
                    'gallery'], testloader_dict[name]['gallery_flip']
                rank1 = test(model, queryloader, galleryloader, use_gpu)
                ranklogger.write(name, epoch + 1, rank1)

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()

            if max_r1 < rank1:
                print('Save!', max_r1, rank1)
                save_checkpoint(
                    {
                        'state_dict': state_dict,
                        'rank1': rank1,
                        'epoch': epoch,
                    }, False, osp.join(args.save_dir,
                                       'checkpoint_best.pth.tar'))

                max_r1 = rank1

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
    ranklogger.show_summary()
Example #7
    def __init__(self, opt):
        super(ICPR_model, self).__init__(opt)
        train_opt = opt['train']

        # define networks and load pretrained models
        self.netG = networks.define_G1(opt).to(self.device)  # G1
        if self.is_train:
            self.netV = networks.define_D(opt).to(self.device)
            self.netD = networks.define_D2(opt).to(self.device)
            self.netQ = networks.define_Q(opt).to(self.device)
            self.netG.train()
            self.netV.train()
            self.netD.train()
        self.load()  # load G and D if needed

        # define losses, optimizer and scheduler
        if self.is_train:
            # G pixel loss
            if train_opt['pixel_weight'] > 0:
                l_pix_type = train_opt['pixel_criterion']
                if l_pix_type == 'l1':
                    self.cri_pix = nn.L1Loss().to(self.device)
                elif l_pix_type == 'l2':
                    self.cri_pix = nn.MSELoss().to(self.device)
                else:
                    raise NotImplementedError('Loss type [{:s}] not recognized.'.format(l_pix_type))
                self.l_pix_w = train_opt['pixel_weight']
            else:
                logger.info('Remove pixel loss.')
                self.cri_pix = None
            self.weight_kl = 1e-2
            self.weight_D = 1e-4
            self.l_gan_w = 1e-4

            # G feature loss
            if train_opt['feature_weight'] > 0:
                l_fea_type = train_opt['feature_criterion']
                if l_fea_type == 'l1':
                    self.cri_fea = nn.L1Loss().to(self.device)
                elif l_fea_type == 'l2':
                    self.cri_fea = nn.MSELoss().to(self.device)
                else:
                    raise NotImplementedError('Loss type [{:s}] not recognized.'.format(l_fea_type))
                self.l_fea_w = train_opt['feature_weight']
            else:
                logger.info('Remove feature loss.')
                self.cri_fea = None
            if self.cri_fea:  # load VGG perceptual loss
                self.netF = networks.define_F(opt, use_bn=False, Rlu=True).to(self.device)  # Rlu=True if the feature is taken before the ReLU, else False

            self.cri_gan = GANLoss(train_opt['gan_type'], 1.0, 0.0).to(self.device)
            # optimizers
            # G
            wd_G = train_opt['weight_decay_G'] if train_opt['weight_decay_G'] else 0
            optim_params = []
            for k, v in self.netG.named_parameters():  # can optimize for a part of the model
                if v.requires_grad:
                    optim_params.append(v)
                else:
                    logger.warning('Params [{:s}] will not optimize.'.format(k))
            self.optimizer_G = torch.optim.Adam(optim_params, lr=train_opt['lr_G'], \
                weight_decay=wd_G, betas=(train_opt['beta1_G'], 0.999))
            self.optimizers.append(self.optimizer_G)

            #D
            wd_D = train_opt['weight_decay_D'] if train_opt['weight_decay_D'] else 0
            self.optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=train_opt['lr_D'], \
                weight_decay=wd_D, betas=(train_opt['beta1_D'], 0.999))
            self.optimizers.append(self.optimizer_D)

            self.optimizer_V = torch.optim.Adam(self.netV.parameters(), lr=train_opt['lr_D'], \
                weight_decay=wd_D, betas=(train_opt['beta1_D'], 0.999))
            self.optimizers.append(self.optimizer_V)

            # schedulers
            if train_opt['lr_scheme'] == 'MultiStepLR':
                for optimizer in self.optimizers:
                    self.schedulers.append(lr_scheduler.MultiStepLR(optimizer, \
                        train_opt['lr_steps'], train_opt['lr_gamma']))
            else:
                raise NotImplementedError('MultiStepLR learning rate scheme is enough.')

            self.log_dict = OrderedDict()
        # print network
        self.print_network()
Example #8
if __name__ == '__main__':
    args.workspace = os.path.join(args.workspace, args.exp_name)
    os.makedirs(args.workspace, exist_ok=True)
    logger = setup_logger(os.path.join(args.workspace, 'train_icdar15_log'))

    criterion = Loss()
    device = torch.device("cuda")
    model = EAST()
    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[args.epoch_iter // 2],
                                         gamma=0.1)

    # generate the initial round of pseudo-labels first
    logger.info("loading pretrained model from %s", args.resume)
    # model.load_state_dict(torch.load(args.resume))

    # target domain
    trainset = ICDAR15(args.train_data, args.train_gt)
    train_loader_target = data.DataLoader(trainset,
                                          batch_size=args.batch_size,
                                          shuffle=True,
                                          num_workers=args.num_workers,
                                          drop_last=True)
Example #9
def main(opt):

    lr1 = opt.lr
    lr_steps = opt.lr_steps
    gpu_idx = opt.gpu_idx
    batch_size = opt.batch_size
    arch = opt.arch
    logdir = os.path.join(opt.logdir, arch)
    os.makedirs(logdir, exist_ok=True)
    n_epochs = opt.n_epochs

    train_writer = SummaryWriter(os.path.join(logdir, 'train'))
    test_writer = SummaryWriter(os.path.join(logdir, 'test'))

    if gpu_idx != 999:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_idx)  # has no effect once CUDA has been initialized
        torch.cuda.set_device(0)
    device = torch.device("cuda:0")

    best_prec1 = 0

    res = 256
    center_crop = 224
    train_transform = transforms.Compose([
        transforms.Resize(res),
        transforms.CenterCrop(center_crop),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),  # because the input dtype is PIL Image
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    val_transform = transforms.Compose([
        transforms.Resize(res),
        transforms.CenterCrop(center_crop),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    # Data loaders
    dataset_path = opt.dataset_path

    training_set = torchvision.datasets.ImageFolder(os.path.join(
        dataset_path, 'train'),
                                                    transform=train_transform)
    if opt.data_sampler == 'weighted':
        train_sampler_weights = make_weights_for_balanced_classes(
            training_set.imgs, len(training_set.classes))
        train_sampler_weights = torch.DoubleTensor(train_sampler_weights)
        train_sampler = torch.utils.data.sampler.WeightedRandomSampler(
            train_sampler_weights, len(train_sampler_weights))
        train_loader = data.DataLoader(training_set,
                                       sampler=train_sampler,
                                       batch_size=batch_size,
                                       num_workers=8,
                                       pin_memory=True)
    else:
        train_loader = data.DataLoader(training_set,
                                       batch_size=batch_size,
                                       num_workers=5,
                                       pin_memory=True)

    val_set = torchvision.datasets.ImageFolder(os.path.join(
        dataset_path, 'test'),
                                               transform=val_transform)
    val_loader = data.DataLoader(val_set,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 num_workers=5,
                                 pin_memory=True)

    num_classes = 33
    if opt.arch == 'vgg':
        # use vgg architecture
        # model_ft = vgg19(pretrained=True)  ##### Model Structure Here
        model_ft = models.__dict__['vgg19'](pretrained=True)
        model_ft.classifier[6] = nn.Linear(
            4096,
            num_classes)  # change last layer to fit the number of classes
    elif 'resnet' in opt.arch:
        # use resnet architecture
        model_ft = models.__dict__[opt.arch](pretrained=True)
        if opt.arch in ('resnet18', 'resnet34'):
            model_ft.fc = nn.Linear(512, num_classes)
        elif opt.arch in ('resnet50', 'resnet101', 'resnet152'):
            model_ft.fc = nn.Linear(2048, num_classes)
    else:
        raise ValueError("unsupported architecture")

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model_ft = nn.DataParallel(model_ft)

    if opt.refine:
        if opt.refine_epoch == 0:
            raise ValueError(
                "You set the refine epoch to 0. No need to refine, just retrain."
            )
        refine_model_filename = os.path.join(
            logdir, 'classifier{}.pth'.format(opt.refine_epoch))
        model_ft.load_state_dict(torch.load(refine_model_filename))

    model_ft.to(device)
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model_ft.parameters(),
                          lr=lr1,
                          momentum=0.9,
                          weight_decay=5e-4)
    scheduler = lr_scheduler.MultiStepLR(
        optimizer, milestones=opt.lr_steps,
        gamma=0.1)  # milestones in number of optimizer iterations

    refine_flag = True
    for epoch in range(1, n_epochs):
        if epoch <= opt.refine_epoch and opt.refine and refine_flag:
            scheduler.step()
            continue
        else:
            refine_flag = False
        # adjust_learning_rate(optimizer, epoch, lr1, lr_steps)

        train_fraction_done = 0.0

        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(val_loader, 0)

        # train for one epoch
        losses = utils.AverageMeter()
        top1 = utils.AverageMeter()
        top3 = utils.AverageMeter()

        model_ft.train()
        optimizer.zero_grad()

        train_num_batch = len(train_loader)
        test_num_batch = len(val_loader)

        for train_batchind, (im_data, im_class) in enumerate(train_loader):

            model_ft.train()
            im_data = im_data.to(device)
            im_class = im_class.to(device)
            # batch_size = im_data.shape[0]

            optimizer.zero_grad()
            output = model_ft(im_data)

            # measure accuracy and record loss
            prec1, prec3 = utils.accuracy(output.data.detach(),
                                          im_class,
                                          topk=(1, 3))
            loss = criterion(output, im_class)
            loss.backward()

            # compute gradient and do SGD step
            optimizer.step()

            losses.update(loss.item(), im_data.size(0))
            top1.update(prec1.item(), im_data.size(0))
            top3.update(prec3.item(), im_data.size(0))

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@3 {top3.val:.3f} ({top3.avg:.3f})'.format(
                      epoch,
                      train_batchind + 1,
                      len(train_loader) + 1,
                      loss=losses,
                      top1=top1,
                      top3=top3))
            train_fraction_done = (train_batchind + 1) / train_num_batch
            train_writer.add_scalar('loss', losses.val,
                                    (epoch + train_fraction_done) *
                                    train_num_batch * batch_size)
            train_writer.add_scalar('top1', top1.val,
                                    (epoch + train_fraction_done) *
                                    train_num_batch * batch_size)
            train_writer.add_scalar('top3', top3.val,
                                    (epoch + train_fraction_done) *
                                    train_num_batch * batch_size)

            train_fraction_done = (train_batchind + 1) / train_num_batch

            # evaluate on a fraction of the validation set
            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                test_losses = utils.AverageMeter()
                test_top1 = utils.AverageMeter()
                test_top3 = utils.AverageMeter()

                # switch to evaluate mode
                model_ft.eval()
                test_batchind, (im_data, im_class) = next(test_enum)
                with torch.no_grad():
                    im_data = im_data.to(device)
                    im_class = im_class.to(device)

                    # compute output
                    output = model_ft(im_data)
                    test_loss = criterion(output, im_class)
                    # measure accuracy and record loss
                    prec1, prec3 = utils.accuracy(output.data,
                                                  im_class,
                                                  topk=(1, 3))
                    test_losses.update(test_loss.item(), im_data.size(0))
                    test_top1.update(prec1.item(), im_data.size(0))
                    test_top3.update(prec3.item(), im_data.size(0))
                    print('Test: [{0}/{1}]\t'
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                          'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                          'Prec@3 {top3.val:.3f} ({top3.avg:.3f})\t'.format(
                              test_batchind,
                              len(val_loader),
                              loss=test_losses,
                              top1=test_top1,
                              top3=test_top3))
                    test_writer.add_scalar('loss', test_losses.val,
                                           (epoch + train_fraction_done) *
                                           train_num_batch * batch_size)
                    test_writer.add_scalar('top1', test_top1.val,
                                           (epoch + train_fraction_done) *
                                           train_num_batch * batch_size)
                    test_writer.add_scalar('top3', test_top3.val,
                                           (epoch + train_fraction_done) *
                                           train_num_batch * batch_size)
                    test_writer.add_scalar('lr',
                                           optimizer.param_groups[0]['lr'],
                                           (epoch + train_fraction_done) *
                                           train_num_batch * batch_size)
                test_fraction_done = (test_batchind + 1) / test_num_batch

        scheduler.step()

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        if (epoch + 1) % 2 == 0:
            # save model
            model_tmp = copy.deepcopy(model_ft.state_dict())
            model_ft.load_state_dict(model_tmp)
            torch.save(
                model_ft.state_dict(),
                os.path.join(logdir, 'classifier' + str(epoch) + '.pth'))
        if is_best:
            model_tmp = copy.deepcopy(model_ft.state_dict())
            model_ft.load_state_dict(model_tmp)
            torch.save(model_ft.state_dict(),
                       os.path.join(logdir, 'best_classifier.pth'))
Example #10
                        batch_size=BATCH_SIZE ,
                        shuffle=True, num_workers=1)

## Initializing r, theta
P,Pall = gridRing(N)
Drr = abs(P)
Drr = torch.from_numpy(Drr).float()
Dtheta = np.angle(P)
Dtheta = torch.from_numpy(Dtheta).float()
# What and where is gamma

## Create the model
model = OFModel(Drr, Dtheta, T, PRE, gpu_id)
model.cuda(gpu_id)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[100,150], gamma=0.1) # if Kitti: milestones=[100,150], UCF [50,100]
loss_mse = nn.MSELoss()
start_epoch = 1

## If want to continue training from a checkpoint
if load_ckpt:
    loadedcheckpoint = torch.load(ckpt_file)
    start_epoch = loadedcheckpoint['epoch']
    model.load_state_dict(loadedcheckpoint['state_dict'])
    optimizer.load_state_dict(loadedcheckpoint['optimizer'])

print("Training from epoch: ", start_epoch)
print('-' * 25)
start = time.time()

count = 0
Example #11
print(nets)

gpus = [int(gpu) for gpu in args.gpus.split(',')]
if len(gpus) > 1:
  print("Using GPUs {}.".format(gpus))
  nets = [nn.DataParallel(net, device_ids=gpus) if net is not None else None
          for net in nets]  # rebind the list entries; reassigning the loop variable has no effect

params = [{'params': net.parameters()} for net in nets]

solver = optim.Adam(
    params,
    lr=args.lr)

milestones = [int(s) for s in args.schedule.split(',')]
scheduler = LS.MultiStepLR(solver, milestones=milestones, gamma=args.gamma)

if not os.path.exists(args.model_dir):
  print("Creating directory %s." % args.model_dir)
  os.makedirs(args.model_dir)

############### Checkpoints ###############
def resume(model_name, index):
  names = ['encoder', 'binarizer', 'decoder', 'unet', 'd2']

  for net_idx, net in enumerate(nets):
    if net is not None:
      name = names[net_idx]
      checkpoint_path = '{}/{}_{}_{:08d}.pth'.format(
          args.model_dir, model_name, 
          name, index)
Example #12
def main(args):
    # ***** parameters *****
    batch_size = 8
    num_workers = 1
    num_epoch = 200
    resume_epoch = 0
    resume = False

    epoch_samples = 4421
    NUM_POINTS = 2048
    lr = 0.001
    weight_decay = 1e-4
    milestones = [60]  # [30, 60]
    which_dir = args.dir
    OBJ_CLASS = [args.cat]

    # load data
    train_dataset = PartDataset(num_ptrs=NUM_POINTS,
                                plane_num=32,
                                class_choice=OBJ_CLASS,
                                random_selection=True,
                                random_jitter=True,
                                random_scale=True,
                                random_translation=False,
                                which_dir=which_dir,
                                split='trainval')
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=num_workers)

    test_dataset = PartDataset(num_ptrs=NUM_POINTS,
                               plane_num=32,
                               class_choice=OBJ_CLASS,
                               split='test',
                               which_dir=which_dir)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=8,
                                                  shuffle=False,
                                                  num_workers=num_workers)

    print('Training set size:', len(train_dataset))
    print('Test set size:', len(test_dataset))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    seg_classes = train_dataset.seg_classes
    seg_label_to_cat = {}  # {0:Airplane, 1:Airplane, ...49:Table}
    for cat in seg_classes.keys():
        for label in seg_classes[cat]:
            seg_label_to_cat[label] = cat

    NUM_CLASS = len(seg_classes[OBJ_CLASS[0]])

    # ***** specify model and log output directory *****
    time_stamp = time.strftime("%Y%m%d-%H%M%S")

    curr_dir = os.path.dirname(os.path.abspath(__file__))
    model_out_dir = '/media/pwu/Data/saved_models/point_cloud/shapepart/RCNet/'
    log_out_dir = os.path.join(curr_dir, 'results')
    try:
        os.makedirs(log_out_dir)
    except OSError:
        pass

    save_model_dir_root = check_dir(
        os.path.join(
            '/media/pwu/Data/saved_models/point_cloud/shapepart/RCNet/',
            'save_' + str(which_dir)))
    save_model_dir_class = check_dir(
        os.path.join(save_model_dir_root, OBJ_CLASS[0]))
    save_model_dir = check_dir(os.path.join(save_model_dir_class, time_stamp))

    # ***** specify logger *****
    # log_dir = os.path.join(log_out_dir, 'log-' + time_stamp + '.txt')
    # logging.basicConfig(level=logging.INFO,
    #                     format='%(asctime)s %(message)s',
    #                     filename=log_dir,
    #                     filemode='w')
    save_weights_name = time_stamp

    # ***** build model *****
    classifier = EnsembleRCNet(device, which_dir, NUM_CLASS, NUM_POINTS)
    print(classifier)
    temp = sum(p.numel() for p in classifier.parameters() if p.requires_grad)
    print("num_parameter", temp)

    # ***** load existing model *****
    model_path = os.path.join(model_out_dir,
                              'cls_model_' + str(resume_epoch) + '.pth')
    if model_path != '' and resume is True:
        classifier.load_state_dict(torch.load(model_path))

    # ***** define optimizer *****
    optimizer = optim.Adam(classifier.parameters(),
                           lr=lr,
                           weight_decay=weight_decay,
                           amsgrad=False)
    classifier.to(device)

    # ***** scheduler *****
    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer,
                                                milestones=milestones,
                                                gamma=0.1)
    # exp_lr_scheduler = CosineLRWithRestarts(optimizer, batch_size, epoch_samples, restart_period=5, t_mult=2)
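    # Illustrative note (not in the original): with lr=0.001, milestones=[60] and
    # gamma=0.1, the learning rate stays at 1e-3 for epochs 0-59 and drops to 1e-4
    # afterwards; step() is called at the top of each epoch below, the pre-1.1
    # PyTorch convention.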

    num_batch = len(train_dataset) / batch_size

    if resume:
        start_epoch = resume_epoch + 1
    else:
        start_epoch = 0

    curr_shape_ious = None

    for epoch in range(start_epoch, num_epoch):
        exp_lr_scheduler.step()
        classifier.train()

        # statistic data
        single_shape_ious = []

        for b, data in enumerate(train_dataloader):
            target, points, quantiles, ori_points_num, gather_idx, ori_point_idx = data
            target = target - seg_classes[OBJ_CLASS[0]][0]

            target = target.to(device)
            points = points.to(device)

            # ***************************************************************
            # first, prepare the input to rnn
            seq_data, seq_len, inverse_index = prepare_input_first_level(
                points, quantiles)
            seq_data = torch.from_numpy(seq_data.astype(np.float32))
            seq_data = seq_data.to(device)

            # next, prepare for the data index for convolution
            batch_num = quantiles.shape[0]
            plane_num = quantiles.shape[1]
            items_indices = np.array([], dtype=np.int32)
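            # items_indices collects the running counter values (flattened
            # batch x plane x plane positions) whose grid cell is non-empty,
            # i.e. quantiles[i, j, k] != 0.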
            cnt = 0
            for i in range(batch_num):
                plane_slice = []
                for j in range(plane_num):
                    item = []
                    for k in range(plane_num):
                        num = quantiles[i, j, k]
                        if num != 0:
                            items_indices = np.append(items_indices, cnt)
                        cnt = cnt + 1
            # ***************************************************************

            optimizer.zero_grad()
            pred = classifier(points, quantiles, seq_data, seq_len,
                              inverse_index, items_indices, gather_idx,
                              ori_point_idx)

            loss = F.cross_entropy(
                pred.view(-1, NUM_CLASS), target.view(-1)
            )  # cross_entropy = log_softmax + nll_loss, so this is equivalent to the nll_loss formulation
            loss.backward()
            optimizer.step()

            # compute ious
            cur_pred_val_logits = pred.data.cpu().numpy()
            cur_pred_val = np.zeros(
                (pred.size(0), NUM_POINTS)).astype(np.int32)

            ori_points_num = ori_points_num.numpy().squeeze().tolist()
            target = target.data.cpu().numpy()
            for i in range(pred.size(0)):
                logits = cur_pred_val_logits[i, :, :]
                cur_pred_val[i, 0:ori_points_num[i]] = np.argmax(
                    logits, 1)[0:ori_points_num[i]]

            for i in range(pred.size(0)):
                segp = cur_pred_val[i, 0:ori_points_num[i]]
                segl = target[i, 0:ori_points_num[i]]
                cat = OBJ_CLASS[0]
                part_ious = [0.0 for _ in range(NUM_CLASS)]
                for l in range(NUM_CLASS):
                    if (np.sum(segl == l) == 0) and (
                            np.sum(segp == l) == 0
                    ):  # part is not present, no prediction as well
                        part_ious[l] = 1.0
                    else:
                        part_ious[l] = np.sum(
                            (segl == l) & (segp == l)) / float(
                                np.sum((segl == l) | (segp == l)))
                single_shape_ious.append(np.mean(part_ious))

            curr_shape_ious = np.mean(single_shape_ious)

            msg = '[{0:d}: {1:d}/{2:d}] mean IoUs: {3:f}'.format(
                epoch, b, trunc(num_batch), curr_shape_ious)
            print(msg)

        curr_shape_ious = np.mean(single_shape_ious)
        msg = '*** train epoch {}, mean IoUs: {}'.format(
            epoch, curr_shape_ious)
        # logging.info(msg)
        print(msg)

        # evaluate
        single_shape_ious = []

        classifier.eval()
        ttime = []
        for b, data in enumerate(test_dataloader):
            target, points, quantiles, ori_points_num, gather_idx, ori_point_idx = data
            target = target - seg_classes[OBJ_CLASS[0]][0]

            target = target.to(device)
            points = points.to(device)

            # ***************************************************************
            # first, prepare the input to rnn
            seq_data, seq_len, inverse_index = prepare_input_first_level(
                points, quantiles)
            seq_data = torch.from_numpy(seq_data.astype(np.float32))
            seq_data = seq_data.to(device)

            # next, prepare for the data index for convolution
            batch_num = quantiles.shape[0]
            plane_num = quantiles.shape[1]
            items_indices = np.array([], dtype=np.int32)
            cnt = 0
            for i in range(batch_num):
                plane_slice = []
                for j in range(plane_num):
                    item = []
                    for k in range(plane_num):
                        num = quantiles[i, j, k]
                        if num != 0:
                            items_indices = np.append(items_indices, cnt)
                        cnt = cnt + 1
            # ***************************************************************
            start = timeit.default_timer()
            pred = classifier(points, quantiles, seq_data, seq_len,
                              inverse_index, items_indices, gather_idx,
                              ori_point_idx)
            stop = timeit.default_timer()
            print("time >>", stop - start)
            ttime.append(stop - start)

            # compute ious
            cur_pred_val_logits = pred.data.cpu().numpy()
            cur_pred_val = np.zeros(
                (pred.size(0), NUM_POINTS)).astype(np.int32)

            ori_points_num = ori_points_num.numpy().squeeze().tolist()
            target = target.data.cpu().numpy()
            for i in range(pred.size(0)):
                logits = cur_pred_val_logits[i, :, :]
                cur_pred_val[i, 0:ori_points_num[i]] = np.argmax(
                    logits, 1)[0:ori_points_num[i]]

            for i in range(pred.size(0)):
                segp = cur_pred_val[i, 0:ori_points_num[i]]
                segl = target[i, 0:ori_points_num[i]]
                cat = OBJ_CLASS[0]
                part_ious = [0.0 for _ in range(NUM_CLASS)]
                for l in range(NUM_CLASS):
                    if (np.sum(segl == l) == 0) and (
                            np.sum(segp == l) == 0
                    ):  # part is not present, no prediction as well
                        part_ious[l] = 1.0
                    else:
                        part_ious[l] = np.sum(
                            (segl == l) & (segp == l)) / float(
                                np.sum((segl == l) | (segp == l)))
                single_shape_ious.append(np.mean(part_ious))

        curr_shape_ious = np.mean(single_shape_ious)

        msg = '*** Test mean IoUs: {0:f}'.format(curr_shape_ious)
        # logging.info(msg)
        print(msg)

        #if epoch % 10 == 0:
        # torch.save(classifier.state_dict(), '{}/{}.pth'.format(save_model_dir, curr_shape_ious))

        # logging.info(msg)

        # torch.save(classifier.state_dict(), '%s/cls_model_%d.pth' % (model_out_dir, epoch))

    return curr_shape_ious
Example #13
0
    # {'params': model.classifierD0.parameters(), 'lr': opt.lr},

    # {'params': model.classifierB3.parameters(), 'lr': opt.lr},
    # {'params': model.classifierB4.parameters(), 'lr': opt.lr},
    # {'params': model.classifierB5.parameters(), 'lr': opt.lr},

    # {'params': model.classifierC2.parameters(), 'lr': opt.lr},
    # {'params': model.classifierC3.parameters(), 'lr': opt.lr},

    # {'params': model.classifier.parameters(), 'lr': opt.lr},

    # ], weight_decay=5e-4, momentum=0.9, nesterov=True)

# Decay LR by a factor of 0.1 at epochs 40 and 70
exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer,
                                            milestones=[40, 70],
                                            gamma=0.1)

######################################################################
# Train and evaluate
# --------
#

dir_name = os.path.join('./logs', opt.name)
if not os.path.isdir(dir_name):
    os.mkdir(dir_name)
# record every run
copyfile('./train_irid.py', dir_name + '/train_irid.py')
copyfile('models/base_model.py', dir_name + '/base_model.py')
if opt.LSTM:
    copyfile('models/lstm_model.py', dir_name + '/lstm_model.py')
    model_ft = models.resnet18(pretrained=True)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, 6)

    if use_gpu:
        model_ft = model_ft.cuda()

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.01, momentum=0.9)

    # Decay LR by a factor of 0.5 at epochs 55 and 75 (a StepLR alternative is commented out below)
    # exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft, [55, 75],
                                                gamma=0.5,
                                                last_epoch=-1)
    '''
    Result recording
    '''
    result_dir = 'result/1234'
    # folder
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    # image sample display

    model_ft = train_model(model_ft,
                           optimizer_ft,
                           exp_lr_scheduler,
                           num_epochs=100)
Example #15
0
    def __init__(self, opt):
        super(SRFeatModel, self).__init__(opt)
        train_opt = opt['train']

        # define networks and load pretrained models
        self.netG = networks.define_G(opt).to(self.device)  # G
        if self.is_train:
            self.netD1 = networks.define_D(opt).to(self.device)  # D
            self.netD2 = networks.define_DF(opt).to(self.device)  # D
            self.netG.train()
            self.netD1.train()
            self.netD2.train()
        self.load()  # load G and D if needed

        # define losses, optimizer and scheduler
        if self.is_train:
            # G pixel loss
            if train_opt['pixel_weight'] > 0:
                l_pix_type = train_opt['pixel_criterion']
                if l_pix_type == 'l1':
                    self.cri_pix = nn.L1Loss().to(self.device)
                elif l_pix_type == 'l2':
                    self.cri_pix = nn.MSELoss().to(self.device)
                else:
                    raise NotImplementedError('Loss type [{:s}] not recognized.'.format(l_pix_type))
                self.l_pix_w = train_opt['pixel_weight']
            else:
                logger.info('Remove pixel loss.')
                self.cri_pix = None

            # G feature loss
            if train_opt['feature_weight'] > 0:
                l_fea_type = train_opt['feature_criterion']
                if l_fea_type == 'l1':
                    self.cri_fea = nn.L1Loss().to(self.device)
                elif l_fea_type == 'l2':
                    self.cri_fea = nn.MSELoss().to(self.device)
                else:
                    raise NotImplementedError('Loss type [{:s}] not recognized.'.format(l_fea_type))
                self.l_fea_w = train_opt['feature_weight']
            else:
                logger.info('Remove feature loss.')
                self.cri_fea = None
            if self.cri_fea:  # load VGG perceptual loss
                self.netF = networks.define_F(opt, use_bn=False).to(self.device)

            # GD gan loss
            self.cri_gan = GANLoss(train_opt['gan_type'], 1.0, 0.0).to(self.device)
            self.l_gan_w = train_opt['gan_weight']
            # D_update_ratio and D_init_iters are for WGAN
            self.D_update_ratio = train_opt['D_update_ratio'] if train_opt['D_update_ratio'] else 1
            self.D_init_iters = train_opt['D_init_iters'] if train_opt['D_init_iters'] else 0

            if train_opt['gan_type'] == 'wgan-gp':
                self.random_pt = torch.Tensor(1, 1, 1, 1).to(self.device)
                # gradient penalty loss
                self.cri_gp = GradientPenaltyLoss(device=self.device).to(self.device)
                self.l_gp_w = train_opt['gp_weigth']

            # optimizers
            # G
            wd_G = train_opt['weight_decay_G'] if train_opt['weight_decay_G'] else 0
            optim_params = []
            for k, v in self.netG.named_parameters():  # can optimize for a part of the model
                if v.requires_grad:
                    optim_params.append(v)
                else:
                    logger.warning('Params [{:s}] will not optimize.'.format(k))
            self.optimizer_G = torch.optim.Adam(optim_params, lr=train_opt['lr_G'], \
                weight_decay=wd_G, betas=(train_opt['beta1_G'], 0.999))
            self.optimizers.append(self.optimizer_G)
            # D1 and D2
            wd_D = train_opt['weight_decay_D'] if train_opt['weight_decay_D'] else 0
            self.optimizer_D1 = torch.optim.Adam(self.netD1.parameters(), lr=train_opt['lr_D'], \
                weight_decay=wd_D, betas=(train_opt['beta1_D'], 0.999))
            self.optimizers.append(self.optimizer_D1)
            self.optimizer_D2 = torch.optim.Adam(self.netD2.parameters(), lr=train_opt['lr_D'], \
                weight_decay=wd_D, betas=(train_opt['beta1_D'], 0.999))
            self.optimizers.append(self.optimizer_D2)

            # schedulers
            if train_opt['lr_scheme'] == 'MultiStepLR':
                for optimizer in self.optimizers:
                    self.schedulers.append(lr_scheduler.MultiStepLR(optimizer, \
                        train_opt['lr_steps'], train_opt['lr_gamma']))
            else:
                raise NotImplementedError('MultiStepLR learning rate scheme is enough.')

            self.log_dict = OrderedDict()
        # print network
        self.print_network()
Example #16
0
#checkpoint = torch.load('./model/model_mulstep.pth')
net = googlenet()
net = nn.DataParallel(net)

#net.load_state_dict(checkpoint)
#net.load_state_dict(torch.load('./model/model_mulstep.pth'))
#net = torch.load('./model/model_mulstep.pth')
#net = nn.DataParallel(net)
net.cuda()
# define the loss function and the optimization method
criterion = nn.CrossEntropyLoss()  # cross-entropy loss, commonly used for multi-class classification
optimizer = optim.SGD(
    net.parameters(), lr=LR, momentum=0.9,
    weight_decay=5e-4)  # mini-batch momentum SGD with L2 regularization (weight decay)
lr_schedule = lr_scheduler.MultiStepLR(optimizer, [60, 80], 0.1)
#lr_schedule = lr_scheduler.ExponentialLR(optimizer,gamma=1)
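# Illustrative note (not in the original): MultiStepLR drops the rate to LR*0.1 after
# epoch 60 and to LR*0.01 after epoch 80, whereas the commented ExponentialLR line
# with gamma=1 would keep the rate constant.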

# training
if __name__ == "__main__":
    #if not os.path.exists(args.outf):
    #os.makedirs(args.outf)
    best_acc = 0  # initialize the best test accuracy
    print("Start Training, googlenet!")  # EPOCH defines how many passes over the dataset
    with open("ci-acc16.txt", "w") as f:
        with open("ci-log16.txt", "w") as f2:
            for epoch in range(0, EPOCH):
                print('\nEpoch: %d' % (epoch + 1))
                net.train()
                sum_loss = 0.0
                correct = 0.0
Example #17
0
def experiment(args):
    # load data
    data = pd.read_pickle(os.path.join(args.dataset_root, 'dataset.pkl'))
    data = data[data.sensor.str.contains('|'.join(args.sensors))]  # filter sensors

    # load splits
    splits = pd.read_csv(os.path.join(args.dataset_root, 'train_test_split.csv'))
    train_patients = splits[splits.split.str.contains('train')].patient_hash.tolist()
    test_patients = splits[splits.split.str.contains('test')].patient_hash.tolist()

    # get data according to the patient split
    train_data = data[data.patient_hash.str.contains('|'.join(train_patients))]
    test_data = data[data.patient_hash.str.contains('|'.join(test_patients))]

    # subset the dataset
    train_dataset = COVID19Dataset(args, train_data, get_transforms(args, 'train'))
    test_dataset = COVID19Dataset(args, test_data, get_transforms(args, 'test'))

    # For unbalanced dataset we create a weighted sampler
    train_labels = [sum(l) for l in train_data.label.tolist()]
    weights = get_weights_for_balanced_classes(train_labels, len(list(set(train_labels))))
    weights = torch.DoubleTensor(weights)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights=weights, num_samples=len(weights))
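    # Illustrative note (not in the original): get_weights_for_balanced_classes
    # presumably gives each sample a weight inversely proportional to the size of its
    # class (roughly N_total / N_class), so the sampler draws minority classes about
    # as often as majority ones.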

    nclasses = len(list(set(train_labels)))
    # dataloaders from subsets
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        sampler=sampler,
        num_workers=args.num_workers,
        drop_last=True)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        sampler=None,
        num_workers=args.num_workers,
        drop_last=False)

    # create directories
    args.weights_dir = os.path.join('logs', args.run_name, 'weights')
    os.makedirs(args.weights_dir, exist_ok=True)
    args.train_viz_dir = os.path.join('logs', args.run_name, 'viz_train')
    os.makedirs(args.train_viz_dir, exist_ok=True)
    args.test_viz_dir = os.path.join('logs', args.run_name, 'viz_test')
    os.makedirs(args.test_viz_dir, exist_ok=True)

    model = CNNConStn(args.img_size, nclasses, args.fixed_scale)
    print(model)
    print('Number of params in the model: {}'.format(
        *[sum([p.data.nelement() for p in net.parameters()]) for net in [model]]))
    model = model.cuda()

    # fixed samples for stn visualization
    fixed_samples_iter = iter(train_loader)
    fixed_samples_train, fixed_y_train = next(fixed_samples_iter)
    fixed_samples_iter = iter(test_loader)
    fixed_samples_test, _ = next(fixed_samples_iter)

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)
    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[70], gamma=0.1) # 10, 50
    state_dict = {'best_f1': 0., 'precision': 0., 'recall': 0., 'accuracy': 0.}
    for epoch in range(args.epochs):
        model = train(args, model, train_loader, len(list(set(train_labels))), optimizer, epoch,
                      fixed_samples_train, fixed_y_train)
        test(args, model, test_loader, len(list(set(train_labels))), epoch, state_dict, args.weights_dir,
             fixed_samples_test)
        exp_lr_scheduler.step()
Example #18
0
def train(opt):
    """ dataset preparation """
    if opt.select_data == 'baidu':
        train_set = BAIDUset(opt, opt.train_csv)
        train_loader = torch.utils.data.DataLoader(
            train_set,
            batch_size=opt.batch_size,
            shuffle=True,
            num_workers=int(opt.workers),
            collate_fn=BaiduCollate(opt.imgH, opt.imgW, keep_ratio=False))
        val_set = BAIDUset(opt, opt.val_csv)
        valid_loader = torch.utils.data.DataLoader(
            val_set,
            batch_size=opt.batch_size,
            shuffle=True,
            num_workers=int(opt.workers),
            collate_fn=BaiduCollate(opt.imgH, opt.imgW, keep_ratio=False),
            pin_memory=True)

    else:
        opt.select_data = opt.select_data.split('-')
        opt.batch_ratio = opt.batch_ratio.split('-')
        train_dataset = Batch_Balanced_Dataset(opt)

        AlignCollate_valid = AlignCollate(imgH=opt.imgH,
                                          imgW=opt.imgW,
                                          keep_ratio_with_pad=opt.PAD)
        valid_dataset = hierarchical_dataset(root=opt.valid_data, opt=opt)
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size=opt.batch_size,
            shuffle=True,  # 'True' to check training progress with validation function.
            num_workers=int(opt.workers),
            collate_fn=AlignCollate_valid,
            pin_memory=True)
    print('-' * 80)
    """ model configuration """
    if 'CTC' in opt.Prediction:
        converter = CTCLabelConverter(opt.character)
    elif 'Bert' in opt.Prediction:
        converter = TransformerConverter(opt.character, opt.max_seq)
    elif 'SRN' in opt.Prediction:
        converter = SRNConverter(opt.character, opt.SRN_PAD)
    else:
        converter = AttnLabelConverter(opt.character)
    opt.num_class = len(converter.character)

    if opt.rgb:
        opt.input_channel = 3
    model = Model(opt)
    print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial,
          opt.input_channel, opt.output_channel, opt.hidden_size,
          opt.num_class, opt.batch_max_length, opt.Transformation,
          opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction)

    # weight initialization
    for name, param in model.named_parameters():
        if 'localization_fc2' in name:
            print(f'Skip {name} as it is already initialized')
            continue
        try:
            if 'bias' in name:
                init.constant_(param, 0.0)
            elif 'weight' in name:
                init.kaiming_normal_(param)
        except Exception as e:  # for batchnorm.
            if 'weight' in name:
                param.data.fill_(1)
            continue

    # data parallel for multi-GPU
    model = torch.nn.DataParallel(model).cuda()
    model.train()
    if opt.continue_model != '':
        print(f'loading pretrained model from {opt.continue_model}')
        model.load_state_dict(torch.load(opt.continue_model))
    print("Model:")
    print(model)
    """ setup loss """
    if 'CTC' in opt.Prediction:
        criterion = torch.nn.CTCLoss(zero_infinity=True).cuda()
    elif 'Bert' in opt.Prediction:
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0).cuda()
    elif 'SRN' in opt.Prediction:
        criterion = cal_performance
    else:
        criterion = torch.nn.CrossEntropyLoss(
            ignore_index=0).cuda()  # ignore [GO] token = ignore index 0
    # loss averager
    loss_avg = Averager()

    # filter that only require gradient decent
    filtered_parameters = []
    params_num = []
    for p in filter(lambda p: p.requires_grad, model.parameters()):
        filtered_parameters.append(p)
        params_num.append(np.prod(p.size()))
    print('Trainable params num : ', sum(params_num))
    # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())]

    # setup optimizer
    if opt.adam:
        optimizer = optim.Adam(filtered_parameters,
                               lr=opt.lr,
                               betas=(opt.beta1, 0.999))
    elif opt.ranger:
        optimizer = Ranger(filtered_parameters, lr=opt.lr)
    else:
        optimizer = optim.Adadelta(filtered_parameters,
                                   lr=opt.lr,
                                   rho=opt.rho,
                                   eps=opt.eps)
    print("Optimizer:")
    print(optimizer)

    lrScheduler = lr_scheduler.MultiStepLR(optimizer, [5, 20, 30],
                                           gamma=0.5)  # decay the learning rate
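    # Illustrative note (not in the original): the milestones [5, 20, 30] are epoch
    # counts; lrScheduler.step() is invoked once per epoch in the training loop below
    # (the "i % step_per_epoch" branch), halving the learning rate after each milestone.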
    """ final options """
    # print(opt)
    with open(f'./saved_models/{opt.experiment_name}/opt.txt',
              'a') as opt_file:
        opt_log = '------------ Options -------------\n'
        args = vars(opt)
        for k, v in args.items():
            opt_log += f'{str(k)}: {str(v)}\n'
        opt_log += '---------------------------------------\n'
        print(opt_log)
        opt_file.write(opt_log)
    """ start training """
    start_iter = 0
    if opt.continue_model != '':
        start_iter = int(opt.continue_model.split('_')[-1].split('.')[0])
        print(f'continue to train, start_iter: {start_iter}')

    start_time = time.time()
    best_accuracy = -1
    best_norm_ED = 1e+6
    i = start_iter
    if opt.select_data == 'baidu':
        train_iter = iter(train_loader)
        step_per_epoch = len(train_set) / opt.batch_size
        print('steps per epoch:', step_per_epoch)
    else:
        step_per_epoch = train_dataset.nums_samples / opt.batch_size
        print('steps per epoch:', step_per_epoch)

    while (True):
        # try:
        # train part
        for p in model.parameters():
            p.requires_grad = True

        if opt.select_data == 'baidu':
            try:
                image_tensors, labels = next(train_iter)
            except:
                train_iter = iter(train_loader)
                image_tensors, labels = next(train_iter)
        else:
            image_tensors, labels = train_dataset.get_batch()

        image = image_tensors.cuda()
        text, length = converter.encode(labels)
        batch_size = image.size(0)

        if 'CTC' in opt.Prediction:
            preds = model(image, text).log_softmax(2)
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)  # to use CTCLoss format
            cost = criterion(preds, text, preds_size, length)

        elif 'Bert' in opt.Prediction:
            pad_mask = None
            # print(image.shape)
            preds = model(image, pad_mask)
            cost = criterion(preds[0].view(-1, preds[0].shape[-1]), text.contiguous().view(-1)) + \
                   criterion(preds[1].view(-1, preds[1].shape[-1]), text.contiguous().view(-1))

        elif 'SRN' in opt.Prediction:
            preds = model(image, None)
            cost, n_correct = criterion(preds, text)

        else:
            preds = model(image, text[:, :-1])  # align with Attention.forward
            target = text[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

        model.zero_grad()
        cost.backward()
        torch.nn.utils.clip_grad_norm_(
            model.parameters(),
            opt.grad_clip)  # gradient clipping with 5 (Default)
        optimizer.step()

        loss_avg.add(cost)

        if i % opt.disInterval == 0:
            elapsed_time = time.time() - start_time
            print(
                f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}'
            )
            start_time = time.time()

        # validation part
        if i % opt.valInterval == 0 and i > start_iter:
            elapsed_time = time.time() - start_time
            print(
                f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}'
            )
            # for log
            with open(f'./saved_models/{opt.experiment_name}/log_train.txt',
                      'a') as log:
                log.write(
                    f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}\n'
                )
                loss_avg.reset()

                model.eval()
                valid_loss, current_accuracy, current_norm_ED, preds, labels, infer_time, length_of_data = validation(
                    model, criterion, valid_loader, converter, opt)
                model.train()

                for pred, gt in zip(preds[:5], labels[:5]):
                    if 'Attn' in opt.Prediction:
                        pred = pred[:pred.find('[s]')]
                        gt = gt[:gt.find('[s]')]
                    print(
                        f'pred: {pred:20s}, gt: {gt:20s},   {str(pred == gt)}')
                    log.write(
                        f'pred: {pred:20s}, gt: {gt:20s},   {str(pred == gt)}\n'
                    )

                valid_log = f'[{i}/{opt.num_iter}] valid loss: {valid_loss:0.5f}'
                valid_log += f' accuracy: {current_accuracy:0.3f}, norm_ED: {current_norm_ED:0.2f}'
                print(valid_log)
                log.write(valid_log + '\n')

                # keep best accuracy model
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    torch.save(
                        model.state_dict(),
                        f'./saved_models/{opt.experiment_name}/best_accuracy.pth'
                    )
                if current_norm_ED < best_norm_ED:
                    best_norm_ED = current_norm_ED
                    torch.save(
                        model.state_dict(),
                        f'./saved_models/{opt.experiment_name}/best_norm_ED.pth'
                    )
                best_model_log = f'best_accuracy: {best_accuracy:0.3f}, best_norm_ED: {best_norm_ED:0.2f}'
                print(best_model_log)
                log.write(best_model_log + '\n')

        # save the model every opt.saveInterval iterations
        if (i + 1) % opt.saveInterval == 0:
            torch.save(model.state_dict(),
                       f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth')

        if i == opt.num_iter:
            print('end the training')
            sys.exit()

        if i > 0 and i % int(step_per_epoch) == 0:  # step the lr scheduler once per epoch
            lrScheduler.step()

        i += 1
Example #19
0
    model_weights = os.listdir(save_root)
    model_weights.remove("optimizer.pt")
    model_weights.sort()
    last_model_name = model_weights[-1]

    model.load_state_dict(torch.load(PJ(save_root, last_model_name)))
    optimizer.load_state_dict(torch.load(PJ(save_root, "optimizer.pt")))
    print(f"Loading model {last_model_name} successed!")
    print(f"Loading optimizer successed!")
    iteration, last_epoch = int(last_model_name[-11:-3]), int(
        last_model_name[:4])
iteration, last_epoch = (iteration, last_epoch) if resume else (0, 1)

# Learning rate decay scheduler
scheduler = lr_scheduler.MultiStepLR(optimizer,
                                     config["step_size"],
                                     config["gamma"],
                                     last_epoch=last_epoch - 2)
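# Illustrative note (not in the original): the "- 2" lines the 1-based epoch counter
# of the loop below up with MultiStepLR's 0-based last_epoch, so a fresh run passes
# the default -1 and a resumed run continues from the already-decayed learning rate.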

########################################
# Start training
########################################
print("\n> Training")
for epoch in range(last_epoch, config["max_epoch"] + 1):
    # scheduler step in each epoch
    scheduler.step()
    for it, (labels, images) in enumerate(trainloader):
        optimizer.zero_grad()

        # Drop images and labels into GPU
        images = images.cuda().detach()
        labels = labels.cuda().detach()
model = NormalizedModel(model=m, mean=image_mean, std=image_std).to(
    DEVICE)  # keep images in the [0, 1] range
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

optimizer = SGD(model.parameters(),
                lr=args.lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay)
if args.adv == 0:
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=args.lr_step,
                                    gamma=args.lr_decay)
else:
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[60, 120, 160],
                                         gamma=0.2)
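# Illustrative note (not in the original): the non-adversarial branch decays the lr
# every lr_step epochs (StepLR), while the adversarial branch decays it by a fixed
# factor of 5 (gamma=0.2) at epochs 60, 120 and 160.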

attacker = DDN(steps=args.steps, device=DEVICE)

max_loss = torch.log(torch.tensor(10.)).item()  # for callback
best_acc = 0
best_epoch = 0

for epoch in range(args.epochs):
    scheduler.step()
    cudnn.benchmark = True
    model.train()
    requires_grad_(m, True)
    accs = AverageMeter()
    losses = AverageMeter()
Example #21
0
def train(cfg):
    # prepare dataset
    train_loader, val_loader, num_query, num_classes, num_classes2, image_map_label2 = make_data_loader(
        cfg)

    #print('\n\n*** image_map_label2:')

    # prepare model
    model = build_model(cfg, num_classes, num_classes2)
    #print(list(model.children()))
    #print(model.state_dict().keys())
    #exit(0)

    #print('model.named_children(): \n\n', model.named_children())
    '''
    kk = 1
    for name, child in model.base.named_children():
        print(kk, name)
        kk += 1
    print(len(list(model.base.children())))
    exit(0)
    for i in range(len(list(model.base.children()))):
        print('  +++', i+1)
        print(list(model.base.children())[i])
    exit(0)
    '''

    if len(cfg.MODEL.PRETRAIN_PATH2) > 5:
        print('--- resume from ', cfg.MODEL.PRETRAIN_PATH2)
        #model.load_param(cfg.MODEL.PRETRAIN_PATH)
        #model.load_state_dict(torch.load(cfg.MODEL.PRETRAIN_PATH2, map_location=lambda storage, loc: storage))
        if cfg.MODEL.ONCE_LOAD == 'yes':
            print('\n---ONCE_LOAD...\n')
            model.load_state_dict(
                torch.load(cfg.MODEL.PRETRAIN_PATH2,
                           map_location=lambda storage, loc: storage))
            #if cfg.MODEL.FREEZE_BASE == 'yes':
            #    functions.freeze_layer(model, 'base', False)
            #functions.freeze_global_model(model, False)
        else:
            functions.load_state_dict_distill(model, cfg.MODEL.PRETRAIN_PATH2,
                                              cfg.MODEL.ONLY_BASE,
                                              cfg.MODEL.WITHOUT_FC)
        print('**** Successfully load ', cfg.MODEL.PRETRAIN_PATH2)
        if cfg.MODEL.FREEZE_BASE:
            #functions.freeze_layer(model, 'base', False)
            functions.freeze_global_model(model, False)

    if cfg.MODEL.IF_WITH_CENTER == 'no':
        print('Train without center loss, the loss type is',
              cfg.MODEL.METRIC_LOSS_TYPE)
        if cfg.SOLVER.MY_OPTIMIZER == "yes":
            print('---* my optimizer:', cfg.SOLVER.MY_OPTIMIZER_NAME)
            other_params = [
                p for n, p in model.named_parameters()
                if not n.startswith('base')
            ]
            optimizer = optim.SGD([{
                'params': model.base.parameters(),
                'lr': cfg.SOLVER.LR / 10
            }, {
                'params': other_params,
                'lr': cfg.SOLVER.LR
            }],
                                  momentum=0.9,
                                  weight_decay=5e-4,
                                  nesterov=True)
        else:
            print('---* not my optimizer')
            optimizer = make_optimizer(cfg, model)

        # scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA, cfg.SOLVER.WARMUP_FACTOR,
        #                               cfg.SOLVER.WARMUP_ITERS, cfg.SOLVER.WARMUP_METHOD)

        #_C.SOLVER.MY_SCHEDULER = "no"
        #_C.SOLVER.MY_WARMUP = "no"
        loss_func = make_loss(cfg, num_classes)  # modified by gu

        # Add for using self trained model
        if cfg.MODEL.PRETRAIN_CHOICE == 'self':
            start_epoch = eval(
                cfg.MODEL.PRETRAIN_PATH.split('/')[-1].split('.')[0].split('_')
                [-1])
            print('Start epoch:', start_epoch)
            path_to_optimizer = cfg.MODEL.PRETRAIN_PATH.replace(
                'model', 'optimizer')
            print('Path to the checkpoint of optimizer:', path_to_optimizer)
            model.load_state_dict(torch.load(cfg.MODEL.PRETRAIN_PATH))
            optimizer.load_state_dict(torch.load(path_to_optimizer))
            scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS,
                                          cfg.SOLVER.GAMMA,
                                          cfg.SOLVER.WARMUP_FACTOR,
                                          cfg.SOLVER.WARMUP_ITERS,
                                          cfg.SOLVER.WARMUP_METHOD,
                                          start_epoch)
        elif cfg.MODEL.PRETRAIN_CHOICE == 'imagenet':
            start_epoch = 0
            if cfg.SOLVER.MY_SCHEDULER == "yes":
                print('cfg.SOLVER.MY_SCHEDULER_STEP:',
                      cfg.SOLVER.MY_SCHEDULER_STEP)
                print('---* my scheduler: ', cfg.SOLVER.MY_SCHEDULER_NAME)
                if cfg.SOLVER.MY_SCHEDULER_NAME == 'SL':
                    scheduler = lr_scheduler.StepLR(
                        optimizer,
                        step_size=cfg.SOLVER.MY_SCHEDULER_STEP[0],
                        gamma=0.1)
                elif cfg.SOLVER.MY_SCHEDULER_NAME == 'MSL':
                    scheduler = lr_scheduler.MultiStepLR(
                        optimizer, cfg.SOLVER.MY_SCHEDULER_STEP, gamma=0.1)
                else:
                    print(cfg.SOLVER.MY_SCHEDULER_NAME, ' not found!')
                    exit(0)
            else:
                print('---* not my scheduler')
                scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS,
                                              cfg.SOLVER.GAMMA,
                                              cfg.SOLVER.WARMUP_FACTOR,
                                              cfg.SOLVER.WARMUP_ITERS,
                                              cfg.SOLVER.WARMUP_METHOD)
        else:
            print(
                'Only support pretrain_choice for imagenet and self, but got {}'
                .format(cfg.MODEL.PRETRAIN_CHOICE))

        arguments = {}

        print('************ do_train')
        do_train(
            cfg,
            model,
            train_loader,
            val_loader,
            optimizer,
            scheduler,  # modify for using self trained model
            loss_func,
            num_query,
            start_epoch,  # add for using self trained model
            image_map_label2,
            num_classes2)


#    elif cfg.MODEL.IF_WITH_CENTER == 'yes':
#        print('Train with center loss, the loss type is', cfg.MODEL.METRIC_LOSS_TYPE)
#        loss_func, center_criterion = make_loss_with_center(cfg, num_classes)  # modified by gu
#        optimizer, optimizer_center = make_optimizer_with_center(cfg, model, center_criterion)
#        # scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA, cfg.SOLVER.WARMUP_FACTOR,
#        #                               cfg.SOLVER.WARMUP_ITERS, cfg.SOLVER.WARMUP_METHOD)
#
#        arguments = {}
#
#        # Add for using self trained model
#        if cfg.MODEL.PRETRAIN_CHOICE == 'self':
#            start_epoch = eval(cfg.MODEL.PRETRAIN_PATH.split('/')[-1].split('.')[0].split('_')[-1])
#            print('Start epoch:', start_epoch)
#            path_to_optimizer = cfg.MODEL.PRETRAIN_PATH.replace('model', 'optimizer')
#            print('Path to the checkpoint of optimizer:', path_to_optimizer)
#            path_to_optimizer_center = cfg.MODEL.PRETRAIN_PATH.replace('model', 'optimizer_center')
#            print('Path to the checkpoint of optimizer_center:', path_to_optimizer_center)
#            model.load_state_dict(torch.load(cfg.MODEL.PRETRAIN_PATH))
#            optimizer.load_state_dict(torch.load(path_to_optimizer))
#            optimizer_center.load_state_dict(torch.load(path_to_optimizer_center))
#            scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA, cfg.SOLVER.WARMUP_FACTOR,
#                                          cfg.SOLVER.WARMUP_ITERS, cfg.SOLVER.WARMUP_METHOD, start_epoch)
#        elif cfg.MODEL.PRETRAIN_CHOICE == 'imagenet':
#            start_epoch = 0
#            scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA, cfg.SOLVER.WARMUP_FACTOR,
#                                          cfg.SOLVER.WARMUP_ITERS, cfg.SOLVER.WARMUP_METHOD)
#        else:
#            print('Only support pretrain_choice for imagenet and self, but got {}'.format(cfg.MODEL.PRETRAIN_CHOICE))
#
#        do_train_with_center(
#            cfg,
#            model,
#            center_criterion,
#            train_loader,
#            val_loader,
#            optimizer,
#            optimizer_center,
#            scheduler,      # modify for using self trained model
#            loss_func,
#            num_query,
#            start_epoch     # add for using self trained model
#        )
    else:
        print(
            "Unsupported value for cfg.MODEL.IF_WITH_CENTER {}, only support yes or no!\n"
            .format(cfg.MODEL.IF_WITH_CENTER))
Example #22
0
    def train(self, train_queue, val_queue=None):
        ''' Given data queues, train the network '''
        # Parameter directory
        save_dir = os.path.join(cfg.DIR.OUT_PATH)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Timer for the training op and parallel data loading op.
        train_timer = Timer()
        data_timer = Timer()
        training_losses = []

        # Setup learning rates
        lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

        #Setup the lr_scheduler
        self.lr_scheduler = lr_scheduler.MultiStepLR(self.optimizer,
                                                     lr_steps,
                                                     gamma=0.1)
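        # Illustrative note (not in the original): the milestones are the iteration
        # indices used as keys of cfg.TRAIN.LEARNING_RATES; MultiStepLR scales the lr
        # by 0.1 at each of them, and the print further below merely reports the value
        # stored in that config dict.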

        start_iter = 0
        # Resume training
        if cfg.TRAIN.RESUME_TRAIN:
            self.load(cfg.CONST.WEIGHTS)
            start_iter = cfg.TRAIN.INITIAL_ITERATION

        # Main training loop
        for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
            self.lr_scheduler.step()

            data_timer.tic()
            batch_img, batch_voxel = train_queue.get()
            data_timer.toc()

            if self.net.is_x_tensor4:
                batch_img = batch_img[0]

            # Apply one gradient step
            train_timer.tic()
            loss = self.train_loss(batch_img, batch_voxel)
            train_timer.toc()

            training_losses.append(loss.data[0])

            # Decrease learning rate at certain points
            if train_ind in lr_steps:
                #for pytorch optimizer, learning rate can only be set when the optimizer is created
                #or using torch.optim.lr_scheduler
                print('Learning rate decreased to %f: ' %
                      cfg.TRAIN.LEARNING_RATES[str(train_ind)])

            # Debugging modules
            #
            # Print status, run validation, check divergence, and save model.
            if train_ind % cfg.TRAIN.PRINT_FREQ == 0:
                # Print the current loss
                print('%s Iter: %d Loss: %f' %
                      (datetime.now(), train_ind, loss))

            if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_queue is not None:
                # Print test loss and params to check convergence every N iterations

                val_losses = 0
                for i in range(cfg.TRAIN.NUM_VALIDATION_ITERATIONS):
                    batch_img, batch_voxel = val_queue.get()
                    val_loss = self.train_loss(batch_img, batch_voxel)
                    val_losses += val_loss
                val_losses_mean = val_losses / cfg.TRAIN.NUM_VALIDATION_ITERATIONS
                print('%s Test loss: %f' % (datetime.now(), val_losses_mean))

            if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
                # Check that the network parameters are all valid
                nan_or_max_param = max_or_nan(self.net.parameters())
                if has_nan(nan_or_max_param):
                    print('NAN detected')
                    break

            if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0:
                self.save(training_losses, save_dir, train_ind)

            #loss is a Variable containing torch.FloatTensor of size 1
            if loss.data[0] > cfg.TRAIN.LOSS_LIMIT:
                print("Cost exceeds the threshold. Stop training")
                break
Example #23
0
                      )
        base_params = filter(lambda p: id(p) not in ignored_params, model.parameters())
        optimizer_ft = optim.SGD([
             {'params': base_params, 'lr': 0.1*opt.lr},
             {'params': model.classifier0.parameters(), 'lr': opt.lr},
             {'params': model.classifier1.parameters(), 'lr': opt.lr},
             {'params': model.classifier2.parameters(), 'lr': opt.lr},
             {'params': model.classifier3.parameters(), 'lr': opt.lr},
         ], weight_decay=5e-4, momentum=0.9, nesterov=True)

if opt.adam:
    optimizer_ft = optim.Adam(model.parameters(), opt.lr, weight_decay=5e-4)

# Decay LR by a factor of 0.1 at epochs 60 and 75 (milestones shifted by start_epoch)
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=40, gamma=0.1)
exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft, milestones=[60-start_epoch, 75-start_epoch], gamma=0.1)
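# Illustrative note (not in the original): subtracting start_epoch shifts the
# milestones so that, when resuming, the decays still fall on absolute epochs 60 and
# 75 even though the scheduler itself restarts counting from zero.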

######################################################################
# Train and evaluate
# ^^^^^^^^^^^^^^^^^^
#
# It should take around 1-2 hours on GPU. 
#
dir_name = os.path.join('./data/outputs',name)

if not opt.resume:
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
#record every run
    copyfile('./train.py', dir_name+'/train.py')
    copyfile('./model.py', dir_name+'/model.py')
Example #24
0
num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
print('The number of parameters of model is', num_params)

if args.resume is not None:
    checkpoint = torch.load('./save_model/' + args.resume)
    net.load_state_dict(checkpoint['net'])

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(),
                      lr=args.lr,
                      momentum=args.momentum,
                      weight_decay=args.weight_decay)

decay_epoch = [150, 225]
scheduler = lr_scheduler.MultiStepLR(optimizer,
                                     milestones=decay_epoch,
                                     gamma=0.1)
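# Illustrative note (not in the original): this yields a three-phase schedule of
# args.lr for epochs 0-149, args.lr * 0.1 for epochs 150-224, and args.lr * 0.01
# from epoch 225 onwards.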

writer = SummaryWriter(args.logdir)


def train(epoch, global_steps):
    net.train()

    train_loss = 0
    correct = 0
    total = 0

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        global_steps += 1
        inputs = inputs.to(device)
Example #25
0
def train():
    cfg = opt.cfg
    data = opt.data
    img_size = opt.img_size
    epochs = 1 if opt.prebias else opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights  # initial training weights

    if 'pw' not in opt.arc:  # remove BCELoss positive weights
        hyp['cls_pw'] = 1.
        hyp['obj_pw'] = 1.

    # Initialize
    init_seeds()
    if opt.multi_scale:
        img_sz_min = round(img_size / 32 / 1.5)
        img_sz_max = round(img_size / 32 * 1.5)
        img_size = img_sz_max * 32  # initiate with maximum multi_scale size
        print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size))

    # Configure run
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    test_path = data_dict['valid']
    nc = int(data_dict['classes'])  # number of classes

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg, arc=opt.arc).to(device)

    # Optimizer
    pg0, pg1 = [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if 'Conv2d.weight' in k:
            pg1 += [v]  # parameter group 1 (apply weight_decay)
        else:
            pg0 += [v]  # parameter group 0

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    del pg0, pg1

    # https://github.com/alphadl/lookahead.pytorch
    # optimizer = torch_utils.Lookahead(optimizer, k=5, alpha=0.5)

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_fitness = float('inf')
    attempt_download(weights)
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        chkpt = torch.load(weights, map_location=device)

        # load model
        try:
            chkpt['model'] = {
                k: v
                for k, v in chkpt['model'].items()
                if model.state_dict()[k].numel() == v.numel()
            }
            model.load_state_dict(chkpt['model'], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_fitness = chkpt['best_fitness']

        # load results
        if chkpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(chkpt['training_results'])  # write results.txt

        start_epoch = chkpt['epoch'] + 1
        del chkpt

    elif len(weights) > 0:  # darknet format
        # possible weights are '*.weights', 'yolov3-tiny.conv.15',  'darknet53.conv.74' etc.
        cutoff = load_darknet_weights(model, weights)

    if opt.transfer or opt.prebias:  # transfer learning edge (yolo) layers
        nf = int(model.module_defs[model.yolo_layers[0] -
                                   1]['filters'])  # yolo layer size (i.e. 255)

        if opt.prebias:
            for p in optimizer.param_groups:
                # lower param count allows more aggressive training settings: i.e. SGD ~0.1 lr0, ~0.9 momentum
                p['lr'] *= 100  # lr gain
                if p.get('momentum') is not None:  # for SGD but not Adam
                    p['momentum'] *= 0.9

        for p in model.parameters():
            if opt.prebias and p.numel() == nf:  # train (yolo biases)
                p.requires_grad = True
            elif opt.transfer and p.shape[
                    0] == nf:  # train (yolo biases+weights)
                p.requires_grad = True
            else:  # freeze layer
                p.requires_grad = False

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=range(59, 70, 1), gamma=0.8)  # gradual fall to 0.1*lr0
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[round(opt.epochs * x) for x in [0.8, 0.9]],
        gamma=0.1)
    scheduler.last_epoch = start_epoch - 1
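    # Illustrative note (not in the original): for the 273-epoch run mentioned above,
    # the milestones come out to roughly epochs 218 and 246 (80% and 90% of 273);
    # setting last_epoch to start_epoch - 1 fast-forwards the schedule when resuming
    # from a checkpoint.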

    # # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Initialize distributed training
    if device.type != 'cpu' and torch.cuda.device_count() > 1:
        dist.init_process_group(
            backend='nccl',  # 'distributed backend'
            init_method='tcp://127.0.0.1:9999',  # distributed training init method
            world_size=1,  # number of nodes for distributed training
            rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(
            model, find_unused_parameters=True)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataset
    dataset = LoadImagesAndLabels(
        train_path,
        img_size,
        batch_size,
        augment=True,
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        image_weights=opt.img_weights,
        cache_labels=epochs > 10,
        cache_images=opt.cache_images and not opt.prebias)

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=nw,
        shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
        pin_memory=True,
        collate_fn=dataset.collate_fn)

    # Test Dataloader
    if not opt.prebias:
        testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(
            test_path,
            img_size,
            batch_size,
            hyp=hyp,
            rect=True,
            cache_labels=True,
            cache_images=opt.cache_images),
                                                 batch_size=batch_size,
                                                 num_workers=nw,
                                                 pin_memory=True,
                                                 collate_fn=dataset.collate_fn)

    # Start training
    nb = len(dataloader)
    model.nc = nc  # attach number of classes to model
    model.arc = opt.arc  # attach yolo architecture
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    maps = np.zeros(nc)  # mAP per class
    # torch.autograd.set_detect_anomaly(True)
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    torch_utils.model_info(model, report='summary')  # 'full' or 'summary'
    print('Using %g dataloader workers' % nw)
    print('Starting %s for %g epochs...' %
          ('prebias' if opt.prebias else 'training', epochs))
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls',
                                     'total', 'targets', 'img_size'))

        # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional)
        freeze_backbone = False
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 -
                                                     maps)**2  # class weights
            image_weights = labels_to_image_weights(dataset.labels,
                                                    nc=nc,
                                                    class_weights=w)
            dataset.indices = random.choices(range(dataset.n),
                                             weights=image_weights,
                                             k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0
            targets = targets.to(device)

            # Multi-Scale training
            if opt.multi_scale:
                if ni / accumulate % 10 == 0:  #  adjust (67% - 150%) every 10 batches
                    img_size = random.randrange(img_sz_min,
                                                img_sz_max + 1) * 32
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [
                        math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:]
                    ]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)
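            # The sampled size is always a multiple of 32 because the YOLO
            # backbone downsamples by a factor of 32; a rough sketch, assuming
            # a base img_size of 416 and the 67%-150% range noted above, puts
            # img_sz_min/img_sz_max at roughly 288-640 px.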

            # Plot images with bounding boxes
            if ni == 0:
                fname = 'train_batch%g.jpg' % i
                plot_images(imgs=imgs,
                            targets=targets,
                            paths=paths,
                            fname=fname)
                if tb_writer:
                    tb_writer.add_image(fname,
                                        cv2.imread(fname)[:, :, ::-1],
                                        dataformats='HWC')

            # Hyperparameter burn-in
            # n_burn = nb - 1  # min(nb // 5 + 1, 1000)  # number of burn-in batches
            # if ni <= n_burn:
            #     for m in model.named_modules():
            #         if m[0].endswith('BatchNorm2d'):
            #             m[1].momentum = 1 - i / n_burn * 0.99  # BatchNorm2d momentum falls from 1 - 0.01
            #     g = (i / n_burn) ** 4  # gain rises from 0 - 1
            #     for x in optimizer.param_groups:
            #         x['lr'] = hyp['lr0'] * g
            #         x['weight_decay'] = hyp['weight_decay'] * g

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
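            # Rough effective-batch sketch (assuming accumulate was set to
            # about 64 // batch_size earlier in the script, as the loss
            # scaling above suggests):
            #   effective_batch = batch_size * accumulate  # ~64 images/step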

            # Print batch results
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
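            # Incremental mean: mean_i = (mean_{i-1} * i + x_i) / (i + 1),
            # applied to the 4 loss components (GIoU, obj, cls, total).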
            mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0  # GPU memory (GB)
            s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1),
                                               '%.3gG' % mem, *mloss,
                                               len(targets), img_size)
            pbar.set_description(s)

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        final_epoch = epoch + 1 == epochs
        if opt.prebias:
            print_model_biases(model)
        else:
            # Calculate mAP
            if not opt.notest or final_epoch:
                with torch.no_grad():
                    results, maps = test.test(
                        cfg,
                        data,
                        batch_size=batch_size,
                        img_size=opt.img_size,
                        model=model,
                        conf_thres=0.001 if final_epoch and epoch > 0 else 0.1,  # 0.1 for speed
                        save_json=final_epoch and epoch > 0 and 'coco.data' in data,
                        dataloader=testloader)

        # Write epoch results
        with open(results_file, 'a') as f:
            f.write(s + '%10.3g' * 7 % results +
                    '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
        if len(opt.name) and opt.bucket and not opt.prebias:
            os.system('gsutil cp results.txt gs://%s/results%s.txt' %
                      (opt.bucket, opt.name))

        # Write Tensorboard results
        if tb_writer:
            x = list(mloss) + list(results)
            titles = [
                'GIoU', 'Objectness', 'Classification', 'Train loss',
                'Precision', 'Recall', 'mAP', 'F1', 'val GIoU',
                'val Objectness', 'val Classification'
            ]
            for xi, title in zip(x, titles):
                tb_writer.add_scalar(title, xi, epoch)

        # Update best mAP
        fitness = sum(results[4:])  # total validation loss (GIoU + obj + cls); lower is better
        if fitness < best_fitness:
            best_fitness = fitness

        # Save training results
        save = (not opt.nosave) or (final_epoch
                                    and not opt.evolve) or opt.prebias
        if save:
            with open(results_file, 'r') as f:
                # Create checkpoint
                chkpt = {
                    'epoch': epoch,
                    'best_fitness': best_fitness,
                    'training_results': f.read(),
                    'model': model.module.state_dict()
                    if type(model) is nn.parallel.DistributedDataParallel
                    else model.state_dict(),
                    'optimizer': None if final_epoch else optimizer.state_dict()
                }

            # Save last checkpoint
            torch.save(chkpt, last)

            # Save best checkpoint
            if best_fitness == fitness:
                torch.save(chkpt, best)

            # # Save backup every 10 epochs (optional)
            # if epoch > 0 and epoch % 10 == 0:
            #     torch.save(chkpt, wdir + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt
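            # A minimal resume sketch under these key names (hypothetical; the
            # actual resume path lives elsewhere in the script):
            #   chkpt = torch.load(last, map_location=device)
            #   model.load_state_dict(chkpt['model'])
            #   if chkpt['optimizer'] is not None:
            #       optimizer.load_state_dict(chkpt['optimizer'])
            #   start_epoch = chkpt['epoch'] + 1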

        # end epoch ----------------------------------------------------------------------------------------------------

    # end training
    if len(opt.name) and not opt.prebias:
        fresults, flast, fbest = 'results%s.txt' % opt.name, 'last%s.pt' % opt.name, 'best%s.pt' % opt.name
        os.rename('results.txt', fresults)
        if os.path.exists(wdir + 'last.pt'):
            os.rename(wdir + 'last.pt', wdir + flast)
        if os.path.exists(wdir + 'best.pt'):
            os.rename(wdir + 'best.pt', wdir + fbest)

        # save to cloud
        if opt.bucket:
            os.system('gsutil cp %s %s gs://%s' %
                      (fresults, wdir + flast, opt.bucket))

    plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1,
                                                    (time.time() - t0) / 3600))
    if torch.cuda.device_count() > 1:
        dist.destroy_process_group()
    torch.cuda.empty_cache()

    return results
Example #26
def main(arg_seed, arg_timestamp):
    random_seed = arg_seed
    np.random.seed(random_seed)
    random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True  # make cuDNN deterministic for reproducibility
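    # Note: torch.backends.cudnn.benchmark is left at its default (False) here;
    # enabling it would trade reproducibility for speed on fixed input sizes.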

    print('Random Seed {}\n'.format(arg_seed))

    # -- training parameters
    num_epoch = args.epoch
    milestone = [50, 75]
    batch_size = args.batch
    num_workers = 2

    weight_decay = 1e-3
    gamma = 0.2
    current_delta = args.delta

    lr = args.lr
    start_epoch = 0

    # -- specify dataset
    # data augmentation
    transform_train = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    trainset = Animal10(split='train', transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
                                              worker_init_fn=_init_fn, drop_last=True)

    testset = Animal10(split='test', transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size * 4, shuffle=False, num_workers=num_workers)

    num_class = 10

    print('train data size:', len(trainset))
    print('test data size:', len(testset))

    # -- create log file
    if arg_timestamp:
        time_stamp = time.strftime("%Y%m%d-%H%M%S")
        file_name = 'Ours(' + time_stamp + ').txt'
    else:
        file_name = 'Ours.txt'

    log_dir = check_folder('logs')
    file_name = os.path.join(log_dir, file_name)
    saver = open(file_name, "w")

    saver.write(args.__repr__() + "\n\n")
    saver.flush()

    # -- set network, optimizer, scheduler, etc
    net = vgg19_bn(num_classes=num_class, pretrained=False)
    net = nn.DataParallel(net)

    optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=weight_decay)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = net.to(device)

    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=milestone, gamma=gamma)
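    # With milestone=[50, 75] and gamma=0.2, the learning rate becomes
    # lr * 0.2 after epoch 50 and lr * 0.04 after epoch 75.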
    criterion = torch.nn.CrossEntropyLoss()

    # -- misc
    iterations = 0
    f_record = torch.zeros([args.rollWindow, len(trainset), num_class])
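    # f_record is a [rollWindow, num_samples, num_class] buffer of per-sample
    # softmax outputs, one slot per epoch in the rolling window; its mean over
    # the window feeds the label-correction step after warm-up.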

    for epoch in range(start_epoch, num_epoch):
        train_correct = 0
        train_loss = 0
        train_total = 0

        net.train()

        for i, (images, labels, indices) in enumerate(trainloader):
            if images.size(0) == 1:  # when batch size equals 1, skip, due to batch normalization
                continue

            images, labels = images.to(device), labels.to(device)

            outputs = net(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_total += images.size(0)
            _, predicted = outputs.max(1)
            train_correct += predicted.eq(labels).sum().item()

            f_record[epoch % args.rollWindow, indices] = F.softmax(outputs.detach().cpu(), dim=1)

            iterations += 1
            if iterations % 100 == 0:
                cur_train_acc = train_correct / train_total * 100.
                cur_train_loss = train_loss / train_total
                cprint('epoch: {}\titerations: {}\tcurrent train accuracy: {:.4f}\ttrain loss:{:.4f}'.format(
                    epoch, iterations, cur_train_acc, cur_train_loss), 'yellow')

                if iterations % 5000 == 0:
                    saver.write('epoch: {}\titerations: {}\ttrain accuracy: {}\ttrain loss: {}\n'.format(
                        epoch, iterations, cur_train_acc, cur_train_loss))
                    saver.flush()

        train_acc = train_correct / train_total * 100.

        cprint('epoch: {}'.format(epoch), 'yellow')
        cprint('train accuracy: {:.4f}\ntrain loss: {:.4f}'.format(train_acc, train_loss), 'yellow')
        saver.write('epoch: {}\ntrain accuracy: {}\ntrain loss: {}\n'.format(epoch, train_acc, train_loss))
        saver.flush()

        exp_lr_scheduler.step()

        if epoch >= args.warm_up:
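            # Label correction after warm-up: average the rolling softmax
            # record, then let lrt_correction (a likelihood-ratio-test style
            # rule, judging by its name) compare it with the current labels
            # under the threshold current_delta and return corrected labels
            # that replace the noisy targets for subsequent epochs.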
            f_x = f_record.mean(0)
            y_tilde = trainset.targets

            y_corrected, current_delta = lrt_correction(y_tilde, f_x, current_delta=current_delta, delta_increment=0.1)

            logging.info('Current delta:\t{}\n'.format(current_delta))

            trainset.update_corrupted_label(y_corrected)

        # testing
        net.eval()
        test_total = 0
        test_correct = 0
        with torch.no_grad():
            for i, (images, labels, _) in enumerate(testloader):
                images, labels = images.to(device), labels.to(device)

                outputs = net(images)

                test_total += images.size(0)
                _, predicted = outputs.max(1)
                test_correct += predicted.eq(labels).sum().item()

            test_acc = test_correct / test_total * 100.

        cprint('>> current test accuracy: {:.4f}'.format(test_acc), 'cyan')

        saver.write('>> current test accuracy: {}\n'.format(test_acc))
        saver.flush()

    saver.close()
Example #27
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()

    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_imgreid_dataset(
        name=args.dataset,
        split_id=args.split_id,
        cuhk03_labeled=args.cuhk03_labeled,
        cuhk03_classic_split=args.cuhk03_classic_split,
    )

    transform_train = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToTensor(),
        ST.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ST.RandomErasing(0.5),
    ])

    transform_test = T.Compose([
        T.Resize((args.height, args.width), interpolation=3),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    pin_memory = use_gpu

    trainloader = DataLoader(
        ImageDataset_seg(dataset.train, transform=transform_train),
        sampler=RandomIdentitySampler(dataset.train,
                                      num_instances=args.num_instances),
        batch_size=args.train_batch,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )

    queryloader = DataLoader(
        ImageDataset(dataset.query, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    galleryloader = DataLoader(
        ImageDataset(dataset.gallery, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids)
    print(model)
    print("Model size: {:.3f} M".format(count_num_param(model)))

    criterion_xent = CrossEntropyLabelSmooth(
        num_classes=dataset.num_train_pids, use_gpu=use_gpu)
    criterion_htri = TripletLoss(margin=args.margin, distance=args.distance)
    criterion_mask = MaskLoss(mode=args.mode)
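    # Three criteria are built here and presumably combined inside train():
    # label-smoothed cross-entropy over identities, a triplet loss with margin
    # args.margin on the chosen distance, and a mask loss for the
    # segmentation branch.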

    optimizer = init_optim(args.optim, model.parameters(), args.lr,
                           args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)

    if args.resume:
        if check_isfile(args.resume):
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['state_dict'])
            args.start_epoch = checkpoint['epoch']
            rank1 = checkpoint['rank1']
            print("Loaded checkpoint from '{}'".format(args.resume))
            print("- start_epoch: {}\n- rank1: {}".format(
                args.start_epoch, rank1))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")
        test(model, queryloader, galleryloader, use_gpu)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(args.start_epoch, args.max_epoch):
        scheduler.step()

        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_htri, criterion_mask,
              optimizer, trainloader)
        train_time += round(time.time() - start_train_time)

        if (epoch + 1) % args.eval_step == 0 or epoch == 0:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1

            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()

            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print("Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".format(elapsed, train_time))
    print("==========\nArgs:{}\n==========".format(args))
Example #28
def main():
    args = vars(parse_args_func())

    #config_file = "../configs/config_SN7.json"
    config_file = args['config']  # "../configs/config_v1.json"
    config_dict = json.loads(open(config_file, 'rt').read())
    #config_dict = json.loads(open(sys.argv[1], 'rt').read())

    file_dict = config_dict['file_path']
    config = config_dict['opt_config']

    input_folder = file_dict['input_path']  # '../inputs'
    checkpoint_folder = file_dict['checkpoint_path']  # '../checkpoint'
    model_folder = file_dict['model_path']  # '../models'

    if 'False' in config['deep_supervision']:
        config['deep_supervision'] = False
    else:
        config['deep_supervision'] = True

    if 'False' in config['nesterov']:
        config['nesterov'] = False
    else:
        config['nesterov'] = True

    if 'None' in config['name']:
        config['name'] = None

    if config['name'] is None:
        config['name'] = '%s_%s_segmodel' % (config['dataset'], config['arch'])
    os.makedirs(os.path.join(model_folder, '%s' % config['name']),
                exist_ok=True)

    if not os.path.isdir(checkpoint_folder):
        os.mkdir(checkpoint_folder)
    log_name = config['name']
    log_dir = os.path.join(checkpoint_folder, log_name)
    writer = SummaryWriter(logdir=log_dir)

    print('-' * 20)
    for key in config:
        print('%s: %s' % (key, config[key]))
    print('-' * 20)

    with open(os.path.join(model_folder, '%s/config.yml' % config['name']),
              'w') as f:
        yaml.dump(config, f)

    # define loss function (criterion)
    if config['loss'] == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[config['loss']]().cuda()

    cudnn.benchmark = True

    # create model
    print("=> creating model %s" % config['arch'])
    model = archs.__dict__[config['arch']](config['num_classes'],
                                           config['input_channels'],
                                           config['deep_supervision'])

    if 'False' in config['resume']:
        config['resume'] = False
    else:
        config['resume'] = True
    resume_flag = config['resume']
    if resume_flag:
        save_path = os.path.join(model_folder, config['name'], 'model.pth')
        weights = torch.load(save_path)
        model.load_state_dict(weights)
        name_yaml = config['name']
        with open(os.path.join(model_folder, '%s/config.yml' % name_yaml),
                  'r') as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        #start_epoch = config['epochs']
        start_epoch = 0
    else:
        start_epoch = 0

    model = model.cuda()
    eff_flag = 'effnet' in config['arch']

    if eff_flag:
        cnn_subs = list(model.encoder.eff_conv.children())[1:]
        #cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
        #cnn_params = [item for sublist in cnn_params for item in sublist]

    summary(model,
            (config['input_channels'], config['input_w'], config['input_h']))
    params = filter(lambda p: p.requires_grad, model.parameters())
    if eff_flag:
        params = list(params) + list(model.encoder.conv_a.parameters())
    model = torch.nn.DataParallel(model)

    if config['optimizer'] == 'Adam':
        optimizer = optim.Adam(params,
                               lr=config['lr'],
                               weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'SGD':
        optimizer = optim.SGD(params,
                              lr=config['lr'],
                              momentum=config['momentum'],
                              nesterov=config['nesterov'],
                              weight_decay=config['weight_decay'])
    else:
        raise NotImplementedError

    if eff_flag:
        cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
        cnn_params = [item for sublist in cnn_params for item in sublist]
        cnn_optimizer = torch.optim.Adam(cnn_params,
                                         lr=0.001,
                                         weight_decay=config['weight_decay'])
        #cnn_optimizer = None

    else:
        cnn_optimizer = None
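    # Two-optimizer setup for 'effnet' backbones: `optimizer` takes every
    # parameter with requires_grad=True (plus encoder.conv_a explicitly),
    # while `cnn_optimizer` updates the pretrained EfficientNet blocks with a
    # separate Adam; presumably both are stepped inside train(), which
    # receives cnn_optimizer as an argument.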
    if config['optimizer'] == 'SGD':
        if config['scheduler'] == 'CosineAnnealingLR':
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=config['epochs'], eta_min=config['min_lr'])
        elif config['scheduler'] == 'ReduceLROnPlateau':
            scheduler = lr_scheduler.ReduceLROnPlateau(
                optimizer,
                factor=config['factor'],
                patience=config['patience'],
                verbose=1,
                min_lr=config['min_lr'])
        elif config['scheduler'] == 'MultiStepLR':
            scheduler = lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[int(e) for e in config['milestones'].split(',')],
                gamma=config['gamma'])
        elif config['scheduler'] == 'ConstantLR':
            scheduler = None
        else:
            raise NotImplementedError
    else:
        scheduler = None

    # Data loading code
    img_ids = glob(
        os.path.join(input_folder, config['dataset'], 'images', 'training',
                     '*' + config['img_ext']))
    train_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    #img_dir = os.path.join(input_folder, config['dataset'], 'images', 'training')
    #mask_dir = os.path.join(input_folder, config['dataset'], 'annotations', 'training')
    #train_image_mask = image_to_afile(img_dir, mask_dir, None, train_img_ids, config)

    img_ids = glob(
        os.path.join(input_folder, config['val_dataset'], 'images',
                     'validation', '*' + config['img_ext']))
    val_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    img_ids = glob(
        os.path.join(input_folder, config['val_dataset'], 'images', 'test',
                     '*' + config['img_ext']))
    test_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    train_transform = Compose([
        #transforms.RandomScale ([config['scale_min'], config['scale_max']]),
        #transforms.RandomRotate90(),
        transforms.Rotate([config['rotate_min'], config['rotate_max']],
                          value=mean,
                          mask_value=0),
        transforms.Flip(),
        #transforms.HorizontalFlip (),
        transforms.HueSaturationValue(hue_shift_limit=10,
                                      sat_shift_limit=10,
                                      val_shift_limit=10),
        transforms.RandomBrightnessContrast(brightness_limit=0.10,
                                            contrast_limit=0.10,
                                            brightness_by_max=True),
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(mean=mean, std=std),
    ])

    val_transform = Compose([
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(mean=mean, std=std),
    ])

    train_dataset = Dataset(img_ids=train_img_ids,
                            img_dir=os.path.join(input_folder,
                                                 config['dataset'], 'images',
                                                 'training'),
                            mask_dir=os.path.join(input_folder,
                                                  config['dataset'],
                                                  'annotations', 'training'),
                            img_ext=config['img_ext'],
                            mask_ext=config['mask_ext'],
                            num_classes=config['num_classes'],
                            input_channels=config['input_channels'],
                            transform=train_transform,
                            from_file=None)
    val_dataset = Dataset(img_ids=val_img_ids,
                          img_dir=os.path.join(input_folder,
                                               config['val_dataset'], 'images',
                                               'validation'),
                          mask_dir=os.path.join(input_folder,
                                                config['val_dataset'],
                                                'annotations', 'validation'),
                          img_ext=config['img_ext'],
                          mask_ext=config['mask_ext'],
                          num_classes=config['num_classes'],
                          input_channels=config['input_channels'],
                          transform=val_transform,
                          from_file=None)
    test_dataset = Dataset(img_ids=test_img_ids,
                           img_dir=os.path.join(input_folder,
                                                config['val_dataset'],
                                                'images', 'test'),
                           mask_dir=os.path.join(input_folder,
                                                 config['val_dataset'],
                                                 'annotations', 'test'),
                           img_ext=config['img_ext'],
                           mask_ext=config['mask_ext'],
                           num_classes=config['num_classes'],
                           input_channels=config['input_channels'],
                           transform=val_transform,
                           from_file=None)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=config['num_workers'],
        drop_last=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,  #config['batch_size'],
        shuffle=False,
        num_workers=config['num_workers'],
        drop_last=False)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,  #config['batch_size'],
        shuffle=False,
        num_workers=config['num_workers'],
        drop_last=False)

    log = OrderedDict([
        ('epoch', []),
        ('lr', []),
        ('loss', []),
        ('iou', []),
        ('dice', []),
        ('val_loss', []),
        ('val_iou', []),
        ('val_dice', []),
    ])

    best_iou = 0
    trigger = 0
    Best_dice = 0
    iou_AtBestDice = 0
    for epoch in range(start_epoch, config['epochs']):
        print('{:s} Epoch [{:d}/{:d}]'.format(config['arch'], epoch,
                                              config['epochs']))
        # train for one epoch
        train_log = train(epoch, config, train_loader, model, criterion,
                          optimizer, cnn_optimizer)
        # evaluate on validation and test sets
        val_log = validate(config, val_loader, model, criterion)
        test_log = validate(config, test_loader, model, criterion)

        # step the scheduler after validation so ReduceLROnPlateau can see the val loss
        if config['optimizer'] == 'SGD':
            if config['scheduler'] == 'CosineAnnealingLR':
                scheduler.step()
            elif config['scheduler'] == 'ReduceLROnPlateau':
                scheduler.step(val_log['loss'])
            elif config['scheduler'] == 'MultiStepLR':
                scheduler.step()

        if Best_dice < test_log['dice']:
            Best_dice = test_log['dice']
            iou_AtBestDice = test_log['iou']
        print(
            'loss %.4f - iou %.4f - dice %.4f - val_loss %.4f - val_iou %.4f - val_dice %.4f - test_iou %.4f - test_dice %.4f - Best_dice %.4f - iou_AtBestDice %.4f'
            % (train_log['loss'], train_log['iou'], train_log['dice'],
               val_log['loss'], val_log['iou'], val_log['dice'],
               test_log['iou'], test_log['dice'], Best_dice, iou_AtBestDice))

        save_tensorboard(writer, train_log, val_log, test_log, epoch)
        log['epoch'].append(epoch)
        log['lr'].append(config['lr'])
        log['loss'].append(train_log['loss'])
        log['iou'].append(train_log['iou'])
        log['dice'].append(train_log['dice'])
        log['val_loss'].append(val_log['loss'])
        log['val_iou'].append(val_log['iou'])
        log['val_dice'].append(val_log['dice'])

        pd.DataFrame(log).to_csv(os.path.join(model_folder,
                                              '%s/log.csv' % config['name']),
                                 index=False)

        trigger += 1

        if val_log['iou'] > best_iou:
            torch.save(
                model.state_dict(),
                os.path.join(model_folder, '%s/model.pth' % config['name']))
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # early stopping
        if config['early_stopping'] >= 0 and trigger >= config['early_stopping']:
            print("=> early stopping")
            break

        torch.cuda.empty_cache()
Example #29
genusnet.to(device)
discriminator_family.to(device)
familynet.to(device)

loss_fn = torch.nn.CrossEntropyLoss()
loss_fn2 = ContrastiveLoss()  # remove
loss_fn3 = SpecLoss()  # remove
# -----------------------------------------------------------------------------
## stage 1: train g and h
print("||||| Stage 1 |||||")
optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(classifier.parameters()) +
    list(ssnet.parameters()) + list(genusnet.parameters()) +
    list(familynet.parameters()),
    lr=0.0001)
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[30], gamma=0.1)

#herb std-mean
#tensor([0.0808, 0.0895, 0.1141])
#tensor([0.7410, 0.7141, 0.6500])
#photo std-mean
#tensor([0.1399, 0.1464, 0.1392])
#tensor([0.2974, 0.3233, 0.2370])

data_transforms = {
    'train':
    transforms.Compose([
        #transforms.Resize((img_size, img_size)),
        transforms.RandomRotation(15),
        #transforms.RandomCrop((img_size, img_size)),
        #transforms.RandomResizedCrop((img_size, img_size)),
Example #30
                                   lr=config['learning_rate'],
                                   weight_decay=0,
                                   betas=(0.9, 0.999))
    init_weights(netG, init_type='kaiming', scale=0.1)
    global_step = 0

    # G pixel loss
    cri_pix = nn.L1Loss().to(device)
    # G feature loss
    cri_fea = nn.L1Loss().to(device)
    # load VGG perceptual loss
    netF = VGGFeatureExtractor(feature_layer=34, use_bn=False).to(device)
    print('# perceptual parameters:',
          sum(param.numel() for param in netF.parameters()))

    scheduler = lr_scheduler.MultiStepLR(optimizer_G,
                                         [50000, 100000, 200000, 300000], 0.5)

    log_dict = OrderedDict()
    netG.train()
    for epoch in trange(config['number_epochs']):
        train_bar = tqdm(train_loader)
        train_bar.set_description_str(desc=f"N epochs - {epoch}")

        scheduler.step()
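        # Note: the milestones above (50k-300k) read like iteration counts,
        # while scheduler.step() runs once per epoch here; if per-iteration
        # decay is intended, the step would belong inside the batch loop.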

        for step, (lr, hr) in enumerate(train_bar):
            global_step += 1

            lr = torch.autograd.Variable(lr, requires_grad=True).to(device)
            hr = torch.autograd.Variable(hr, requires_grad=True).to(device)
            sr = netG(lr)