Example #1
def train(config, generator, discriminator, kp_detector, checkpoint, log_dir,
          dataset, device_ids):
    train_params = config['train_params']

    optimizer_generator = torch.optim.Adam(generator.parameters(),
                                           lr=train_params['lr_generator'],
                                           betas=(0.5, 0.999))
    optimizer_discriminator = torch.optim.Adam(
        discriminator.parameters(),
        lr=train_params['lr_discriminator'],
        betas=(0.5, 0.999))
    optimizer_kp_detector = torch.optim.Adam(kp_detector.parameters(),
                                             lr=train_params['lr_kp_detector'],
                                             betas=(0.5, 0.999))

    if checkpoint is not None:
        start_epoch = Logger.load_cpk(
            checkpoint, generator, discriminator, kp_detector,
            optimizer_generator, optimizer_discriminator,
            None if train_params['lr_kp_detector'] == 0 else optimizer_kp_detector)
    else:
        start_epoch = 0

    scheduler_generator = MultiStepLR(optimizer_generator,
                                      train_params['epoch_milestones'],
                                      gamma=0.1,
                                      last_epoch=start_epoch - 1)
    scheduler_discriminator = MultiStepLR(optimizer_discriminator,
                                          train_params['epoch_milestones'],
                                          gamma=0.1,
                                          last_epoch=start_epoch - 1)
    scheduler_kp_detector = MultiStepLR(optimizer_kp_detector,
                                        train_params['epoch_milestones'],
                                        gamma=0.1,
                                        last_epoch=-1 + start_epoch *
                                        (train_params['lr_kp_detector'] != 0))

    if 'num_repeats' in train_params and train_params['num_repeats'] != 1:
        dataset = DatasetRepeater(dataset, train_params['num_repeats'])
    dataloader = DataLoader(dataset,
                            batch_size=train_params['batch_size'],
                            shuffle=True,
                            num_workers=6,
                            drop_last=True)

    generator_full = GeneratorFullModel(kp_detector, generator, discriminator,
                                        train_params)
    discriminator_full = DiscriminatorFullModel(kp_detector, generator,
                                                discriminator, train_params)

    if torch.cuda.is_available():
        generator_full = DataParallelWithCallback(generator_full,
                                                  device_ids=device_ids)
        discriminator_full = DataParallelWithCallback(discriminator_full,
                                                      device_ids=device_ids)

    with Logger(log_dir=log_dir,
                visualizer_params=config['visualizer_params'],
                checkpoint_freq=train_params['checkpoint_freq']) as logger:
        for epoch in trange(start_epoch, train_params['num_epochs']):
            for x in dataloader:
                losses_generator, generated = generator_full(x)

                loss_values = [val.mean() for val in losses_generator.values()]
                loss = sum(loss_values)

                loss.backward()
                optimizer_generator.step()
                optimizer_generator.zero_grad()
                optimizer_kp_detector.step()
                optimizer_kp_detector.zero_grad()

                if train_params['loss_weights']['generator_gan'] != 0:
                    optimizer_discriminator.zero_grad()
                    losses_discriminator = discriminator_full(x, generated)
                    loss_values = [
                        val.mean() for val in losses_discriminator.values()
                    ]
                    loss = sum(loss_values)

                    loss.backward()
                    optimizer_discriminator.step()
                    optimizer_discriminator.zero_grad()
                else:
                    losses_discriminator = {}

                losses_generator.update(losses_discriminator)
                losses = {
                    key: value.mean().detach().data.cpu().numpy()
                    for key, value in losses_generator.items()
                }
                logger.log_iter(losses=losses)

            scheduler_generator.step()
            scheduler_discriminator.step()
            scheduler_kp_detector.step()

            logger.log_epoch(epoch, {
                'generator': generator,
                'discriminator': discriminator,
                'kp_detector': kp_detector,
                'optimizer_generator': optimizer_generator,
                'optimizer_discriminator': optimizer_discriminator,
                'optimizer_kp_detector': optimizer_kp_detector
            },
                             inp=x,
                             out=generated)
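
Example #1 resumes its schedulers by passing last_epoch=start_epoch - 1. Below is a minimal sketch of that resume pattern, with a placeholder model and illustrative milestones (not taken from the example); note that PyTorch expects an 'initial_lr' entry in each param group whenever last_epoch is not -1:

import torch
from torch.optim.lr_scheduler import MultiStepLR

model = torch.nn.Linear(4, 4)  # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4, betas=(0.5, 0.999))

start_epoch = 3  # e.g. recovered from a checkpoint
for group in optimizer.param_groups:
    group.setdefault('initial_lr', group['lr'])  # required when last_epoch != -1

# The scheduler resumes as if it had already run `start_epoch` epochs,
# so the learning rate matches the restored epoch counter right away.
scheduler = MultiStepLR(optimizer, milestones=[2, 6], gamma=0.1,
                        last_epoch=start_epoch - 1)
print(optimizer.param_groups[0]['lr'])  # approx. 2e-05: milestone 2 has passed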
Example #2
def run(args):
    if not os.path.exists('outputs'):
        os.mkdir('outputs')

    # Select the optimization criterion/task
    if args.loss == 'CE':
        # Classification
        LearnerClass = Learner_Classification
        criterion = nn.CrossEntropyLoss()
    elif args.loss in ['KCL', 'CCL']:
        # Clustering
        LearnerClass = Learner_Clustering
        criterion = modules.criterion.__dict__[args.loss]()
    elif args.loss == 'DPS':
        # Dense-Pair Similarity Learning
        LearnerClass = Learner_DensePairSimilarity
        criterion = nn.CrossEntropyLoss()
        args.out_dim = 2  # force binary output for pair similarity

    # Prepare dataloaders
    train_loader, eval_loader = dataloaders.default.__dict__[args.dataset](
        args.batch_size, args.workers)

    # Prepare the model
    if args.out_dim < 0:  # Use ground-truth number of classes/clusters
        args.out_dim = train_loader.num_classes
    model = LearnerClass.create_model(args.model_type, args.model_name,
                                      args.out_dim)

    # Load pre-trained model
    if args.pretrained_model != '':  # Load model weights only
        print('=> Load model weights:', args.pretrained_model)
        model_state = torch.load(args.pretrained_model,
                                 map_location=lambda storage, loc: storage
                                 )  # Load to CPU as the default!
        model.load_state_dict(model_state, strict=args.strict)
        print('=> Load Done')

    # Load the pre-trained Similarity Prediction Network (SPN, or the G function in the paper)
    if args.use_SPN:
        # To load a custom SPN, you can modify here.
        SPN = Learner_DensePairSimilarity.create_model(args.SPN_model_type,
                                                       args.SPN_model_name, 2)
        print('=> Load SPN model weights:', args.SPN_pretrained_model)
        SPN_state = torch.load(args.SPN_pretrained_model,
                               map_location=lambda storage, loc: storage
                               )  # Load to CPU as the default!
        SPN.load_state_dict(SPN_state)
        print('=> Load SPN Done')
        print('SPN model:', SPN)
        #SPN.eval()  # Tips: Stay in train mode, so the BN layers of SPN adapt to the new domain
        args.SPN = SPN  # It will be used in prepare_task_target()

    # GPU
    if args.use_gpu:
        torch.cuda.set_device(args.gpuid[0])
        cudnn.benchmark = True  # make it train faster
        model = model.cuda()
        criterion = criterion.cuda()
        if args.SPN is not None:
            args.SPN = args.SPN.cuda()

    # Multi-GPU
    if len(args.gpuid) > 1:
        model = torch.nn.DataParallel(model,
                                      device_ids=args.gpuid,
                                      output_device=args.gpuid[0])

    print('Main model:', model)
    print('Criterion:', criterion)

    # Evaluation Only
    if args.skip_train:
        cudnn.benchmark = False  # save warm-up time
        eval_loader = eval_loader if eval_loader is not None else train_loader
        KPI = evaluate(eval_loader, model, args)
        return KPI

    # Prepare the learner
    optim_args = {'lr': args.lr}
    if args.optimizer == 'SGD':
        optim_args['momentum'] = 0.9
    optimizer = torch.optim.__dict__[args.optimizer](model.parameters(),
                                                     **optim_args)
    scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=0.1)
    learner = LearnerClass(model, criterion, optimizer, scheduler)

    # Start optimization
    if args.resume:
        args.start_epoch = learner.resume(
            args.resume) + 1  # Start from next epoch
    for epoch in range(args.start_epoch, args.epochs):
        train(epoch, train_loader, learner, args)
        if eval_loader is not None and ((not args.skip_eval) or
                                        (epoch == args.epochs - 1)):
            KPI = evaluate(eval_loader, model, args)
            # Save checkpoint at each LR steps and the end of optimization
            if epoch + 1 in args.schedule + [args.epochs]:
                learner.snapshot(
                    "outputs/%s_%s_%s" %
                    (args.dataset, args.model_name, args.saveid), KPI)
    return KPI
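
Example #2 looks the optimizer class up by name through torch.optim.__dict__. Here is a self-contained sketch of that lookup pattern, using an illustrative model and hyperparameters rather than the script's args:

import torch
from torch.optim.lr_scheduler import MultiStepLR

model = torch.nn.Linear(8, 2)  # placeholder model
optim_name = 'SGD'             # would come from args.optimizer
optim_args = {'lr': 0.1}
if optim_name == 'SGD':
    optim_args['momentum'] = 0.9  # only SGD takes momentum here

# torch.optim.__dict__ maps class names ('SGD', 'Adam', ...) to optimizer classes
optimizer = torch.optim.__dict__[optim_name](model.parameters(), **optim_args)
scheduler = MultiStepLR(optimizer, milestones=[60, 80], gamma=0.1)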
Example #3
def main():
    args.cfg = v2
    args.train_sets = 'train'
    args.means = (104, 117, 123)
    num_classes = len(CLASSES) + 1
    args.num_classes = num_classes
    args.stepvalues = [int(val) for val in args.stepvalues.split(',')]
    args.loss_reset_step = 30
    args.eval_step = 10000
    args.print_step = 10

    ## Define the experiment name; it is used to name the save directory and the visdom ENV
    args.exp_name = 'CONV-SSD-{}-{}-bs-{}-{}-lr-{:05d}'.format(
        args.dataset, args.input_type, args.batch_size, args.basenet[:-14],
        int(args.lr * 100000))

    args.save_root += args.dataset + '/'
    args.save_root = args.save_root + 'cache/' + args.exp_name + '/'

    if not os.path.isdir(args.save_root):
        os.makedirs(args.save_root)

    net = build_ssd(300, args.num_classes)

    if args.cuda:
        net = net.cuda()

    def xavier(param):
        init.xavier_uniform_(param)

    def weights_init(m):
        if isinstance(m, nn.Conv2d):
            xavier(m.weight.data)
            m.bias.data.zero_()

    print('Initializing weights for extra layers and HEADs...')
    # initialize newly added layers' weights with xavier method
    net.extras.apply(weights_init)
    net.loc.apply(weights_init)
    net.conf.apply(weights_init)

    if args.input_type == 'fastOF':
        print(
            'Download the pretrained Brox flow model weights and place them at => ',
            args.data_root + 'ucf24/train_data/brox_wieghts.pth')
        pretrained_weights = args.data_root + 'ucf24/train_data/brox_wieghts.pth'
        print('Loading base network...')
        net.load_state_dict(torch.load(pretrained_weights))
    else:
        vgg_weights = torch.load(args.data_root + 'ucf24/train_data/' +
                                 args.basenet)
        print('Loading base network...')
        net.vgg.load_state_dict(vgg_weights)

    args.data_root += args.dataset + '/'

    parameter_dict = dict(
        net.named_parameters())  # network parameters as a dict keyed by name
    params = []

    # Set a different learning rate for bias parameters and set their weight_decay to 0
    for name, param in parameter_dict.items():
        if name.find('bias') > -1:
            print(name,
                  'layer parameters will be trained @ {}'.format(args.lr * 2))
            params += [{
                'params': [param],
                'lr': args.lr * 2,
                'weight_decay': 0
            }]
        else:
            print(name,
                  'layer parameters will be trained @ {}'.format(args.lr))
            params += [{
                'params': [param],
                'lr': args.lr,
                'weight_decay': args.weight_decay
            }]

    optimizer = optim.SGD(params,
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(args.num_classes, 0.5, True, 0, True, 3, 0.5,
                             False, args.cuda)
    scheduler = MultiStepLR(optimizer,
                            milestones=args.stepvalues,
                            gamma=args.gamma)
    train(args, net, optimizer, criterion, scheduler)
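
Example #3 trains bias parameters at twice the base learning rate with no weight decay. Below is a short sketch of that per-parameter-group setup on a toy layer (values are illustrative); MultiStepLR multiplies every group's lr by gamma at each milestone, so the 2x ratio between the groups is preserved:

import torch
from torch import nn, optim
from torch.optim.lr_scheduler import MultiStepLR

net = nn.Conv2d(3, 16, 3)            # placeholder network
base_lr, weight_decay = 1e-3, 5e-4   # illustrative values

bias_params = [p for n, p in net.named_parameters() if 'bias' in n]
other_params = [p for n, p in net.named_parameters() if 'bias' not in n]
optimizer = optim.SGD(
    [{'params': bias_params, 'lr': base_lr * 2, 'weight_decay': 0},
     {'params': other_params, 'lr': base_lr, 'weight_decay': weight_decay}],
    lr=base_lr, momentum=0.9)

# Both group lrs are scaled by gamma at each milestone, keeping the 2x ratio.
scheduler = MultiStepLR(optimizer, milestones=[30, 60], gamma=0.1)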
Example #4
        save_dir=save_dir)  # load the last model in matconvnet style
    if initial_epoch > 0:
        print('resuming by loading epoch %03d' % initial_epoch)
        # model.load_state_dict(torch.load(os.path.join(save_dir, 'model_%03d.pth' % initial_epoch)))
        model = torch.load(
            os.path.join(save_dir, 'model_%03d.pth' % initial_epoch))
    model.train()
    criterion = nn.MSELoss(reduction='sum')  # PyTorch 0.4.1
    # criterion = sum_squared_error()
    if cuda:
        model = model.cuda()
        # device_ids = [0]
        # model = nn.DataParallel(model, device_ids=device_ids).cuda()
        # criterion = criterion.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = MultiStepLR(optimizer, milestones=[30, 60, 90],
                            gamma=0.2)  # learning rates
    for epoch in range(initial_epoch, n_epoch):

        scheduler.step(epoch)  # step to the learning rate for this epoch
        xs = dg.datagenerator(data_dir=args.train_data)
        xs = xs.astype('float32') / 255.0
        xs = torch.from_numpy(xs.transpose(
            (0, 3, 1, 2)))  # tensor of the clean patches, NXCXHXW
        DDataset = DenoisingDataset(xs, sigma)
        DLoader = DataLoader(dataset=DDataset,
                             num_workers=0,
                             drop_last=True,
                             batch_size=batch_size,
                             shuffle=True)
        epoch_loss = 0
        start_time = time.time()
Example #5
    idx_tensor = [idx for idx in range(67)]
    idx_tensor = Variable(torch.FloatTensor(idx_tensor)).cuda(gpu)

    optimizer = torch.optim.Adam([{
        'params': get_ignored_params(model),
        'lr': 0
    }, {
        'params': get_non_ignored_params(model),
        'lr': args.lr
    }, {
        'params': get_fc_params(model),
        'lr': args.lr * 5
    }],
                                 lr=args.lr)

    lr_scheduler = MultiStepLR(optimizer, [30, 80], gamma=0.1, last_epoch=-1)

    print('Ready to train network.')
    for epoch in range(num_epochs):
        lr_scheduler.step()

        for i, (images, labels, cont_labels, name) in enumerate(train_loader):
            images = Variable(images).cuda(gpu)

            # Binned labels
            label_yaw = Variable(labels[:, 0]).cuda(gpu)
            label_pitch = Variable(labels[:, 1]).cuda(gpu)
            label_roll = Variable(labels[:, 2]).cuda(gpu)

            # Continuous labels
            label_yaw_cont = Variable(cont_labels[:, 0]).cuda(gpu)
Example #6
def train(args, io):
    train_loader = DataLoader(ModelNetNormal(args.num_points,
                                             partition='train'),
                              num_workers=8,
                              batch_size=args.batch_size,
                              shuffle=True,
                              drop_last=True)
    test_loader = DataLoader(ModelNetNormal(args.num_points, partition='test'),
                             num_workers=8,
                             batch_size=args.test_batch_size,
                             shuffle=False,
                             drop_last=False)

    device = torch.device("cuda" if args.cuda else "cpu")

    # create model
    model = CurveNet(args.multiplier).to(device)
    model = nn.DataParallel(model)
    io.cprint("Let's use" + str(torch.cuda.device_count()) + "GPUs!")

    if args.use_sgd:
        io.cprint("Use SGD")
        opt = optim.SGD(model.parameters(),
                        lr=args.lr * 100,
                        momentum=args.momentum,
                        weight_decay=1e-4)
    else:
        io.cprint("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    if args.scheduler == 'cos':
        scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=1e-3)
    elif args.scheduler == 'step':
        scheduler = MultiStepLR(opt, [140, 180], gamma=0.1)

    criterion = torch.nn.CosineEmbeddingLoss()

    best_test_loss = 99
    for epoch in range(args.epochs):
        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        for data, seg in train_loader:
            data, seg = data.to(device), seg.to(device)
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            opt.zero_grad()
            seg_pred = model(data)
            seg_pred = seg_pred.permute(0, 2, 1).contiguous()
            #print(seg_pred.shape, seg.shape)
            loss = criterion(seg_pred.view(-1, 3),
                             seg.view(-1, 3).squeeze(),
                             torch.tensor(1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            opt.step()
            count += batch_size
            train_loss += loss.item() * batch_size

        if args.scheduler == 'cos':
            scheduler.step()
        elif args.scheduler == 'step':
            if opt.param_groups[0]['lr'] > 1e-5:
                scheduler.step()
            if opt.param_groups[0]['lr'] < 1e-5:
                for param_group in opt.param_groups:
                    param_group['lr'] = 1e-5

        outstr = 'Train %d, loss: %.6f' % (epoch, train_loss / count)
        io.cprint(outstr)

        ####################
        # Test
        ####################
        test_loss = 0.0
        count = 0.0
        model.eval()
        for data, seg in test_loader:
            data, seg = data.to(device), seg.to(device)
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            seg_pred = model(data)
            seg_pred = seg_pred.permute(0, 2, 1).contiguous()

            loss = criterion(seg_pred.view(-1, 3),
                             seg.view(-1, 3).squeeze(),
                             torch.tensor(1).cuda())
            count += batch_size
            test_loss += loss.item() * batch_size

        if test_loss * 1.0 / count <= best_test_loss:
            best_test_loss = test_loss * 1.0 / count
            torch.save(model.state_dict(),
                       '../checkpoints/%s/models/model.t7' % args.exp_name)
        outstr = 'Test %d, loss: %.6f, best loss %.6f' % (
            epoch, test_loss / count, best_test_loss)
        io.cprint(outstr)
Example #7
def train(args,
          dataloader_train,
          model,
          feature_map,
          dataloader_validate=None):
    # initialize optimizer
    optimizer = {}
    for name, net in model.items():
        optimizer['optimizer_' + name] = optim.Adam(filter(
            lambda p: p.requires_grad, net.parameters()),
                                                    lr=args.lr,
                                                    weight_decay=5e-5)

    scheduler = {}
    for name, net in model.items():
        scheduler['scheduler_' + name] = MultiStepLR(
            optimizer['optimizer_' + name],
            milestones=args.milestones,
            gamma=args.gamma)

    if args.load_model:
        load_model(args.load_model_path, args.device, model, optimizer,
                   scheduler)
        print('Model loaded')

        epoch = get_model_attribute('epoch', args.load_model_path, args.device)
    else:
        epoch = 0

    if args.log_tensorboard:
        writer = SummaryWriter(log_dir=args.tensorboard_path + args.fname +
                               ' ' + args.time,
                               flush_secs=5)
    else:
        writer = None

    while epoch < args.epochs:
        loss, acc = train_epoch(epoch, args, model, dataloader_train,
                                optimizer, scheduler, feature_map, writer)
        epoch += 1
        print('Epoch: {}/{}, train loss: {:.3f}, accuracy: {:.3f}'.format(
            epoch, args.epochs, loss, acc))

        # logging
        if args.log_tensorboard:
            writer.add_scalar(
                '{} {} Loss/train'.format(args.note, args.graph_type), loss,
                epoch)

        # save model checkpoint
        if args.save_model and epoch != 0 and epoch % args.epochs_save == 0:
            save_model(epoch,
                       args,
                       model,
                       optimizer,
                       scheduler,
                       feature_map=feature_map)
            print('Model Saved - Epoch: {}/{}, train loss: {:.6f}'.format(
                epoch, args.epochs, loss))

        if dataloader_validate is not None and epoch % args.epochs_validate == 0:
            loss_validate = test_data(args, model, dataloader_validate,
                                      feature_map)
            if args.log_tensorboard:
                writer.add_scalar(
                    '{} {} Loss/validate'.format(args.note, args.graph_type),
                    loss_validate, epoch)
            else:
                print('Epoch: {}/{}, validation loss: {:.6f}'.format(
                    epoch, args.epochs, loss_validate))

    save_model(epoch,
               args,
               model,
               optimizer,
               scheduler,
               feature_map=feature_map)
    print('Model Saved - Epoch: {}/{}, train loss: {:.6f}'.format(
        epoch, args.epochs, loss))
Example #8
    def test_multi_step_lr_state_dict(self):
        self._check_scheduler_state_dict(
            lambda: MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]),
            lambda: MultiStepLR(self.opt, gamma=0.01, milestones=[1, 4, 6]))
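
Example #8 checks MultiStepLR's state_dict round trip. A minimal sketch of saving and restoring scheduler state, with an illustrative optimizer and milestones:

import torch
from torch.optim.lr_scheduler import MultiStepLR

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.05)
scheduler = MultiStepLR(opt, gamma=0.1, milestones=[2, 5, 9])
for _ in range(3):
    opt.step()
    scheduler.step()

state = scheduler.state_dict()  # plain dict; can be persisted with torch.save

# A freshly constructed scheduler picks up the step count from the saved state.
new_scheduler = MultiStepLR(opt, gamma=0.1, milestones=[2, 5, 9])
new_scheduler.load_state_dict(state)
assert new_scheduler.last_epoch == scheduler.last_epoch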
Example #9
def run_training():
    """
    Run the cross validation given the data (x_data, y), the shuffle-split object,
    the modified model, the number of epochs, and whether to use the GPU. The model
    is trained and tested for the given number of epochs on each cross-validation
    split, and the accuracies and models are saved.
    """
    print("\nRunning with model: {} {}\n".format(args.model_name,
                                                 args.train_config))

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # create the directory of the model type in case it does not exist
    try:
        os.makedirs(args.fp_save)
        print("Creating directory: ", args.fp_save)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    #--------------------------------------------------------------------------------------------------------------#
    # Data loading and preprocessing

    filenames_allfiles, y = get_data()

    # # get only filenames from tuesday, as only day with every sample ID
    # filenames_allfiles_tuesday = np.array([name for name in glob.glob(args.fp_data + "2/*")])
    # y_tuesday = []
    # for fname in filenames_allfiles_tuesday:
    #     fname = fname.split(".JPEG")[0].split("/")[-1].replace("_", ",")
    #     y_tuesday.append(get_dai_label(fname))
    # y_tuesday = np.array(y_tuesday)
    # y_tuesday[y_tuesday > 0] = 1
    #
    # # create shufflesplit object, based only on tuesday first
    # rs = StratifiedShuffleSplit(n_splits=args.n_cvruns, test_size=0.25, random_state=args.seed)
    #
    # cv_splits = [[train_index, test_index] for train_index, test_index in rs.split(filenames_allfiles_tuesday, y_tuesday)]
    #
    # # get the desired cv split
    # train_index, test_index = cv_splits[args.cv_run]
    #
    # # now put all filenames of the plant ids previously put into train and test also into overall train and test
    # # i.e. plant Z1_0_0_0 from train_index will be in train_index_all regardless of the day, so 1-5
    # train_index_all = []
    # test_index_all = []
    # for idx in train_index:
    #     fname = filenames_allfiles_tuesday[idx]
    #     plant_id = fname.split('/2_')[-1]
    #     train_index_all += [i for i in range(len(filenames_allfiles)) if plant_id in filenames_allfiles[i]]
    # for idx in test_index:
    #     fname = filenames_allfiles_tuesday[idx]
    #     plant_id = fname.split('/2_')[-1]
    #     test_index_all += [i for i in range(len(filenames_allfiles)) if plant_id in filenames_allfiles[i]]
    #
    # train_index = train_index_all
    # test_index = test_index_all
    #
    # train_samples = [fname.split('/')[-1].split('.')[0] for fname in filenames_allfiles[train_index]]
    # test_samples = [fname.split('/')[-1].split('.')[0] for fname in filenames_allfiles[test_index]]

    # load data and permute
    if not args.rrr:
        filenames_allfiles, x_data, y, perm_ids = imread_from_fp_rescale_rotate_flatten(
            fplist=filenames_allfiles,
            y=y,
            rescale_size=224,
            n_rot_per_img=args.n_rotations)
    else:
        print("Loading masks as well ...")
        filenames_allfiles, x_data, masks, y, perm_ids = imread_from_fp_rescale_rotate_flatten_returnmasks(
            fplist=filenames_allfiles,
            fp_mask=args.fp_mask,
            y=y,
            rescale_size=224,
            n_rot_per_img=args.n_rotations,
            rrr=args.rrr,
            model=args.model_name)

    # get split from corresponding cv run, based on permutated filenames_allfiles
    train_index, test_index, train_samples, test_samples = get_data_split(
        filenames_allfiles)

    print("Images read")
    print("Size of data:" + str(sys.getsizeof(x_data) * 1e-9) + " GB")

    print("\n----------------------------------------------------------------")
    print("Model: {}".format(args.model_name))
    print("Cross validation round: " + str(args.cv_run))

    print("StandardScaler being fit using training data...")
    std_scaler = preprocessing.StandardScaler()
    std_scaler = std_scaler.fit(x_data[train_index, :])

    print("StandardScaler transform being applied ...")
    x_norm = reshape_flattened_to_tensor_rgb(std_scaler.transform(x_data),
                                             width_height=224)

    print("Data reshaped ...")

    #--------------------------------------------------------------------------------------------------------------#
    # Loading model

    # load pretrained model
    print("Loading the pretrained torchvision model.")
    model, _, model_params = load_model(num_classes=args.num_classes,
                                        feature_extract=False,
                                        use_pretrained=True)
    model.target_output = None

    optimizer = torch.optim.Adam(model_params,
                                 lr=args.lr,
                                 amsgrad=True,
                                 weight_decay=1e-5)

    # set optimizer and criterion
    weights = [
        np.sum(y[train_index] == i) / len(y[train_index])
        for i in np.arange(0, args.num_classes)
    ]
    class_weights = torch.FloatTensor(weights)
    if args.cuda:
        class_weights = class_weights.cuda()

    if args.rrr:
        print("Using NLLLoss")
        criterion_train = nn.NLLLoss(weight=class_weights).cuda()
        criterion_test = nn.CrossEntropyLoss(weight=class_weights)
    else:
        print("Using CrossEntropyLoss")
        criterion_train = nn.CrossEntropyLoss(weight=class_weights)
        criterion_test = nn.CrossEntropyLoss(weight=class_weights)

    if not args.rrr:
        scheduler = MultiStepLR(optimizer,
                                milestones=[5, 15],
                                gamma=0.1,
                                last_epoch=-1)

    if args.cuda:
        model.cuda()

    print('Model successfully loaded ...')

    #--------------------------------------------------------------------------------------------------------------#
    # Setting up Dataloaders

    # create tensors for dataloaders
    train_tensors = (torch.tensor(x_norm[train_index, :, :, :]),
                     torch.tensor(y[train_index]))
    test_tensors = (torch.tensor(x_norm[test_index, :, :, :]),
                    torch.tensor(y[test_index]))

    print("Tensors created")

    # if required add masks
    if args.rrr:
        masks = np.reshape(masks, (masks.shape[0], 14, 14))
        train_tensors = train_tensors + (torch.tensor(masks[train_index]), )
        test_tensors = test_tensors + (torch.tensor(masks[test_index]), )

    train_dataset = CustomRGBDataset(tensors=train_tensors)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size)
    test_dataset = CustomRGBDataset(tensors=test_tensors)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size)

    print('Data successfully loaded ...')

    #--------------------------------------------------------------------------------------------------------------#
    # Training process

    print('Training beginning ...')

    last_checkpoint_fp = 0
    last_best_checkpoint_fp = 0

    last_test_acc = -10

    raloss_array = np.ones(args.n_epochs)
    rrloss_array = np.ones(args.n_epochs)
    bal_acc_train_array = np.ones(args.n_epochs)
    bal_acc_test_array = np.ones(args.n_epochs)

    # train on number of epochs
    for epoch in np.arange(1, args.n_epochs + 1):
        with torch.set_grad_enabled(True):
            epoch_train_loss, epoch_train_acc, epoch_ra_loss, epoch_rr_loss = train(
                model,
                train_loader,
                criterion_train,
                class_weights=class_weights,
                optimizer=optimizer,
                verbose=0)
            print(
                "Epoch: {} Train Loss : {:.4f}  Train Accuracy: {:.4f} "
                "Train Right Answer Loss: {:.4f} Train Right Reason Loss: {:.4f}"
                .format(epoch, epoch_train_loss, epoch_train_acc,
                        epoch_ra_loss, epoch_rr_loss))
            raloss_array[epoch - 1] = epoch_ra_loss
            rrloss_array[epoch - 1] = epoch_rr_loss
            bal_acc_train_array[epoch - 1] = epoch_train_acc

        with torch.set_grad_enabled(False):
            print("Evaluation running...")
            epoch_test_loss, epoch_test_acc = test(model,
                                                   test_loader,
                                                   criterion_test,
                                                   verbose=0)
            print("Epoch: {} Test Loss : {:.4f}  Test Accuracy: {:.4f}".format(
                epoch, epoch_test_loss, epoch_test_acc))

            bal_acc_test_array[epoch - 1] = epoch_test_acc

            if not args.rrr:
                print('Epoch:', epoch, 'LR:', scheduler.get_lr())
                scheduler.step()

            # save current model
            checkpoint = {
                'model_state': model.state_dict(),
                'std_scaler': std_scaler,
                'optimizer_state': optimizer.state_dict(),
                'args': args,
                'epoch': epoch,
                'test_loss': epoch_test_loss,
                'test_acc': epoch_test_acc,
            }

            if epoch > 1:
                # remove checkpoint saved in last epoch
                os.remove(last_checkpoint_fp)

            last_checkpoint_fp = "{}{}_cvnum_{}_epoch_{}_evalbalacc_{}_trainraloss_" \
                                 "{}_trainrrrloss_{}.pth".format(args.fp_save, args.model_name,
                                                                             str(args.cv_run), str(epoch),
                                                                             str(int(1e+4 * round(epoch_test_acc, 4))),
                                                                             str(round(epoch_ra_loss, 4)),
                                                                             str(round(epoch_rr_loss, 4)))

            torch.save(checkpoint, last_checkpoint_fp)

            # if the test acc is higher than any model before, store this model separately
            if epoch_test_acc > last_test_acc and not args.rrr:
                if epoch > 1:
                    # delete last best model
                    os.remove(last_best_checkpoint_fp)

                last_best_checkpoint_fp = "{}{}_cvnum_{}_epoch_{}_evalbalacc_{}_trainraloss_" \
                                          "{}_trainrrrloss_{}_besttestacc.pth".format(args.fp_save, args.model_name,
                                                                     str(args.cv_run), str(epoch),
                                                                     str(int(1e+4 * round(epoch_test_acc, 4))),
                                                                     str(round(epoch_ra_loss, 4)),
                                                                     str(round(epoch_rr_loss, 4)))

                torch.save(checkpoint, last_best_checkpoint_fp)
                last_test_acc = epoch_test_acc
                print("Saving new best model...")

    print("\nTraining " + args.model_name +
          " finished\n-------------------------------------------\n")
Example #10
def main(args):
    global best_metric_value

    train_transforms = transforms.Compose([
        transforms.Resize(512, minside=False),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10, resample=Image.BILINEAR),
        transforms.PadToSize(480),
        transforms.RandomResizedCrop(480, scale=(0.5, 2), ratio=(1, 1)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    val_transforms = transforms.Compose([
        transforms.Resize(512, minside=False),
        transforms.PadToSize(480),
        transforms.CenterCrop(480),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # Setup Dataset and Dataloader
    data_loader = get_loader(args.dataset)
    data_path = get_data_path(args.dataset)
    # Train
    train_dataset = data_loader(data_path, transform=train_transforms)
    args.n_classes = train_dataset.n_classes
    trainloader = data.DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True,
                                  pin_memory=True)
    # Validation
    val_dataset = data_loader(data_path, split='val', transform=val_transforms)
    valloader = data.DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                num_workers=args.num_workers,
                                shuffle=False,
                                pin_memory=True)

    # Setup Model
    model = get_model(args)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_metric_value = checkpoint['best_metric_value']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model,
                                      device_ids=range(
                                          torch.cuda.device_count())).cuda()
        cudnn.benchmark = True

    if args.lr_policy == "MultiStepLR":
        scheduler = MultiStepLR(
            optimizer, milestones=[int(x) for x in args.milestones.split(',')])

    loss_viswindow = vis.line(X=torch.zeros((1, )).cpu(),
                              Y=torch.zeros((1, 2)).cpu(),
                              opts=dict(xlabel='Epochs',
                                        ylabel='Loss',
                                        title='Loss through Epochs',
                                        legend=['Train', 'Val']))

    # Open log file
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    log_file = open(os.path.join(args.save_path, 'logs.txt'), 'w')
    log_header = 'epoch'
    log_header += ',train_loss'
    for m in args.metrics:
        log_header += ',train_' + m
    log_header += ',val_loss'
    for m in args.metrics:
        log_header += ',val_' + m
    log_file.write(log_header + '\n')

    # Main training loop
    for epoch in range(args.start_epoch, args.n_epoch):

        trainmetrics = train(trainloader, model, cross_entropy2d, optimizer,
                             epoch, args)
        args.split = 'val'
        valmetrics = validate(valloader, model, cross_entropy2d, epoch, args)
        if args.lr_policy == "MultiStepLR":
            scheduler.step()

        # Write log file
        log_line = '{}'.format(epoch)
        log_line += ',{:.3f}'.format(trainmetrics['loss'].avg)
        for m in trainmetrics['metrics'].meters:
            log_line += ',{:.3f}'.format(m.avg)
        log_line += ',{:.3f}'.format(valmetrics['loss'].avg)
        for m in valmetrics['metrics'].meters:
            log_line += ',{:.3f}'.format(m.avg)
        log_file.write(log_line + '\n')

        # Track loss through epochs
        vis.line(X=torch.ones((1, 2)).cpu() * epoch,
                 Y=torch.Tensor(
                     [trainmetrics['loss'].avg,
                      valmetrics['loss'].avg]).unsqueeze(0).cpu(),
                 win=loss_viswindow,
                 update='append')

        # Take best and save model
        curr_metric_value = valmetrics['metrics'].meters[0].avg
        is_best = curr_metric_value > best_metric_value
        best_metric_value = max(curr_metric_value, best_metric_value)
        if epoch % args.save_every == 0 and epoch != 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'args': args,
                    'state_dict': model.module.state_dict(),
                    'best_metric_value': best_metric_value,
                    'optimizer': optimizer.state_dict(),
                },
                os.path.join(
                    args.save_path,
                    "{}_{}_{}.pth".format(args.arch, args.dataset, epoch)))
        if is_best:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'args': args,
                    'state_dict': model.module.state_dict(),
                    'best_metric_value': best_metric_value,
                    'optimizer': optimizer.state_dict(),
                }, os.path.join(args.save_path, 'model_best.pth.tar'))

    log_file.close()
Example #11
def main_worker(gpu, ngpus_per_node, cfg):
    cfg['GPU'] = gpu
    if gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass
    cfg['RANK'] = cfg['RANK'] * ngpus_per_node + gpu
    dist.init_process_group(backend=cfg['DIST_BACKEND'], init_method = cfg["DIST_URL"], world_size=cfg['WORLD_SIZE'], rank=cfg['RANK'])

    # Data loading code
    batch_size = int(cfg['BATCH_SIZE'])
    per_batch_size = int(batch_size / ngpus_per_node)
    #workers = int((cfg['NUM_WORKERS'] + ngpus_per_node - 1) / ngpus_per_node) # dataload threads
    workers = int(cfg['NUM_WORKERS'])
    DATA_ROOT = cfg['DATA_ROOT'] # the parent root where your train/val/test data are stored
    VAL_DATA_ROOT = cfg['VAL_DATA_ROOT']
    RECORD_DIR = cfg['RECORD_DIR']
    RGB_MEAN = cfg['RGB_MEAN'] # for normalize inputs
    RGB_STD = cfg['RGB_STD']
    DROP_LAST = cfg['DROP_LAST']
    LR_SCHEDULER = cfg['LR_SCHEDULER']
    LR_STEP_SIZE = cfg['LR_STEP_SIZE']
    LR_DECAY_EPOCH = cfg['LR_DECAY_EPOCH']
    LR_DECAT_GAMMA = cfg['LR_DECAT_GAMMA']
    LR_END = cfg['LR_END']
    WARMUP_EPOCH = cfg['WARMUP_EPOCH']
    WARMUP_LR = cfg['WARMUP_LR']
    NUM_EPOCH = cfg['NUM_EPOCH']
    USE_APEX = cfg['USE_APEX']
    EVAL_FREQ = cfg['EVAL_FREQ']
    SYNC_BN = cfg['SYNC_BN']
    print("=" * 60)
    print("Overall Configurations:")
    print(cfg)
    print("=" * 60)
    transform_list = [transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    transforms.Normalize(mean = RGB_MEAN,std = RGB_STD),]
    if cfg['RANDOM_ERASING']:
        transform_list.append(RandomErasing())
    if cfg['CUTOUT']:
        transform_list.append(Cutout())
    train_transform = transforms.Compose(transform_list)
    if cfg['RANDAUGMENT']:
        train_transform.transforms.insert(0, RandAugment(n=cfg['RANDAUGMENT_N'], m=cfg['RANDAUGMENT_M']))
    dataset_train = FaceDataset(DATA_ROOT, RECORD_DIR, train_transform)
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset_train)
    train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=per_batch_size,
                                                shuffle = (train_sampler is None), num_workers=workers,
                                                pin_memory=True, sampler=train_sampler, drop_last=DROP_LAST)
    SAMPLE_NUMS = dataset_train.get_sample_num_of_each_class()
    NUM_CLASS = len(train_loader.dataset.classes)
    print("Number of Training Classes: {}".format(NUM_CLASS))

    lfw, cfp_fp, agedb_30, vgg2_fp, lfw_issame, cfp_fp_issame, agedb_30_issame, vgg2_fp_issame = get_val_data(VAL_DATA_ROOT)

    #======= model & loss & optimizer =======#
    BACKBONE_DICT = {'MobileFaceNet': MobileFaceNet,
                     'ResNet_50': ResNet_50, 'ResNet_101': ResNet_101, 'ResNet_152': ResNet_152,
                     'IR_50': IR_50, 'IR_100': IR_100, 'IR_101': IR_101, 'IR_152': IR_152, 'IR_185': IR_185, 'IR_200': IR_200,
                     'IR_SE_50': IR_SE_50, 'IR_SE_100': IR_SE_100, 'IR_SE_101': IR_SE_101, 'IR_SE_152': IR_SE_152, 'IR_SE_185': IR_SE_185, 'IR_SE_200': IR_SE_200,
                     'AttentionNet_IR_56': AttentionNet_IR_56,'AttentionNet_IRSE_56': AttentionNet_IRSE_56,'AttentionNet_IR_92': AttentionNet_IR_92,'AttentionNet_IRSE_92': AttentionNet_IRSE_92,
                     'PolyNet': PolyNet, 'PolyFace': PolyFace, 'EfficientPolyFace': EfficientPolyFace,
                     'ResNeSt_50': resnest50, 'ResNeSt_101': resnest101, 'ResNeSt_100': resnest100
                    } #'HRNet_W30': HRNet_W30, 'HRNet_W32': HRNet_W32, 'HRNet_W40': HRNet_W40, 'HRNet_W44': HRNet_W44, 'HRNet_W48': HRNet_W48, 'HRNet_W64': HRNet_W64

    BACKBONE_NAME = cfg['BACKBONE_NAME']
    INPUT_SIZE = cfg['INPUT_SIZE']
    assert INPUT_SIZE == [112, 112]
    backbone = BACKBONE_DICT[BACKBONE_NAME](INPUT_SIZE)
    print("=" * 60)
    print(backbone)
    print("{} Backbone Generated".format(BACKBONE_NAME))
    print("=" * 60)
    HEAD_DICT = {'Softmax': Softmax, 'ArcFace': ArcFace, 'Combined': Combined, 'CosFace': CosFace, 'SphereFace': SphereFace,
                 'Am_softmax': Am_softmax, 'CurricularFace': CurricularFace, 'ArcNegFace': ArcNegFace, 'SVX': SVXSoftmax, 
                 'AirFace': AirFace,'QAMFace': QAMFace, 'CircleLoss':CircleLoss
                }
    HEAD_NAME = cfg['HEAD_NAME']
    EMBEDDING_SIZE = cfg['EMBEDDING_SIZE'] # feature dimension
    head = HEAD_DICT[HEAD_NAME](in_features = EMBEDDING_SIZE, out_features = NUM_CLASS)
    print("Params: ", count_model_params(backbone))
    print("Flops:", count_model_flops(backbone))
    #backbone = backbone.eval()
    #print("Flops: ", flops_to_string(2*float(profile_macs(backbone.eval(), torch.randn(1, 3, 112, 112)))))
    #backbone = backbone.train()
    print("=" * 60)
    print(head)
    print("{} Head Generated".format(HEAD_NAME))
    print("=" * 60)


   #--------------------optimizer-----------------------------
    if BACKBONE_NAME.find("IR") >= 0:
        backbone_paras_only_bn, backbone_paras_wo_bn = separate_irse_bn_paras(backbone) # separate batch_norm parameters from others; do not do weight decay for batch_norm parameters to improve the generalizability
    else:
        backbone_paras_only_bn, backbone_paras_wo_bn = separate_resnet_bn_paras(backbone) # separate batch_norm parameters from others; do not do weight decay for batch_norm parameters to improve the generalizability

    LR = cfg['LR'] # initial LR
    WEIGHT_DECAY = cfg['WEIGHT_DECAY']
    MOMENTUM = cfg['MOMENTUM']
    optimizer = optim.SGD([
                            {'params': backbone_paras_wo_bn + list(head.parameters()), 'weight_decay': WEIGHT_DECAY},
                            {'params': backbone_paras_only_bn}
                            ], lr = LR, momentum = MOMENTUM)
    if LR_SCHEDULER == 'step':
        scheduler = StepLR(optimizer, step_size=LR_STEP_SIZE, gamma=LR_DECAT_GAMMA)
    elif LR_SCHEDULER == 'multi_step':
        scheduler = MultiStepLR(optimizer, milestones=LR_DECAY_EPOCH, gamma=LR_DECAT_GAMMA)
    elif LR_SCHEDULER == 'cosine':
        scheduler = CosineWarmupLR(optimizer, batches=len(train_loader), epochs=NUM_EPOCH, base_lr=LR, target_lr=LR_END, warmup_epochs=WARMUP_EPOCH, warmup_lr=WARMUP_LR)

    print("=" * 60)
    print(optimizer)
    print("Optimizer Generated")
    print("=" * 60)

    # loss
    LOSS_NAME = cfg['LOSS_NAME']
    LOSS_DICT = {'Softmax'      : nn.CrossEntropyLoss(),
                 'LabelSmooth'  : LabelSmoothCrossEntropyLoss(classes=NUM_CLASS),
                 'Focal'        : FocalLoss(),
                 'HM'           : HardMining(),
                 'Softplus'     : nn.Softplus()}
    loss = LOSS_DICT[LOSS_NAME].cuda(gpu)
    print("=" * 60)
    print(loss)
    print("{} Loss Generated".format(loss))
    print("=" * 60)

    torch.cuda.set_device(cfg['GPU'])
    backbone.cuda(cfg['GPU'])
    head.cuda(cfg['GPU'])

    #optionally resume from a checkpoint
    BACKBONE_RESUME_ROOT = cfg['BACKBONE_RESUME_ROOT'] # the root to resume training from a saved checkpoint
    HEAD_RESUME_ROOT = cfg['HEAD_RESUME_ROOT']  # the root to resume training from a saved checkpoint
    IS_RESUME = cfg['IS_RESUME']
    if IS_RESUME:
        print("=" * 60)
        if os.path.isfile(BACKBONE_RESUME_ROOT):
            print("Loading Backbone Checkpoint '{}'".format(BACKBONE_RESUME_ROOT))
            loc = 'cuda:{}'.format(cfg['GPU'])
            backbone.load_state_dict(torch.load(BACKBONE_RESUME_ROOT, map_location=loc))
            if os.path.isfile(HEAD_RESUME_ROOT):
                print("Loading Head Checkpoint '{}'".format(HEAD_RESUME_ROOT))
                checkpoint = torch.load(HEAD_RESUME_ROOT, map_location=loc)
                cfg['START_EPOCH'] = checkpoint['EPOCH']
                head.load_state_dict(checkpoint['HEAD'])
                optimizer.load_state_dict(checkpoint['OPTIMIZER'])
                del(checkpoint)
        else:
            print("No Checkpoint Found at '{}' and '{}'. Please Have a Check or Continue to Train from Scratch".format(BACKBONE_RESUME_ROOT, HEAD_RESUME_ROOT))
        print("=" * 60)
    ori_backbone = copy.deepcopy(backbone)
    if SYNC_BN:
        backbone = apex.parallel.convert_syncbn_model(backbone)
    if USE_APEX:
        [backbone, head], optimizer = amp.initialize([backbone, head], optimizer, opt_level='O2')
        backbone = DDP(backbone)
        head = DDP(head)
    else:
        backbone = torch.nn.parallel.DistributedDataParallel(backbone, device_ids=[cfg['GPU']])
        head = torch.nn.parallel.DistributedDataParallel(head, device_ids=[cfg['GPU']])

     # checkpoint and tensorboard dir
    MODEL_ROOT = cfg['MODEL_ROOT'] # the root to buffer your checkpoints
    LOG_ROOT = cfg['LOG_ROOT'] # the root to log your train/val status

    os.makedirs(MODEL_ROOT, exist_ok=True)
    os.makedirs(LOG_ROOT, exist_ok=True)

    writer = SummaryWriter(LOG_ROOT) # writer for buffering intermediate results
    # train
    for epoch in range(cfg['START_EPOCH'], cfg['NUM_EPOCH']):
        train_sampler.set_epoch(epoch)
        if LR_SCHEDULER != 'cosine':
            scheduler.step()
        #train for one epoch
        DISP_FREQ = 100  # 100 batch
        batch = 0  # batch index
        backbone.train()  # set to training mode
        head.train()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        for inputs, labels in tqdm(iter(train_loader)):
            if LR_SCHEDULER == 'cosine':
                scheduler.step()
            # compute output
            start_time=time.time()
            inputs = inputs.cuda(cfg['GPU'], non_blocking=True)
            labels = labels.cuda(cfg['GPU'], non_blocking=True)

            if cfg['MIXUP']:
                inputs, labels_a, labels_b, lam = mixup_data(inputs, labels, cfg['GPU'], cfg['MIXUP_PROB'], cfg['MIXUP_ALPHA'])
                inputs, labels_a, labels_b = map(Variable, (inputs, labels_a, labels_b))
            elif cfg['CUTMIX']:
                inputs, labels_a, labels_b, lam = cutmix_data(inputs, labels, cfg['GPU'], cfg['CUTMIX_PROB'], cfg['MIXUP_ALPHA'])
                inputs, labels_a, labels_b = map(Variable, (inputs, labels_a, labels_b))
            features = backbone(inputs)
            outputs = head(features, labels)

            if cfg['MIXUP'] or cfg['CUTMIX']:
                lossx = mixup_criterion(loss, outputs, labels_a, labels_b, lam)
            else:
                lossx = loss(outputs, labels) if HEAD_NAME != 'CircleLoss' else loss(outputs).mean()
            end_time = time.time()
            duration = end_time - start_time
            if ((batch + 1) % DISP_FREQ == 0) and batch != 0:
                print("batch inference time", duration)

            # compute gradient and do SGD step
            optimizer.zero_grad()
            if USE_APEX:
                with amp.scale_loss(lossx, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                lossx.backward()
            optimizer.step()

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, labels, topk = (1, 5)) if HEAD_NAME != 'CircleLoss' else accuracy(features.data, labels, topk = (1, 5))
            losses.update(lossx.data.item(), inputs.size(0))
            top1.update(prec1.data.item(), inputs.size(0))
            top5.update(prec5.data.item(), inputs.size(0))
            # display training loss & acc every DISP_FREQ
            if ((batch + 1) % DISP_FREQ == 0) or batch == 0:
                print("=" * 60)
                print('Epoch {}/{} Batch {}/{}\t'
                                'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                                'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                                'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                                    epoch + 1, cfg['NUM_EPOCH'], batch + 1, len(train_loader), loss = losses, top1 = top1, top5 = top5))
                print("=" * 60)

            # perform validation & save checkpoints per epoch
            # validation statistics per epoch (buffer for visualization)
            if (batch + 1) % EVAL_FREQ == 0:
                #lr = scheduler.get_last_lr()
                lr = optimizer.param_groups[0]['lr']
                print("Current lr", lr)
                print("=" * 60)
                print("Perform Evaluation on LFW, CFP_FP, AgeD and VGG2_FP, and Save Checkpoints...")
                accuracy_lfw, best_threshold_lfw, roc_curve_lfw = perform_val(EMBEDDING_SIZE, per_batch_size, backbone, lfw, lfw_issame)
                buffer_val(writer, "LFW", accuracy_lfw, best_threshold_lfw, roc_curve_lfw, epoch + 1)
                accuracy_cfp_fp, best_threshold_cfp_fp, roc_curve_cfp_fp = perform_val(EMBEDDING_SIZE, per_batch_size, backbone, cfp_fp, cfp_fp_issame)
                buffer_val(writer, "CFP_FP", accuracy_cfp_fp, best_threshold_cfp_fp, roc_curve_cfp_fp, epoch + 1)
                accuracy_agedb_30, best_threshold_agedb_30, roc_curve_agedb_30 = perform_val(EMBEDDING_SIZE, per_batch_size, backbone, agedb_30, agedb_30_issame)
                buffer_val(writer, "AgeDB", accuracy_agedb_30, best_threshold_agedb_30, roc_curve_agedb_30, epoch + 1)
                accuracy_vgg2_fp, best_threshold_vgg2_fp, roc_curve_vgg2_fp = perform_val(EMBEDDING_SIZE, per_batch_size, backbone, vgg2_fp, vgg2_fp_issame)
                buffer_val(writer, "VGGFace2_FP", accuracy_vgg2_fp, best_threshold_vgg2_fp, roc_curve_vgg2_fp, epoch + 1)
                print("Epoch {}/{}, Evaluation: LFW Acc: {}, CFP_FP Acc: {}, AgeDB Acc: {}, VGG2_FP Acc: {}".format(epoch + 1, NUM_EPOCH, accuracy_lfw, accuracy_cfp_fp, accuracy_agedb_30, accuracy_vgg2_fp))
                print("=" * 60)

                print("=" * 60)
                print("Save Checkpoint...")
                if cfg['RANK'] % ngpus_per_node == 0:
                    #torch.save(backbone.module.state_dict(), os.path.join(MODEL_ROOT, "Backbone_{}_Epoch_{}_Time_{}_checkpoint.pth".format(BACKBONE_NAME, epoch + 1, get_time())))
                    #save_dict = {'EPOCH': epoch+1,
                    #            'HEAD': head.module.state_dict(),
                    #            'OPTIMIZER': optimizer.state_dict()}
                    #torch.save(save_dict, os.path.join(MODEL_ROOT, "Head_{}_Epoch_{}_Time_{}_checkpoint.pth".format(HEAD_NAME, epoch + 1, get_time())))
                    ori_backbone.load_state_dict(backbone.module.state_dict())
                    ori_backbone.eval()
                    x = torch.randn(1,3,112,112).cuda()
                    traced_cell = torch.jit.trace(ori_backbone, (x))
                    #torch.save(ori_backbone, os.path.join(MODEL_ROOT, "model.pth"))
                    torch.jit.save(traced_cell, os.path.join(MODEL_ROOT, "Epoch_{}_Time_{}_checkpoint.pth".format(epoch + 1, get_time())))
            sys.stdout.flush()
            batch += 1 # batch index
        epoch_loss = losses.avg
        epoch_acc = top1.avg
        print("=" * 60)
        print('Epoch: {}/{}\t''Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                    epoch + 1, cfg['NUM_EPOCH'], loss = losses, top1 = top1, top5 = top5))
        sys.stdout.flush()
        print("=" * 60)
        if cfg['RANK'] % ngpus_per_node == 0:
            writer.add_scalar("Training_Loss", epoch_loss, epoch + 1)
            writer.add_scalar("Training_Accuracy", epoch_acc, epoch + 1)
            writer.add_scalar("Top1", top1.avg, epoch+1)
            writer.add_scalar("Top5", top5.avg, epoch+1)
Example #12
File: train.py Project: LLLskr/IRP
    return train_loader, val_loader


if __name__ == '__main__':

    models_path = os.path.join('./checkpoints', args.backend)  # path where network weights are saved
    os.makedirs(models_path, exist_ok=True)

    train_loader, val_loader = get_dataloader()

    net, starting_epoch = build_network(args.snapshot, args.backend)
    optimizer = optim.Adam(net.parameters(), lr=args.start_lr)  # optimizer
    # optimizer = optim.Adam(net.parameters())
    # optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9,
    #                weight_decay=1e-4)
    scheduler = MultiStepLR(
        optimizer, milestones=[int(x) for x in args.milestones.split(',')])

    seg_criterion = CrossEntropyLoss2d(weight=None)  # loss function
    epoch_losses = []
    net.train()

    weight_save_path = "checkpoints/densenet/PSPNet_last"  # path to saved network weights

    # load saved network weights
    try:
        net.load_state_dict(torch.load(weight_save_path))
        print("Weights loaded successfully")
    except Exception:
        print('Failed to load weights')
    # start training
    Loss_list = []
Example #13
    def __init__(self,
                 model,
                 device,
                 config,
                 dataset,
                 dataloader,
                 optimizer,
                 stamp,
                 val_step=10,
                 detection=True,
                 caption=True,
                 orientation=False,
                 distance=False,
                 use_tf=True,
                 report_ap=False,
                 lr_decay_step=None,
                 lr_decay_rate=None,
                 bn_decay_step=None,
                 bn_decay_rate=None,
                 criterion="meteor"):

        self.epoch = 0  # set in __call__
        self.verbose = 0  # set in __call__

        self.model = model
        self.device = device
        self.config = config
        self.dataset = dataset
        self.dataloader = dataloader
        self.optimizer = optimizer
        self.stamp = stamp
        self.val_step = val_step

        self.detection = detection
        self.caption = caption
        self.orientation = orientation
        self.distance = distance
        self.use_tf = use_tf

        self.report_ap = report_ap

        self.lr_decay_step = lr_decay_step
        self.lr_decay_rate = lr_decay_rate
        self.bn_decay_step = bn_decay_step
        self.bn_decay_rate = bn_decay_rate

        self.criterion = criterion

        self.best = {
            "epoch": 0,
            "bleu-1": -float("inf"),
            "bleu-2": -float("inf"),
            "bleu-3": -float("inf"),
            "bleu-4": -float("inf"),
            "cider": -float("inf"),
            "rouge": -float("inf"),
            "meteor": -float("inf"),
            "sum": -float("inf")
        }

        # AP config
        self.POST_DICT = {
            "remove_empty_box": True,
            "use_3d_nms": True,
            "nms_iou": 0.25,
            "use_old_type_nms": False,
            "cls_nms": True,
            "per_class_proposal": True,
            "conf_thresh": 0.05,
            "dataset_config": self.config
        }

        self.AP_IOU_THRESHOLDS = [0.25, 0.5]
        self.AP_CALCULATOR_LIST = [
            APCalculator(iou_thresh, self.config.class2type)
            for iou_thresh in self.AP_IOU_THRESHOLDS
        ]

        # init log
        # contains all necessary info for all phases
        self.log = {"train": {}, "val": {}}

        # tensorboard
        os.makedirs(os.path.join(CONF.PATH.OUTPUT, stamp, "tensorboard/train"),
                    exist_ok=True)
        os.makedirs(os.path.join(CONF.PATH.OUTPUT, stamp, "tensorboard/val"),
                    exist_ok=True)
        self._log_writer = {
            "train":
            SummaryWriter(
                os.path.join(CONF.PATH.OUTPUT, stamp, "tensorboard/train")),
            "val":
            SummaryWriter(
                os.path.join(CONF.PATH.OUTPUT, stamp, "tensorboard/val"))
        }

        # training log
        log_path = os.path.join(CONF.PATH.OUTPUT, stamp, "log.txt")
        self.log_fout = open(log_path, "a")

        # private
        # only for internal access and temporary results
        self._running_log = {}
        self._global_iter_id = 0
        self._total_iter = {}  # set in __call__

        # templates
        self.__iter_report_template = ITER_REPORT_TEMPLATE
        self.__epoch_report_template = EPOCH_REPORT_TEMPLATE
        self.__best_report_template = BEST_REPORT_TEMPLATE

        # lr scheduler
        if lr_decay_step and lr_decay_rate:
            if isinstance(lr_decay_step, list):
                self.lr_scheduler = MultiStepLR(optimizer, lr_decay_step,
                                                lr_decay_rate)
            else:
                self.lr_scheduler = StepLR(optimizer, lr_decay_step,
                                           lr_decay_rate)
        else:
            self.lr_scheduler = None

        # bn scheduler
        if bn_decay_step and bn_decay_rate:
            it = -1
            start_epoch = 0
            BN_MOMENTUM_INIT = 0.5
            BN_MOMENTUM_MAX = 0.001
            bn_lbmd = lambda it: max(
                BN_MOMENTUM_INIT * bn_decay_rate**
                (int(it / bn_decay_step)), BN_MOMENTUM_MAX)
            self.bn_scheduler = BNMomentumScheduler(model,
                                                    bn_lambda=bn_lbmd,
                                                    last_epoch=start_epoch - 1)
        else:
            self.bn_scheduler = None
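        # Hedged illustration (not in the original code): with illustrative values
        # bn_decay_rate=0.5 and bn_decay_step=20, bn_lbmd decays the BatchNorm
        # momentum geometrically while max() keeps it from falling below BN_MOMENTUM_MAX:
        #   it=0   -> max(0.5 * 0.5**0,  0.001) = 0.5
        #   it=20  -> max(0.5 * 0.5**1,  0.001) = 0.25
        #   it=200 -> max(0.5 * 0.5**10, 0.001) = 0.001  (floor reached)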
Example #14
0
logger.info("Loading data...")
label_loader, unlabel_loader, test_loader = cifar10(
        args.data_path, args.batch_size, args.num_workers, args.num_label, args.aug
        )

# Build model
logger.info("Building models...")
model = ConvLarge().cuda()
classifier = Classifier().cuda()
discriminator = Discriminator().cuda()

# Build optimizer and lr_scheduler
logger.info("Building optimizer and lr_scheduler...")
optimizer = SGD(chain(model.parameters(), classifier.parameters(), discriminator.parameters()),
                lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
lr_scheduler = MultiStepLR(optimizer, gamma=args.lr_decay,
                           milestones=[args.total_steps//2, args.total_steps*3//4])
   
# Build Beta distribution
logger.info("Building Beta distribution...")
beta_distribution = Beta(torch.tensor([args.alpha]), torch.tensor([args.alpha]))
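# Hedged sketch (not part of this script): a Beta(alpha, alpha) draw is the usual
# mixup-style interpolation coefficient; given two batches x1 and x2 it would be
# used roughly as follows:
# lam = beta_distribution.sample().item()
# mixed = lam * x1 + (1.0 - lam) * x2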

# Optionally resume from a checkpoint
if args.resume is not None:
    if os.path.isfile(args.resume):
        logger.info("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_step = checkpoint['step']
        best_acc = checkpoint['best_acc']
        model.load_state_dict(checkpoint['model'])
        classifier.load_state_dict(checkpoint['classifier'])
        discriminator.load_state_dict(checkpoint['discriminator'])
Example #15
0
def main():
    global args, best_f1_score
    global logger

    # do transfer learning
    model = model_presets[args.arch][0](**model_presets[args.arch][1])

    # model.cuda()
    model = torch.nn.DataParallel(model).cuda()

    if args.optimizer.startswith('adam'):
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            # Only finetunable params
            lr=args.lr)
    elif args.optimizer.startswith('rmsprop'):
        optimizer = torch.optim.RMSprop(
            filter(lambda p: p.requires_grad, model.parameters()),
            # Only finetunable params
            lr=args.lr)
    elif args.optimizer.startswith('sgd'):
        optimizer = torch.optim.SGD(
            filter(lambda p: p.requires_grad, model.parameters()),
            # Only finetunable params
            lr=args.lr)
    else:
        raise ValueError('Optimizer not supported')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_f1_score = checkpoint['best_f1_score']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # freeze the encoder
    print('Trainable param groups BEFORE freeze {}'.format(
        len(list(filter(lambda p: p.requires_grad,
                        model.module.parameters())))))
    model.module.freeze()
    print('Encoder frozen!')
    print('Trainable param groups AFTER freeze   {}'.format(
        len(list(filter(lambda p: p.requires_grad,
                        model.module.parameters())))))

    if args.predict:
        pass

    elif args.evaluate:

        val_augs = ValAugs(mean=model.module.mean, std=model.module.std)
        val_dataset = MapDataset(transforms=val_augs,
                                 mode='val',
                                 target_resl=(args.img_size, args.img_size),
                                 do_energy_levels=args.do_energy_levels,
                                 do_boundaries=args.do_boundaries)
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=args.batch_size *
                                                 2,
                                                 shuffle=True,
                                                 num_workers=args.workers,
                                                 pin_memory=True,
                                                 drop_last=False)

        criterion = SemsegLoss(use_running_mean=args.do_running_mean,
                               bce_weight=args.bce_weight,
                               dice_weight=args.dice_weight).cuda()

        hard_dice = HardDice(threshold=args.ths)

        val_loss, val_bce_loss, val_dice_loss, val_hard_dice, val_ap, val_ar = validate(
            val_loader, model, criterion, hard_dice)
    else:
        if args.do_augs:
            train_augs = TrainAugs(prob=args.aug_prob,
                                   mean=model.module.mean,
                                   std=model.module.std)
        elif args.do_more_augs:
            train_augs = TrainAugsIaa(prob=args.aug_prob,
                                      mean=model.module.mean,
                                      std=model.module.std)
        else:
            train_augs = ValAugs(mean=model.module.mean, std=model.module.std)

        val_augs = ValAugs(mean=model.module.mean, std=model.module.std)

        train_dataset = MapDataset(transforms=train_augs,
                                   mode='train',
                                   target_resl=(args.img_size, args.img_size),
                                   do_energy_levels=args.do_energy_levels,
                                   do_boundaries=args.do_boundaries)

        val_dataset = MapDataset(transforms=val_augs,
                                 mode='val',
                                 target_resl=(args.img_size, args.img_size),
                                 do_energy_levels=args.do_energy_levels,
                                 do_boundaries=args.do_boundaries)

        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True,
                                                   drop_last=False)

        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=args.batch_size *
                                                 2,
                                                 shuffle=True,
                                                 num_workers=args.workers,
                                                 pin_memory=True,
                                                 drop_last=False)

        criterion = SemsegLoss(use_running_mean=args.do_running_mean,
                               bce_weight=args.bce_weight,
                               dice_weight=args.dice_weight).cuda()

        hard_dice = HardDice(threshold=args.ths)

        scheduler = MultiStepLR(optimizer,
                                milestones=[args.m1, args.m2],
                                gamma=0.1)

        for epoch in range(args.start_epoch, args.epochs):
            if epoch == args.m0:
                print('Trainable param groups BEFORE UNfreeze {}'.format(
                    len(
                        list(
                            filter(lambda p: p.requires_grad,
                                   model.module.parameters())))))
                model.module.unfreeze()
                print('Encoder unfrozen!')
                print('Trainable param groups AFTER UNfreeze {}'.format(
                    len(
                        list(
                            filter(lambda p: p.requires_grad,
                                   model.module.parameters())))))

                if args.optimizer.startswith('adam'):
                    optimizer = torch.optim.Adam(
                        filter(lambda p: p.requires_grad, model.parameters()),
                        # Only finetunable params
                        lr=args.lr)
                elif args.optimizer.startswith('rmsprop'):
                    optimizer = torch.optim.RMSprop(
                        filter(lambda p: p.requires_grad, model.parameters()),
                        # Only finetunable params
                        lr=args.lr)
                elif args.optimizer.startswith('sgd'):
                    optimizer = torch.optim.SGD(
                        filter(lambda p: p.requires_grad, model.parameters()),
                        # Only finetunable params
                        lr=args.lr)
                else:
                    raise ValueError('Optimizer not supported')

                # we are assuming that m0 <= m1
                scheduler = MultiStepLR(
                    optimizer,
                    milestones=[args.m1 - args.m0, args.m2 - args.m0],
                    gamma=0.1)

            # adjust_learning_rate(optimizer, epoch)

            # train for one epoch
            train_loss, train_bce_loss, train_dice_loss, train_hard_dice = train(
                train_loader, model, criterion, hard_dice, optimizer, epoch)

            # evaluate on validation set
            val_loss, val_bce_loss, val_dice_loss, val_hard_dice, val_ap, val_ar = validate(
                val_loader, model, criterion, hard_dice)

            val_f1 = 2 / (1 / val_ap + 1 / val_ar)

            scheduler.step()

            #============ TensorBoard logging ============#
            # Log the scalar values
            if args.tensorboard:
                info = {
                    'eph_tr_loss': train_loss,
                    'eph_tr_bce_loss': train_bce_loss,
                    'eph_tr_dice_loss': train_dice_loss,
                    'eph_tr_hard_dice': train_hard_dice,
                    'eph_val_loss': val_loss,
                    'eph_val_bce_loss': val_bce_loss,
                    'eph_val_dice_loss': val_dice_loss,
                    'eph_val_hard_dice': val_hard_dice,
                    'eph_val_f1_score': val_f1,
                    'eph_val_ap': val_ap,
                    'eph_val_ar': val_ar,
                }
                for tag, value in info.items():
                    logger.scalar_summary(tag, value, epoch + 1)

            # remember best prec@1 and save checkpoint
            is_best = val_f1 > best_f1_score
            best_f1_score = max(val_f1, best_f1_score)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'optimizer': optimizer.state_dict(),
                    'state_dict': model.state_dict(),
                    'best_f1_score': best_f1_score,
                }, is_best,
                'weights/{}_checkpoint.pth.tar'.format(str(args.lognumber)),
                'weights/{}_best.pth.tar'.format(str(args.lognumber)))
Example #16
0
def main():
	parser = argparse.ArgumentParser(
		description='FCOS Detector Training With Pytorch')

	parser.add_argument(
		'--dataset-style', type=str, required=True,
		help="style of dataset (supported are 'pascal-voc' and 'coco')")
	parser.add_argument('--dataset', required=True, help='dataset path')
	parser.add_argument(
		'--train-image-set', type=str, default="train",
		help='image set (annotation file basename for COCO) '
		'to use for training')
	parser.add_argument(
		'--val-image-set', type=str, default="val",
		help='image set (annotation file basename for COCO) '
		'to use for validation')
	parser.add_argument(
		'--val-dataset', default=None,
		help='separate validation dataset directory path')

	parser.add_argument(
		'--net-config',
		help="path to network architecture configuration file "
		"(take a look into 'preset' directory for the reference)")

	# Params for optimizer
	parser.add_argument(
		'--optimizer', default="ranger",
		help="optimizer to use ('sgd', 'diffgrad', 'adamw', or 'ranger')")
	parser.add_argument(
		'--lr', '--learning-rate', default=1e-3, type=float,
		help='initial learning rate')
	parser.add_argument(
		'--momentum', default=0.9, type=float,
		help='optional momentum for SGD optimizer (default is 0.9)')
	parser.add_argument(
		'--weight-decay', default=5e-4, type=float,
		help='optional weight decay (L2 penalty) '
		'for SGD optimizer (default is 5e-4)')

	parser.add_argument('--backbone-pretrained', action='store_true')
	parser.add_argument(
		'--backbone-weights',
		help='pretrained weights for the backbone model')
	parser.add_argument('--freeze-backbone', action='store_true')

	# Scheduler
	parser.add_argument(
		'--scheduler', default="cosine-wr", type=str,
		help="scheduler for SGD. It can one of 'multi-step' and 'cosine-wr'")

	# Params for Scheduler
	parser.add_argument(
		'--milestones', default="70,100", type=str,
		help="milestones for MultiStepLR")
	parser.add_argument(
		'--t0', default=10, type=int,
		help='T_0 value for Cosine Annealing Warm Restarts.')
	parser.add_argument(
		'--t-mult', default=2, type=float,
		help='T_mult value for Cosine Annealing Warm Restarts.')

	# Train params
	parser.add_argument('--batch-size', default=32, type=int, help='batch size')
	parser.add_argument(
		'--num-epochs', default=120, type=int, help='number of epochs to train')
	parser.add_argument(
		'--num-workers', default=4, type=int,
		help='number of workers used in dataloading')
	parser.add_argument(
		'--val-epochs', default=5, type=int,
		help='perform validation every this many epochs')
	parser.add_argument(
		'--device', type=str,
		help='device to use for training')

	parser.add_argument(
		'--checkpoint-path', default='output',
		help='directory for saving checkpoint models')


	logging.basicConfig(
		stream=sys.stdout, level=logging.INFO,
		format='%(asctime)s - %(levelname)s - %(message)s')

	args = parser.parse_args()
	logging.info(args)

	if args.device is None:
		device = "cuda" if torch.cuda.is_available() else "cpu"
	else:
		device = args.device

	if device.startswith("cuda"):
		logging.info("Use CUDA")

	timer = Timer()

	arch = get_arch(args.net_config)

	bbox_format = dataset_bbox_format(args.dataset_style)

	train_mean, train_std = mean_std(
		args.dataset_style,
		args.dataset,
		args.train_image_set)

	train_transform = processing.train.Pipeline(
		[arch.image_size] * 2,
		train_mean, train_std,
		bbox_format=bbox_format)

	if args.val_dataset is not None:
		val_dataset_root = args.val_dataset
	else:
		val_dataset_root = args.dataset

	val_mean, val_std = mean_std(
		args.dataset_style,
		val_dataset_root,
		args.val_image_set)

	val_transform = processing.test.Pipeline(
		[arch.image_size] * 2,
		val_mean, val_std,
		bbox_format=bbox_format)

	logging.info("Loading datasets...")

	dataset = load_dataset(
			args.dataset_style,
			args.dataset,
			args.train_image_set,
			train_transform)

	num_classes = len(dataset.class_names)

	logging.info("Train dataset size: {}".format(len(dataset)))

	# drop the last batch if it is incomplete, so the BatchNorm layers
	# never see a batch of length 1 and crash on it
	drop_last = len(dataset) % args.batch_size > 0

	train_loader = DataLoader(
		dataset, args.batch_size, collate_fn=collate,
		num_workers=args.num_workers,
		shuffle=True, drop_last=drop_last)

	val_dataset = load_dataset(
			args.dataset_style,
			val_dataset_root,
			args.val_image_set,
			val_transform)

	logging.info("Validation dataset size: {}".format(len(val_dataset)))

	val_loader = DataLoader(
		val_dataset, args.batch_size, collate_fn=collate,
		num_workers=args.num_workers,
		shuffle=False, drop_last=drop_last)

	logging.info("Building network")
	backbone_pretrained = args.backbone_pretrained  # store_true flag, already a bool
	net = arch.build(num_classes, backbone_pretrained, args.batch_size)

	if backbone_pretrained and args.backbone_weights is not None:
		logging.info(f"Load backbone weights from {args.backbone_weights}")
		timer.start("Loading backbone model")
		net.load_backbone_weights(args.backbone_weights)
		logging.info(f'Took {timer.end("Loading backbone model"):.2f}s.')

	if args.freeze_backbone:
		net.freeze_backbone()

	net.to(device)

	last_epoch = -1

	criterion = arch.loss(net, device)
	mapper = arch.mapper(net, device)

	optim_kwargs = {
		"lr": args.lr,
		"weight_decay": args.weight_decay
	}

	if args.optimizer == "sgd":
		optim_class = torch.optim.SGD
		optim_kwargs.update({
			"momentum": args.momentum
		})
	elif args.optimizer == "adamw":
		optim_class = torch.optim.AdamW
	elif args.optimizer == "diffgrad":
		optim_class = DiffGrad
	else:
		optim_class = Ranger

	optimizer = optim_class(net.parameters(), **optim_kwargs)
	logging.info(f"Optimizer parameters used: {optim_kwargs}")

	if args.scheduler == 'multi-step':
		logging.info("Uses MultiStepLR scheduler.")
		milestones = [int(v.strip()) for v in args.milestones.split(",")]
		scheduler = MultiStepLR(
			optimizer, milestones=milestones, gamma=0.1, last_epoch=last_epoch)
	else:
		logging.info("Uses Cosine annealing warm restarts scheduler.")
		scheduler = CosineAnnealingWarmRestarts(
			optimizer, T_0=args.t0, T_mult=args.t_mult, eta_min=1e-5)

	os.makedirs(args.checkpoint_path, exist_ok=True)

	logging.info(f"Start training from epoch {last_epoch + 1}.")
	for epoch in range(last_epoch + 1, args.num_epochs):
		loop(
			train_loader, net, mapper, criterion,
			optimizer, device=device, epoch=epoch)
		scheduler.step()

		if (epoch > 0 and epoch % args.val_epochs == 0 or
				epoch == args.num_epochs - 1):
			val_loss = loop(
				val_loader, net, mapper, criterion,
				device=device, epoch=epoch)

			filename = f"{arch.name}-Epoch-{epoch}-Loss-{val_loss}.pth"
			model_path = os.path.join(args.checkpoint_path, filename)
			save(arch, net, dataset.class_names, model_path)
			logging.info(f"Saved model {model_path}")
Example #17
0
if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/ckpt.t7')
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(),
                      lr=args.lr,
                      momentum=0.9,
                      weight_decay=5e-4)
scheduler = MultiStepLR(optimizer, milestones=[150, 250], gamma=0.1)


# Training
def train(epoch):
    scheduler.step()
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
Example #18
0
    n_k = 200
    net_input_kernel = get_noise(n_k, INPUT, (1, 1)).type(dtype)
    net_input_kernel.squeeze_()

    net_kernel = Predictor(1, 64, opt.kernel_size[0]*opt.kernel_size[1])

    net_kernel = net_kernel.type(dtype)

    # Losses
    mse = torch.nn.MSELoss().type(dtype)
    ssim = SSIM().type(dtype)

    # optimizer
    optimizer = torch.optim.Adam([{'params':net.parameters()},{'params':net_kernel.parameters(),'lr':1e-4}], lr=LR)
    scheduler = MultiStepLR(optimizer, milestones=[2000, 3000, 4000], gamma=0.5)  # learning rates

    #
    net_input_saved = net_input.detach().clone()
    net_input_kernel_saved = net_input_kernel.detach().clone()

    ### start SelfDeblur
    for step in tqdm(range(num_iter)):

        # input regularization
        net_input = net_input_saved + reg_noise_std*torch.zeros(net_input_saved.shape).type_as(net_input_saved.data).normal_()
        # net_input_kernel = net_input_kernel_saved + reg_noise_std*torch.zeros(net_input_kernel_saved.shape).type_as(net_input_kernel_saved.data).normal_()

        # change the learning rate
        scheduler.step(step)
        optimizer.zero_grad()
Example #19
0
def main(cfg):
	tensorboard_dir = os.path.join(cfg.SAVE_DIR, "tb_event")
	if not os.path.exists(cfg.SAVE_DIR):
		os.makedirs(cfg.SAVE_DIR)
	else:
		print("This directory has already existed, Please remember to modify your configs")
		if not click.confirm(
			"\033[1;31;40mContinue and override the former directory?\033[0m",
			default=False,
			):
			exit(0)
		if tensorboard_dir is not None and os.path.exists(tensorboard_dir):
			shutil.rmtree(tensorboard_dir)
	print("=> output model will be saved in {}".format(cfg.SAVE_DIR))
	tb_writer = SummaryWriter(tensorboard_dir)

	model = NTS.attention_net(cfg, CAT_NUM=cfg.NET.CAT_NUM, topN=cfg.NET.PROPOSAL_NUM)
	print(model)

	# special for NTS
	raw_parameters = list(model.pretrained_model.parameters())
	part_parameters = list(model.proposal_net.parameters())
	concat_parameters = list(model.concat_net.parameters())
	partcls_parameters = list(model.partcls_net.parameters())
	
	raw_optimizer = torch.optim.SGD(raw_parameters, lr=cfg.TRAIN.LR, momentum=0.9, weight_decay=cfg.TRAIN.WEIGHT_DECAY)
	concat_optimizer = torch.optim.SGD(concat_parameters, lr=cfg.TRAIN.LR, momentum=0.9, weight_decay=cfg.TRAIN.WEIGHT_DECAY)
	part_optimizer = torch.optim.SGD(part_parameters, lr=cfg.TRAIN.LR, momentum=0.9, weight_decay=cfg.TRAIN.WEIGHT_DECAY)
	partcls_optimizer = torch.optim.SGD(partcls_parameters, lr=cfg.TRAIN.LR, momentum=0.9, weight_decay=cfg.TRAIN.WEIGHT_DECAY)

	param_num = sum([p.data.nelement() for p in model.parameters()])
	print("Number of model parameters: {} M".format(param_num / 1024 / 1024))

	model = model.cuda()
	model = DataParallel(model)
	model.train()

	train_data_loader, dev_data_loader = get_iwildcam_loader(params, mode=params['mode'])  # dataloaders for train/eval

	if params['lr_schedule'] == "Step":  # True
		# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=params['lr_decay_epochs'], gamma=0.2)
		schedulers = [MultiStepLR(raw_optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1),
                    MultiStepLR(concat_optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1),
                    MultiStepLR(part_optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1),
                    MultiStepLR(partcls_optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1)]
	elif params['lr_schedule'] == "Cosine":
		schedulers = [CosineAnnealingLR(raw_optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06),
                    CosineAnnealingLR(concat_optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06),
                    CosineAnnealingLR(part_optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06),
                    CosineAnnealingLR(partcls_optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06)
		]
	
	best_acc, best_f1, best_iter, best_epoch, start_epoch = 0, 0, 0, 0, 1
	dev_log = []  # evaluation history: [epoch, iter, acc, f1]
	# ------ Begin Resume -------
	if cfg.RESUME:
		load_ckpt(cfg.SAVE_DIR) # read history parameters from json
		ckpt = torch.load(cfg.INIT_MODEL, map_location="cuda") # already specify in load_params()
		print('=> Load checkpoint from ', cfg.INIT_MODEL)
		model.module.load_state_dict(ckpt['state_dict'])  # checkpoint stores the unwrapped module
		raw_optimizer.load_state_dict(ckpt['raw_optimizer'])
		part_optimizer.load_state_dict(ckpt['part_optimizer'])
		concat_optimizer.load_state_dict(ckpt['concat_optimizer'])
		partcls_optimizer.load_state_dict(ckpt['partcls_optimizer'])
		# optimizer.load_state_dict(ckpt['optimizer'])
		for sched, sched_state in zip(schedulers, ckpt['schedulers']):
			sched.load_state_dict(sched_state)
		start_epoch = ckpt['epoch'] + 1
		# best_acc = ckpt['best_acc']
		best_f1 = ckpt['best_f1']
		best_epoch = ckpt['best_epoch']

	if cfg.LOSS.LOSS_TYPE == 'CE':
		criterion = cross_entropy(func_type='softmax').to(device)
		if cfg.LOSS.WEIGHT_PER_CLS:
			CE = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(cfg.LOSS.WEIGHT_PER_CLS).float().to(device))
		label_type = 'float'
	elif cfg.LOSS.LOSS_TYPE == 'Sigmoid_CE':
		criterion = cross_entropy(func_type='sigmoid').to(device)
		label_type = 'float'		
	elif cfg.LOSS.LOSS_TYPE == 'Focal':
		criterion = focal_loss(gamma=1.0, alpha=1.0).to(device)
		label_type = 'long'
	elif cfg.LOSS.LOSS_TYPE == 'CB_loss': # FIXME: this is unsure implementation, low score
		criterion = cb_loss(cfg.LOSS.SAMPLES_PER_CLS, cfg.NUM_CLASSES, 'softmax').to(device)
		label_type = 'float'
	else:
		raise NotImplementedError("Not accessible loss type for: {}".format(cfg.LOSS.LOSS_TYPE))

	t0 = time()
	t1 = time()
	it = 0
	print('[INFO] Begin training')
	use_onehot = cfg.LOSS.LOSS_TYPE != 'Focal'
	for epoch in range(start_epoch, cfg.TRAIN.EPOCHS + 1):
		print('=> Current Lr {:.5e}'.format(raw_optimizer.param_groups[0]['lr']))
		if cfg.TRAIN.LR_SCHEDULE:
			for sched in schedulers:
				sched.step()

		train_loader = data_prefetcher(train_data_loader, label_type)
		inputs, labels, ids = train_loader.next()  # ids are not used
		i = 0
		batch_time = AverageMeter('Time', ':6.3f')
		data_time = AverageMeter('Data', ':6.3f')
		losses = AverageMeter('Loss', ':.4e')
		train_acc = AverageMeter('Acc', ':6.2f')
		train_f1 = AverageMeter('F1', ':6.2f')
		progress = ProgressMeter(
			len(train_data_loader),
			[batch_time, data_time, losses, train_acc, train_f1],
			prefix="Epoch: [{}]".format(epoch))

		while inputs is not None:
			bs = inputs.size(0)
			# mixup_now = np.random.random() < cfg.AUG.AUG_PROBA  # apply mixup with probability AUG_PROBA (0.5)
			# if cfg.AUG.MIXUP and mixup_now:
			# 	inputs, labels_a, labels_b, lam = mixup_data(inputs, labels, cfg.AUG.MIXUP_ALPHA)
			mixup_now = False  # the mixup branch below is not implemented yet (see TODO)

			# optimizer.zero_grad()
			raw_optimizer.zero_grad()
			part_optimizer.zero_grad()
			concat_optimizer.zero_grad()
			partcls_optimizer.zero_grad()

			raw_logits, concat_logits, part_logits, _, top_n_prob = model(inputs)
			if cfg.AUG.MIXUP and mixup_now:
				# TODO: to implement NTS with mixup
				# loss = mixup_criterion(criterion, output, labels_a, labels_b, lam)  # mixed images must use the mixup targets for the loss
				pass
			else:
				part_loss = NTS.list_loss(
					part_logits.view(bs * cfg.NET.PROPOSAL_NUM, -1),
					labels.max(axis=1)[1].unsqueeze(1).repeat(1, cfg.NET.PROPOSAL_NUM).view(-1)).view(bs, cfg.NET.PROPOSAL_NUM)
				raw_loss = criterion(raw_logits, labels)
				concat_loss = criterion(concat_logits, labels)
				rank_loss = NTS.ranking_loss(top_n_prob, part_loss, proposal_num=cfg.NET.PROPOSAL_NUM)

				CE = torch.nn.CrossEntropyLoss()
				partcls_loss = CE(
					part_logits.view(bs * cfg.NET.PROPOSAL_NUM, -1),
					labels.max(axis=1)[1].unsqueeze(1).repeat(1, cfg.NET.PROPOSAL_NUM).view(-1))
					# part_logits, (256,6,209) => (1536,209)
					# labels: (1536,)
				total_loss = raw_loss + rank_loss + concat_loss + partcls_loss

			total_loss.backward()

			raw_optimizer.step()
			part_optimizer.step()
			concat_optimizer.step()
			partcls_optimizer.step()

			if i % cfg.PRINT_STEP == 0:
				preds = np.argmax(concat_logits.cpu().detach().numpy(), axis=1) # argmax on logits
				if use_onehot:
					targets = np.argmax(labels.cpu().detach().numpy(), axis=1)
				else:
					targets = labels.cpu().detach().numpy()
				acc = metrics.accuracy_score(targets, preds)
				loss = concat_loss
				loss_val = loss.item()
				f1 = metrics.f1_score(targets,preds,average='macro')
				# train_log.append([epoch,i, loss_val, acc, f1])
				# print("epoch: %d, iter: %d, train_loss: %.4f, train_acc: %.4f, train_f1: %.4f, lr_rate: %.1e, time_cost_per_iter: %.4f s" % ( \
				# 	epoch, i, loss_val, acc, f1, (raw_optimizer.param_groups[0]['lr']), (time() - t1)/params['print_step']))
				tb_writer.add_scalar('train_loss', loss_val, it)
				# with open(params['log_dir'] + 'train.tsv', 'a') as f:
				# 	f.write('%05d\t%05d\t%f\t%f\t%f\n' % (epoch, i, loss_val, acc, f1))
				t1 = time()

			if (i+1) % params['eval_step'] == 0: # 95
				t2=time()
				model.eval()
				data_loader = data_prefetcher(dev_data_loader,label_type)
				loss_val, acc, f1 = evaluate(model, data_loader, criterion, use_onehot)
				model.train()
				dev_log.append([epoch, i, acc, f1])

				if f1 > best_f1:
					best_acc, best_f1, best_iter, best_epoch = acc, f1, i, epoch
				print('[Evaluation] -------------------------------')
				print("epoch: %d, test acc: %.4f, f1-score: %.4f, best-f1-score: %.4f, eval_time: %.4f s" % (
					epoch, acc, f1, best_f1,time()-t2))
				print('[Evaluation] -------------------------------')
				tb_writer.add_scalar('val_metrics/val_acc', acc, it)
				tb_writer.add_scalar('val_metrics/val_f1-score', f1, it)
				tb_writer.add_scalar('val_metrics/val_loss', loss_val, it)
				with open(params['log_dir'] + 'eval.tsv', 'a') as f:
					f.write('%05d\t%05d\t%f\t%f\n' % (epoch, i, acc, f1))
				
				save_model_path= os.path.join(params['save_dir'], 'model_%d_%d.pkl' % (epoch, i))
				# torch.save(model, save_model_path) # FIXME: this is bad for multi-gpu, use below instead
				torch.save({
					'epoch': epoch,
					'best_f1': best_f1,
					'best_epoch': best_epoch,
					'state_dict': model.module.state_dict(),
					'schedulers': [s.state_dict() for s in schedulers],
					'raw_optimizer': raw_optimizer.state_dict(),
					'part_optimizer': part_optimizer.state_dict(),
					'concat_optimizer': concat_optimizer.state_dict(),
					'partcls_optimizer': partcls_optimizer.state_dict(),
					}, save_model_path)
				print('[INFO]save model to', save_model_path)

			inputs, labels, ids = train_loader.next()
			i += 1
			it += 1

	print("[INFO]Train is over, Time cost: %.1f hours..." % ((time()-t0) / 3600))
	# copy best_f1 model to model_best.pkl
	source = 'model_%d_%d.pkl' % (best_epoch, best_iter)
	source_path = os.path.join(params['save_dir'], source)
	target = 'model_best.pkl'
	target_path = os.path.join(params['save_dir'], target)
	try:
		shutil.copy(source_path, target_path)
		print("Save best model to {}: [epoch-iter: {:d}-{:d}/ f1-score: {:.4f}]".format(target_path, best_epoch, best_iter, best_f1))
	except IOError as e:
		print("Unable to copy file. %s" % e)
	except:
		print("Unexpected error:", sys.exc_info())

	# ---- Delete Useless ckpt
	ckpts = sorted(name for name in os.listdir(params['save_dir']) if name.startswith('model'))
	ckpts = ckpts[:-1]
	print("=> Start to clean checkpoint from {} to {}".format(ckpts[0], ckpts[-1]))
	for name in ckpts:
		os.remove(os.path.join(params['save_dir'], name))
Example #20
0
        return len(self.imgs)

if __name__ == "__main__":
    model = 
    dataset = SelfDrivingDataset()
    dataset_size = len(dataset)
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.8* dataset_size), int(0.2*dataset_size)])
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True
    )
    lr = 1e-4
    weight_decay = 1e-5
    lr_scheduler = MultiStepLR(optimizer, milestones=[30, 50], gamma=0.1)
    optimizer = optim.Adam(model.parameters(),
                       lr=lr,
                       weight_decay=weight_decay)
    criterion = nn.MSELoss()
    train_loss = 0.
    for epoch in range(epochs):
        for i, imgs, labels in enumerate(train_loader):
            imgs, labels = imgs.to(device), labels.to(device)
            optimizer.zero_grad()
            output = model(imgs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.data.item()
        
Example #21
0
if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/ckpt.t7')
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(),
                      lr=args.lr,
                      momentum=0.9,
                      weight_decay=5e-4)
scheduler = MultiStepLR(optimizer, milestones=[20, 40], gamma=0.1)


# Training
def train(epoch):
    scheduler.step()
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
Example #22
0
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc


if __name__ == '__main__':
    # model = MobileNetV2(n_class=10).to(device)
    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.1,
                                momentum=0.9,
                                weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer, milestones=[50, 75], gamma=0.1)
    criterion = torch.nn.CrossEntropyLoss()

    for i in range(100):
        trainer(model, optimizer, criterion, i)
    pre_best_acc = evaluator(model)

    dummy_input = torch.rand(10, 3, 32, 32).to(device)
    pre_flops, pre_params, _ = count_flops_params(model, dummy_input)

    config_list = [{
        'op_types': ['Conv2d'],
        'total_sparsity': 0.5,
        'max_sparsity_per_layer': 0.8
    }]
Example #23
0
                             center_variance=0.1,
                             size_variance=0.2,
                             device=DEVICE)
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    logging.info(
        f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, " +
        f"Extra Layers learning rate: {extra_layers_lr}.")

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        scheduler = MultiStepLR(optimizer,
                                milestones=milestones,
                                gamma=0.1,
                                last_epoch=last_epoch)
    elif args.scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer,
                                      args.t_max,
                                      last_epoch=last_epoch)
    else:
        logging.fatal(f"Unsupported Scheduler: {args.scheduler}.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info("Initialize summarywriter")
    writer = SummaryWriter()
    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, args.num_epochs):
Example #24
0
    modelClass = Models.__dict__[args.model]
    model = modelClass(args)

    # Load preTrained weights.
    logging.info('==> Resuming from checkpoint..')
    model.loadPreTrained(args.pre_trained, 'cpu')
    model = model.cuda()

    criterion = corrLoss(args).cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.decay,
                                nesterov=True)
    scheduler = MultiStepLR(optimizer,
                            milestones=args.schedule,
                            gamma=args.gamma)

    # log command line
    logging.info('CommandLine: {} PID: {} '
                 'Hostname: {} CUDA_VISIBLE_DEVICES {}'.format(
                     argv, getpid(), gethostname(),
                     environ.get('CUDA_VISIBLE_DEVICES')))

    # # Weights quantization
    # if args.weightBitwidth < 32 and not args.fold:
    #     model_path = './qmodels'
    #     if not os.path.exists(model_path):
    #         os.makedirs(model_path)
    #     model_path = os.path.join(model_path, args.model + ('_kmeans%dbit.pt' % args.weightBitwidth))
    #     if not os.path.exists(model_path):
Example #25
0
        # model = torch.load(os.path.join(save_dir, 'model_%03d.pth' % initial_epoch))

    model.train()
    # criterion = nn.MSELoss(reduction = 'sum')  # PyTorch 0.4.1
    # criterion = sum_squared_error()
    criterion = nn.L1Loss()
    chk = nn.MSELoss()

    if cuda:
        model = model.cuda()
        # device_ids = [0]
        # model = nn.DataParallel(model, device_ids=device_ids).cuda()
        # criterion = criterion.cuda()

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = MultiStepLR(optimizer, milestones=[150, 200, 2100],
                            gamma=0.2)  # learning rates
    for epoch in range(initial_epoch, n_epoch):

        scheduler.step(epoch)  # step to the learning rate in this epoch
        # xs = dg.datagenerator(data_dir=args.train_data)
        # xs = xs.astype('float32') / 255.0
        # xs = torch.from_numpy(xs.transpose((0, 3, 1, 2)))  # tensor of the clean patches, NXCXHXW
        #
        # DDataset = DenoisingDataset(xs, sigma)
        # batch_y, batch_x = DDataset[:238336]

        # fig = plt.figure()
        # gs = GridSpec(nrows=1, ncols=2)
        #
        # plot1 = fig.add_subplot(gs[0, 0])
        # plot2 = fig.add_subplot(gs[0, 1])
Example #26
0
    BATCH_SIZE = 16

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu:0')

    dataset = CustomCIFAR10(train=True)
    data_loader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=2, shuffle=True)
    # 128 of minibatch size was used for the paper.

    pnet = PlainNetwork(3).to(device)
    resnet = ResidualNetwork(3).to(device)

    criterion = nn.CrossEntropyLoss()

    optim_pnet = torch.optim.SGD(pnet.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
    optim_resnet = torch.optim.SGD(resnet.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
    scheduler_pnet = MultiStepLR(optim_pnet, milestones=[32000, 48000], gamma=0.1)
    scheduler_resnet = MultiStepLR(optim_resnet, milestones=[32000, 48000], gamma=0.1)

    iter_total = 0
    while iter_total < 640000:
        for input, label in data_loader:
            iter_total += 1
            input, label = input.to(device), label.to(device)

            # forward/backward for the plain network; the residual network would be
            # trained analogously with optim_resnet / scheduler_resnet
            output = pnet(input)

            loss = criterion(output, label)
            optim_pnet.zero_grad()
            loss.backward()
            optim_pnet.step()
            scheduler_pnet.step()
Example #27
0
        'params': model.gp_layer.hyperparameters(),
        'lr': lr * 0.01
    },
    {
        'params': model.gp_layer.variational_parameters()
    },
    {
        'params': likelihood.parameters()
    },
],
                lr=lr,
                momentum=0.9,
                nesterov=True,
                weight_decay=0)
scheduler = MultiStepLR(optimizer,
                        milestones=[int(0.5 * n_epochs), int(0.75 * n_epochs)],
                        gamma=0.1)


def validation(data_loader, dataset_type='Validation'):
    model.eval()
    likelihood.eval()

    correct = 0
    for data, target in data_loader:
        data, target = data.cuda(), target.cuda()
        with torch.no_grad():
            output = likelihood(model(data))
            pred = output.probs.argmax(1)
            correct += float(pred.eq(target.view_as(pred)).cpu().sum())
    accuracy = round(100. * correct / float(len(data_loader.dataset)), 3)
Example #28
0
def main(exp, frame_sizes, dataset, **params):
    scheduler = True
    use_cuda = torch.cuda.is_available()
    print('Start Sample-RNN')
    params = dict(
        default_params,
        exp=exp, frame_sizes=frame_sizes, dataset=dataset,
        **params
    )
    seed = params.get('seed')
    init_random_seed(seed, use_cuda)

    results_path = setup_results_dir(params)
    tee_stdout(os.path.join(results_path, 'log'))

    spk_dim = len([i for i in os.listdir(os.path.join(params['datasets_path'], params['dataset']))
                   if os.path.islink(os.path.join(params['datasets_path'], params['dataset']) + '/' + i)])

    print('Create model')
    model = SampleRNN(
        frame_sizes=params['frame_sizes'],
        n_rnn=params['n_rnn'],
        dim=params['dim'],
        learn_h0=params['learn_h0'],
        q_levels=params['q_levels'],
        ulaw=params['ulaw'],
        weight_norm=params['weight_norm'],
        cond_dim=params['cond_dim']*(1+params['look_ahead']),
        spk_dim=spk_dim,
        qrnn=params['qrnn']
    )
    if use_cuda:
        model = model.cuda()
        predictor = Predictor(model).cuda()
    else:
        predictor = Predictor(model)

    print('Done!')
    f_name = params['model']
    if f_name is not None:
        print('pre train with', f_name)
        model_data = load_model(f_name)
        if model_data is None:
            sys.exit('ERROR: Model not found in' + str(f_name))
        (state_dict, epoch_index, iteration) = model_data
        print('OK: Read model', f_name, '(epoch:', epoch_index, ')')
        print(state_dict)
        predictor.load_state_dict(state_dict)
    print('predictor', predictor)
    for name, param in predictor.named_parameters():
        print(name, param.size())

    optimizer = torch.optim.Adam(predictor.parameters(), lr=params['learning_rate'])
    if params['scheduler']:
        scheduler = MultiStepLR(optimizer, milestones=[15, 35], gamma=0.1)
    optimizer = gradient_clipping(optimizer)
    print('Saving results in path', results_path)
    print('Read data')
    data_loader = make_data_loader(model.lookback, params)
    print('Done!')
    data_model = data_loader('train')

    show_dataset = False
    if show_dataset:
        for i, full in enumerate(data_model):
            print('Data Loader---------------------------------------')
            print('batch', i)
            (data, reset, target, cond) = full           
            print('Data', data.size())
            print('Target', target.size())

    if not params['scheduler']:    
        scheduler = None
    if use_cuda:
        cuda = True
    else:
        cuda = False
    writer = SummaryWriter(log_dir='sample_board')
    trainer = Trainer(
        predictor, sequence_nll_loss_bits, optimizer,  data_model, cuda, writer, scheduler

    )

    checkpoints_path = os.path.join(results_path, 'checkpoints')
    checkpoint_data = load_last_checkpoint(checkpoints_path)
    if checkpoint_data is not None:
        (state_dict, epoch, iteration) = checkpoint_data
        trainer.epochs = epoch
        trainer.iterations = iteration
        predictor.load_state_dict(state_dict)

    trainer.register_plugin(TrainingLossMonitor(
        smoothing=params['loss_smoothing']
    ))
    trainer.register_plugin(ValidationPlugin(
        data_loader('validation'),
        data_loader('test'),
        writer
    ))
    trainer.register_plugin(AbsoluteTimeMonitor())
    trainer.register_plugin(SaverPlugin(
        checkpoints_path, params['keep_old_checkpoints']
    ))

    trainer.register_plugin(
        Logger([
            'training_loss',
            'validation_loss',
            'test_loss',
            'time'
        ])
    )

    trainer.register_plugin(StatsPlugin(
        results_path,
        iteration_fields=[
            'training_loss',
            ('training_loss', 'running_avg'),
            'time'
        ],
        epoch_fields=[
            'validation_loss',
            'test_loss',
            'time'
        ],
        plots={
            'loss': {
                'x': 'iteration',
                'ys': [
                    'training_loss',
                    ('training_loss', 'running_avg'),
                    'validation_loss',
                    'test_loss',
                ],
                'log_y': True
            }
        }
    ))
    
    trainer.run(params['epoch_limit'])
Example #29
0
def train(args):

    # construct data loader
    train_data_set = WaiMaiDataSet(TRAIN_PATH, wv_embedding.word_to_id,
                                   args.max_len, args.use_unk)
    train_data_loader = DataLoader(train_data_set,
                                   batch_size=args.batch_size,
                                   shuffle=True)

    dev_data_set = WaiMaiDataSet(DEV_PATH, wv_embedding.word_to_id,
                                 args.max_len, args.use_unk)
    dev_data_loader = DataLoader(dev_data_set,
                                 batch_size=args.batch_size,
                                 shuffle=False)

    model = LSTMAttenModel(args.embedding_dim, args.hidden_dim,
                           args.vocab_size, args.num_layers, args.num_directs,
                           wv_embedding.embedding)

    model = model.to(DEVICE)

    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=args.weight_decay)
    # optimizer = optim.AdamW(model.parameters(), weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer, milestones=[10], gamma=0.1)

    criteon = nn.CrossEntropyLoss().to(DEVICE)

    train_loss_list = []
    val_loss_list = []
    train_acc_list = []
    val_acc_list = []

    # log process
    best_acc = 0
    for epoch in range(args.epochs):
        train_loss = []
        model.train()
        correct = 0
        for tokens, token_len, labels in train_data_loader:
            tokens, token_len, labels = tokens.to(DEVICE), token_len.to(
                DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            preds = model(tokens, token_len)

            loss = criteon(preds, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()

            train_loss.append(loss.item())
            correct += (preds.argmax(1) == labels).sum()
            torch.cuda.empty_cache()

        # learning rate decay
        scheduler.step()

        train_loss = np.array(train_loss).mean()
        train_acc = float(correct) / len(train_data_loader.dataset)
        val_loss, val_acc = evaluate(model, dev_data_loader, criteon)

        # if val_acc> best_acc:
        #     best_acc = val_acc
        torch.save(model.state_dict(),
                   SAVE_PATH + "epoch{}_{:.4f}.pt".format(epoch, val_acc))
        print('epoch: {}, Training loss: {:.4f}, Val loss: {:.4f}'.format(
            epoch, train_loss, val_loss))
        print('Training acc: {:.4f}, Val acc: {:.4f}'.format(train_acc, val_acc))
        print()

        train_loss_list.append(train_loss)
        train_acc_list.append(train_acc)
        val_loss_list.append(val_loss)
        val_acc_list.append(val_acc)

    plt.figure()
    plt.plot(train_loss_list)
    plt.plot(val_loss_list)
    plt.xlabel("epoch")
    plt.ylabel('loss')
    plt.legend(['train', 'val'])
    plt.show()

    plt.figure()
    plt.plot(train_acc_list)
    plt.plot(val_acc_list)
    plt.xlabel("epoch")
    plt.ylabel('accuracy')
    plt.legend(['train', 'val'])
    plt.show()
Example #30
0
    cnn = ResNet18(num_classes=num_classes)
elif args.model == 'wideresnet':
    if args.dataset == 'svhn':
        cnn = WideResNet(depth=16, num_classes=num_classes, widen_factor=8,
                         dropRate=0.4)
    else:
        cnn = WideResNet(depth=28, num_classes=num_classes, widen_factor=10,
                         dropRate=0.3)

cnn = cnn.cuda()
criterion = nn.CrossEntropyLoss().cuda()
cnn_optimizer = torch.optim.SGD(cnn.parameters(), lr=args.learning_rate,
                                momentum=0.9, nesterov=True, weight_decay=5e-4)

if args.dataset == 'svhn':
    scheduler = MultiStepLR(cnn_optimizer, milestones=[80, 120], gamma=0.1)
else:
    scheduler = MultiStepLR(cnn_optimizer, milestones=[60, 120, 160], gamma=0.2)

filename = 'logs/' + test_id + '.csv'
csv_logger = CSVLogger(args=args, fieldnames=['epoch', 'train_acc', 'test_acc'], filename=filename)


def test(loader):
    cnn.eval()    # Change model to 'eval' mode (BN uses moving mean/var).
    correct = 0.
    total = 0.
    for images, labels in loader:
        images = images.cuda()
        labels = labels.cuda()