Example #1
def main():
    best_acc = 0
    opt = parse_option()

    # build data loader
    train_loader, val_loader = set_loader(opt)

    # build model and criterion
    model, classifier, criterion = set_model(opt)

    # build optimizer
    # optimizer = set_optimizer(opt, [classifier])
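    # optimize both the linear classifier and the backbone encoder (full fine-tuning)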
    optimizer = set_optimizer(opt, [classifier, model])

    # training routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # train for one epoch
        time1 = time.time()
        loss, acc = train(train_loader, model, classifier, criterion,
                          optimizer, epoch, opt)
        time2 = time.time()
        print('Train epoch {}, total time {:.2f}, accuracy:{:.2f}'.format(
            epoch, time2 - time1, acc))

        # eval for one epoch
        loss, val_acc = validate(val_loader, model, classifier, criterion, opt)
        if val_acc > best_acc:
            best_acc = val_acc

    print('best accuracy: {:.2f}'.format(best_acc))
Example #2
def main():
    best_acc = 0
    opt = parse_option()

    # build data loader
    train_loader, val_loader = set_loader(opt)

    # build model and criterion
    model, criterion = set_model(opt)

    # build optimizer
    optimizer = set_optimizer(opt, model)

    # tensorboard
    writer = SummaryWriter(log_dir=opt.tb_folder, flush_secs=2)

    # training routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # train for one epoch
        time1 = time.time()
        loss, train_acc = train(train_loader, model, criterion, optimizer,
                                epoch, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        writer.add_scalar('train_loss', loss, global_step=epoch)
        writer.add_scalar('train_acc', train_acc, global_step=epoch)
        writer.add_scalar('learning_rate',
                          optimizer.param_groups[0]['lr'],
                          global_step=epoch)

        # evaluation
        loss, val_acc = validate(val_loader, model, criterion, opt)
        writer.add_scalar('val_loss', loss, global_step=epoch)
        writer.add_scalar('val_acc', val_acc, global_step=epoch)

        if val_acc > best_acc:
            best_acc = val_acc

        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, save_file)

    # save the last model
    save_file = os.path.join(opt.save_folder, 'last.pth')
    save_model(model, optimizer, opt, opt.epochs, save_file)

    print('best accuracy: {:.2f}'.format(best_acc))
Example #3
def re_train():
    sent_analysis = SentimentAnalysis()
    # d_word_index, embed = sent_analysis.get_vocab()
    train_loader, val_loader = sent_analysis.get_trainer()
    model, criterion, optimizer = sent_analysis.get_model()

    for epoch in range(1, sent_analysis.epochs + 1):
        util.adjust_learning_rate(sent_analysis.lr, optimizer, epoch)
        train(train_loader, model, criterion, optimizer, epoch)
        # test(val_loader, model, criterion)

        if epoch % sent_analysis.sf == 0:
            name_model = 'rnn_{}.pkl'.format(epoch)
            path_save_model = os.path.join('./', name_model)
            joblib.dump(model.float(), path_save_model, compress=2)
Example #4
def main():
    best_acc = 0
    best_acc5 = 0
    opt = parse_option()

    # build data loader
    train_loader, val_loader = set_loader(opt)

    # build model and criterion
    model, classifier, criterion = set_model(opt)

    # build optimizer
    optimizer = set_optimizer(opt, classifier)

    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # training routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # train for one epoch
        time1 = time.time()
        loss, acc, acc5 = train(train_loader, model, classifier, criterion,
                                optimizer, epoch, opt)
        time2 = time.time()
        logging.info(
            'Train epoch {}, total time {:.2f}, accuracy:{:.2f}'.format(
                epoch, time2 - time1, acc))

        logger.log_value('classifier/train_loss', loss, epoch)
        logger.log_value('classifier/train_acc1', acc, epoch)
        logger.log_value('classifier/train_acc5', acc5, epoch)

        # eval for one epoch
        loss, val_acc, val_acc5 = validate(val_loader, model, classifier,
                                           criterion, opt)
        logger.log_value('classifier/val_loss', loss, epoch)
        logger.log_value('classifier/val_acc1', val_acc, epoch)
        logger.log_value('classifier/val_acc5', val_acc5, epoch)
        if val_acc > best_acc:
            best_acc = val_acc
            best_acc5 = val_acc5

    logging.info('best accuracy: {:.2f}, accuracy5: {:.2f}'.format(
        best_acc, best_acc5))
Example #5
def main():
    best_acc = 0
    opt = parse_option()

    # build data loader
    train_loader, val_loader = set_loader(opt)

    # build model and criterion
    model, classifier, criterion = set_model(opt)

    # build optimizer
    optimizer = set_optimizer(opt, classifier)

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # training routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # train for one epoch
        time1 = time.time()
        loss, acc = train(train_loader, model, classifier, criterion,
                          optimizer, epoch, opt)
        time2 = time.time()
        print('Train epoch {}, total time {:.2f}, accuracy:{:.2f}'.format(
            epoch, time2 - time1, acc))

        # eval for one epoch
        loss, val_acc = validate(val_loader, model, classifier, criterion, opt)
        if val_acc > best_acc:
            best_acc = val_acc

        # tensorboard logger
        logger.log_value('loss', loss, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                         epoch)

        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, save_file, classifier)

    print('best accuracy: {:.2f}'.format(best_acc))
Example #6
def main():
    best_acc = 0
    best_classifier = None
    opt = parse_option()

    # build data loader
    train_loader, val_loader = set_loader(opt)

    # build model and criterion
    model, classifier, criterion = set_model(opt)
    best_classifier = classifier

    # build optimizer
    optimizer = set_optimizer(opt, classifier)

    if opt.eval:
        loss, val_acc = validate(val_loader, model, classifier, criterion, opt)
    else:
        # training routine
        for epoch in range(1, opt.epochs + 1):
            adjust_learning_rate(opt, optimizer, epoch)

            # train for one epoch
            time1 = time.time()
            loss, acc = train(train_loader, model, classifier, criterion,
                              optimizer, epoch, opt)
            time2 = time.time()
            print('Train epoch {}, total time {:.2f}, accuracy:{:.2f}'.format(
                epoch, time2 - time1, acc))

            # eval for one epoch
            loss, val_acc = validate(val_loader, model, classifier, criterion,
                                     opt)
            if val_acc > best_acc:
                best_acc = val_acc
                best_classifier = classifier

        print('best accuracy: {:.2f}'.format(best_acc))

    for epsilon in opt.epsilons:
        loss, acc, adv_acc = adveval(val_loader, model, best_classifier,
                                     criterion, opt, epsilon)
        print('adv accuracy at epsilon {:.2f}: {:.2f}'.format(
            epsilon, adv_acc))
Example #7
def main():
    opt = parse_option()

    # build data loader
    train_loader = set_loader(opt)

    # build model and criterion
    model, criterion = set_model(opt)

    # build optimizer
    optimizer = set_optimizer(opt, model)

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # training routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # train for one epoch
        time1 = time.time()
        loss = train(train_loader, model, criterion, optimizer, epoch, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('loss', loss, epoch)
        logger.log_value(
            'learning_rate', optimizer.param_groups[0]['lr'], epoch)

        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, save_file)

    # save the last model
    save_file = os.path.join(
        opt.save_folder, 'last.pth')
    save_model(model, optimizer, opt, opt.epochs, save_file)
Example #8
def main():
    best_acc = 0

    opt = parse_option()
    wandb.init(project=opt.model_path.split("/")[-1], tags=opt.tags)
    wandb.config.update(opt)
    wandb.save('*.py')
    wandb.run.save()
        
        
    # dataloader
    train_loader, val_loader, meta_testloader, meta_valloader, n_cls, no_sample = get_dataloaders(opt)
    
    # model
    model_t = []
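    # load one or more teacher models (opt.path_t may hold comma-separated checkpoint paths)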
    if "," in opt.path_t:
        for path in opt.path_t.split(","):
            model_t.append(load_teacher(path, opt.model_t, n_cls, opt.dataset, opt.trans, opt.memfeature_size))
    else:
        model_t.append(load_teacher(opt.path_t, opt.model_t, n_cls, opt.dataset, opt.trans, opt.memfeature_size))
    
    model_s = create_model(opt.model_s, n_cls, opt.dataset, n_trans=opt.trans, embd_sz=opt.memfeature_size)
    if torch.cuda.device_count() > 1:
        print("second gpu count:", torch.cuda.device_count())
        model_s = nn.DataParallel(model_s)
    if opt.pretrained_path != "":
        model_s.load_state_dict(torch.load(opt.pretrained_path)['model'])
    wandb.watch(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    criterion_kd = DistillKL(opt.kd_T)

    optimizer = optim.SGD(model_s.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    if torch.cuda.is_available():
        for m in model_t: 
            m.cuda()
        model_s.cuda()
        criterion_cls = criterion_cls.cuda()
        criterion_div = criterion_div.cuda()
        criterion_kd = criterion_kd.cuda()
        cudnn.benchmark = True
    
    # random memory bank of per-sample embeddings, L2-normalized row-wise
    MemBank = np.random.randn(no_sample, opt.memfeature_size)
    MemBank = torch.tensor(MemBank, dtype=torch.float).cuda()
    MemBankNorm = torch.norm(MemBank, dim=1, keepdim=True)
    MemBank = MemBank / (MemBankNorm + 1e-6)
           
    meta_test_acc = 0 
    meta_test_std = 0
    # routine: supervised model distillation
    for epoch in range(1, opt.epochs + 1):

        if opt.cosine:
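            # NOTE: assumes a cosine LR scheduler was created beforehand; it is not defined in this snippet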
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss, MemBank = train(epoch, train_loader, model_s, model_t , criterion_cls, criterion_div, criterion_kd, optimizer, opt, MemBank)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        val_acc = 0
        val_loss = 0
        meta_val_acc = 0
        meta_val_std = 0
#         val_acc, val_acc_top5, val_loss = validate(val_loader, model_s, criterion_cls, opt)
        
        
#         #evaluate
#         start = time.time()
#         meta_val_acc, meta_val_std = meta_test(model_s, meta_valloader)
#         test_time = time.time() - start
#         print('Meta Val Acc: {:.4f}, Meta Val std: {:.4f}, Time: {:.1f}'.format(meta_val_acc, meta_val_std, test_time))
        
        #evaluate
        start = time.time()
        meta_test_acc, meta_test_std = 0, 0  # meta_test(model_s, meta_testloader, use_logit=False)
        test_time = time.time() - start
        print('Meta Test Acc: {:.4f}, Meta Test std: {:.4f}, Time: {:.1f}'.format(meta_test_acc, meta_test_std, test_time))
        
        
        # regular saving
        if epoch % opt.save_freq == 0 or epoch == opt.epochs:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
            }            
            save_file = os.path.join(opt.save_folder, 'model_'+str(wandb.run.name)+'.pth')
            torch.save(state, save_file)
            
            #wandb saving
            torch.save(state, os.path.join(wandb.run.dir, "model.pth"))
        
        wandb.log({'epoch': epoch, 
                   'Train Acc': train_acc,
                   'Train Loss':train_loss,
                   'Val Acc': val_acc,
                   'Val Loss':val_loss,
                   'Meta Test Acc': meta_test_acc,
                   'Meta Test std': meta_test_std,
                   'Meta Val Acc': meta_val_acc,
                   'Meta Val std': meta_val_std
                  })        
        
    #final report
    print("GENERATING FINAL REPORT")
    generate_final_report(model_s, opt, wandb)
    
    # remove the output.log file
    output_log_file = os.path.join(wandb.run.dir, "output.log")
    if os.path.isfile(output_log_file):
        os.remove(output_log_file)
    else:    ## Show an error ##
        print("Error: %s file not found" % output_log_file)
Example #9
def main():

    global best_acc1
    best_acc1 = 0

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    train_folder = os.path.join(args.data_folder, 'train')
    val_folder = os.path.join(args.data_folder, 'val')

    image_size = 224
    crop_padding = 32
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    if args.aug == 'NULL':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        raise NotImplementedError('augmentation not supported: {}'.format(args.aug))

    train_dataset = datasets.ImageFolder(train_folder, train_transform)
    val_dataset = datasets.ImageFolder(
        val_folder,
        transforms.Compose([
            transforms.Resize(image_size + crop_padding),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ]))

    print(len(train_dataset))
    train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers,
                                             pin_memory=True)

    # create model and optimizer
    if args.model == 'resnet50':
        model = InsResNet50()
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 1)
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    print('==> loading pre-trained model')
    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['model'])
    print("==> loaded checkpoint '{}' (epoch {})".format(
        args.model_path, ckpt['epoch']))
    print('==> done')

    model = model.cuda()
    classifier = classifier.cuda()

    criterion = torch.nn.CrossEntropyLoss().cuda(args.gpu)

    if not args.adam:
        optimizer = torch.optim.SGD(classifier.parameters(),
                                    lr=args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.Adam(classifier.parameters(),
                                     lr=args.learning_rate,
                                     betas=(args.beta1, args.beta2),
                                     weight_decay=args.weight_decay,
                                     eps=1e-8)

    model.eval()
    cudnn.benchmark = True

    # set mixed precision training
    # if args.amp:
    #     model = amp.initialize(model, opt_level=args.opt_level)
    #     classifier, optimizer = amp.initialize(classifier, optimizer, opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_acc1 = checkpoint['best_acc1']
            best_acc1 = best_acc1.cuda()
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            if 'opt' in checkpoint.keys():
                # resume optimization hyper-parameters
                print('=> resume hyper parameters')
                if 'bn' in vars(checkpoint['opt']):
                    print('using bn: ', checkpoint['opt'].bn)
                if 'adam' in vars(checkpoint['opt']):
                    print('using adam: ', checkpoint['opt'].adam)
                if 'cosine' in vars(checkpoint['opt']):
                    print('using cosine: ', checkpoint['opt'].cosine)
                args.learning_rate = checkpoint['opt'].learning_rate
                # args.lr_decay_epochs = checkpoint['opt'].lr_decay_epochs
                args.lr_decay_rate = checkpoint['opt'].lr_decay_rate
                args.momentum = checkpoint['opt'].momentum
                args.weight_decay = checkpoint['opt'].weight_decay
                args.beta1 = checkpoint['opt'].beta1
                args.beta2 = checkpoint['opt'].beta2
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # set cosine annealing scheduler
    if args.cosine:

        # last_epoch = args.start_epoch - 2
        # eta_min = args.learning_rate * (args.lr_decay_rate ** 3) * 0.1
        # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min, last_epoch)

        eta_min = args.learning_rate * (args.lr_decay_rate**3) * 0.1
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.epochs, eta_min, -1)
        # dummy loop to catch up with current epoch
        for i in range(1, args.start_epoch):
            scheduler.step()

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        if args.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model,
                                                  classifier, criterion,
                                                  optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_acc5', train_acc5, epoch)
        logger.log_value('train_loss', train_loss, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                         epoch)

        print("==> testing...")
        test_acc, test_acc5, test_loss = validate(val_loader, model,
                                                  classifier, criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc5', test_acc5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc1:
            best_acc1 = test_acc
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            save_name = '{}_layer{}.pth'.format(args.model, args.layer)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving best model!')
            torch.save(state, save_name)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_name = 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving regular model!')
            torch.save(state, save_name)

        # tensorboard logger
        pass
Example #10
def main():

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    data_folder = os.path.join(args.data_folder, 'train')

    image_size = 224
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    if args.aug == 'NULL':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        raise NotImplementedError('augmentation not supported: {}'.format(args.aug))

    train_dataset = ImageFolderInstance(data_folder,
                                        transform=train_transform,
                                        two_crop=args.moco)
    print(len(train_dataset))
    train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    # create model and optimizer
    n_data = len(train_dataset)  # total number of training samples, passed to the contrast memory module

    if args.model == 'resnet50':
        model = InsResNet50()
        if args.moco:
            model_ema = InsResNet50()
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        if args.moco:
            model_ema = InsResNet50(width=2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        if args.moco:
            model_ema = InsResNet50(width=4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    # copy weights from `model' to `model_ema'
    if args.moco:
        moment_update(model, model_ema, 0)

    # set the contrast memory and criterion
    if args.moco:
        contrast = MemoryMoCo(128, n_data, args.nce_k, args.nce_t,
                              args.softmax).cuda(args.gpu)
    else:
        contrast = MemoryInsDis(128, n_data, args.nce_k, args.nce_t,
                                args.nce_m, args.softmax).cuda(args.gpu)

    criterion = NCESoftmaxLoss() if args.softmax else NCECriterion(n_data)
    criterion = criterion.cuda(args.gpu)

    model = model.cuda()
    if args.moco:
        model_ema = model_ema.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level)
        if args.moco:
            optimizer_ema = torch.optim.SGD(model_ema.parameters(),
                                            lr=0,
                                            momentum=0,
                                            weight_decay=0)
            model_ema, optimizer_ema = amp.initialize(model_ema,
                                                      optimizer_ema,
                                                      opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            if args.moco:
                model_ema.load_state_dict(checkpoint['model_ema'])

            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])

            print("=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        if args.moco:
            loss, prob = train_moco(epoch, train_loader, model, model_ema,
                                    contrast, criterion, optimizer, args)
        else:
            loss, prob = train_ins(epoch, train_loader, model, contrast,
                                   criterion, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('ins_loss', loss, epoch)
        logger.log_value('ins_prob', prob, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                         epoch)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': model.state_dict(),
                'contrast': contrast.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            if args.moco:
                state['model_ema'] = model_ema.state_dict()
            if args.amp:
                state['amp'] = amp.state_dict()
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # help release GPU memory
            del state

        # saving the model
        print('==> Saving...')
        state = {
            'opt': args,
            'model': model.state_dict(),
            'contrast': contrast.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
        }
        if args.moco:
            state['model_ema'] = model_ema.state_dict()
        if args.amp:
            state['amp'] = amp.state_dict()
        save_file = os.path.join(args.model_folder, 'current.pth')
        torch.save(state, save_file)
        if epoch % args.save_freq == 0:
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
        # help release GPU memory
        del state
        torch.cuda.empty_cache()
Example #11
            # save whole model (including stylebank)
            torch.save(model.state_dict(), args.MODEL_WEIGHT_PATH)
            # save separate parts
            with open(args.GLOBAL_STEP_PATH, 'w') as f:
                f.write(str(global_step))
            torch.save(model.encoder_net.state_dict(),
                       args.ENCODER_WEIGHT_PATH)
            torch.save(model.decoder_net.state_dict(),
                       args.DECODER_WEIGHT_PATH)
            for i in range(len(style_dataset)):
                torch.save(model.style_bank[i].state_dict(),
                           args.BANK_WEIGHT_PATH.format(i))

        if global_step % args.ADJUST_LR_ITER == 0:
            lr_step = global_step / args.ADJUST_LR_ITER
            util.adjust_learning_rate(optimizer, lr_step)
            new_lr = util.adjust_learning_rate(optimizer_ae, lr_step)

            print("learning rate decay:", new_lr)

# In[13]:
"""
Testing
"""
#for i, data in enumerate(content_dataloader, 0):
#    data = data[0].to(device)
#    batch_size = data.shape[0]
##     data = data[0].repeat(batch_size, 1, 1, 1)
#    for j in range(batch_size):
#        util.showimg(data[j].cpu())
#
Example #12
print(model)
print(optimizer)
print(criterion)

if args['cuda']:
    torch.backends.cudnn.enabled = True
    cudnn.benchmark = True
    model.cuda()
    criterion = criterion.cuda()

# training and testing
start_time = time.time()
# Create a SummaryWriter that writes subsequent values to the logs folder in protocol buffer format; an empty logs folder is created automatically.
writer = SummaryWriter()
for epoch in range(1, args['epochs'] + 1):
    util.adjust_learning_rate(args['learning_rate'], optimizer, epoch)
    if args['model'] == 'grn16':
        common.train_grn16(train_loader, model, criterion, optimizer, epoch,
                           args['cuda'], args['clip'], args['print_freq'])
        common.test_grn16(val_loader, model, criterion, args['cuda'],
                          args['print_freq'])
    elif args['model'] == 'keann':
        common.train_keann(train_loader, model, criterion, optimizer, epoch,
                           args['cuda'], args['clip'], args['print_freq'])
        common.test_keann(val_loader, model, criterion, args['cuda'],
                          args['print_freq'], args['pdtb_category'])
    elif args['model'] == 'keann_kg':
        common.train_keann_kg(train_loader, model, criterion, optimizer, epoch,
                              args['cuda'], args['clip'], args['print_freq'],
                              writer)
        common.test_keann_kg(val_loader, model, criterion, args['cuda'],
Example #13
def train_net():
    annList = [
        '../data/train/Annotations/blouse.csv',
        '../data/train/Annotations/dress.csv',
        '../data/train/Annotations/outwear.csv',
        '../data/train/Annotations/skirt.csv',
        '../data/train/Annotations/trousers.csv'
    ]
    classNumList = [13, 15, 14, 4, 7]
    index_array = [[2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16],
                   [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20],
                   [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20], [17, 18, 21, 22, 23, 24, 25]]

    paramsNameList = ['blouse', 'dress', 'outwear', 'skirt', 'trousers']
    modelSaveList = [
        '../saveparameter/blouse/', '../saveparameter/dress/',
        '../saveparameter/outwear/', '../saveparameter/skirt/',
        '../saveparameter/trousers/'
    ]
    paramsOldList = [
        '../saveparameter/blouse/3000res50.pth.tar',
        '../saveparameter/dress/15000new2.pth.tar',
        '../saveparameter/outwear/10000new2.pth.tar',
        '../saveparameter/skirt/5000new2.pth.tar',
        '/home/tanghm/Documents/YFF/project/saveparameter/trousers/15000new2.pth.tar'
    ]
    for idx in range(0, 1):
        # print the clothing category currently being trained
        print('train' + paramsNameList[idx])
        # number of keypoints to predict for this clothing category
        numpoints = classNumList[idx]
        # build the model
        model = construct_model(numpoints)
        state_dict = torch.load(paramsOldList[idx])['state_dict']
        model.load_state_dict(state_dict)
        # path to the annotation (label) file
        ann_path = annList[idx]
        # directory containing the training images
        img_dir = '../data/train/'

        stride = 8
        cudnn.benchmark = True
        config = util.Config('./config.yml')
        # build the training data loader
        train_loader = torch.utils.data.DataLoader(
            dataset_loader.dataset_loader(numpoints,
                                          img_dir,
                                          ann_path,
                                          stride,
                                          Mytransforms.Compose([
                                              Mytransforms.RandomResized(),
                                              Mytransforms.RandomRotate(40),
                                              Mytransforms.RandomCrop(384),
                                          ]),
                                          sigma=15),
            batch_size=config.batch_size,
            shuffle=True,
            num_workers=config.workers,
            pin_memory=True)
        # loss function for the network
        if (torch.cuda.is_available()):
            criterion = nn.MSELoss().cuda()
        params = []
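        # collect only parameters that require gradients, each with the base learning rate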
        for key, value in model.named_parameters():
            if value.requires_grad:
                params.append({'params': value, 'lr': config.base_lr})

        # optimizer = torch.optim.SGD(params, config.base_lr, momentum=config.momentum,
        #                             weight_decay=config.weight_decay)
        optimizer = torch.optim.Adam(params,
                                     lr=config.base_lr,
                                     betas=(0.9, 0.99),
                                     weight_decay=config.weight_decay)
        # model.train() # only for bn and dropout
        model.eval()

        # from matplotlib import pyplot as plt

        iters = 0
        batch_time = util.AverageMeter()
        data_time = util.AverageMeter()
        losses = util.AverageMeter()
        losses_list = [util.AverageMeter() for i in range(12)]
        end = time.time()

        heat_weight = 48 * 48 * (classNumList[idx] + 1) / 2.0  # for easy comparison with the original code
        # heat_weight = 1

        while iters < config.max_iter:
            # input is the image batch; heatmap is the corresponding target heatmap
            for i, (input, heatmap) in enumerate(train_loader):
                learning_rate = util.adjust_learning_rate(optimizer, iters, config.base_lr, policy=config.lr_policy, \
                                                          policy_parameter=config.policy_parameter)
                data_time.update(time.time() - end)
                if torch.cuda.is_available():
                    input = input.cuda(non_blocking=True)
                    heatmap = heatmap.cuda(non_blocking=True)
                input_var = torch.autograd.Variable(input)
                heatmap_var = torch.autograd.Variable(heatmap)
                # wrap the inputs in Variables and feed the image through the model
                heat = model(input_var)

                # feat = C4.cpu().data.numpy()
                # for n in range(100):
                #     plt.subplot(10, 10, n + 1);
                #     plt.imshow(feat[0, n, :, :], cmap='gray')
                #     plt.xticks([]);
                #     plt.yticks([])
                # plt.show()

                loss1 = criterion(heat, heatmap_var) * heat_weight
                # loss2 = criterion(heat4, heatmap_var) * heat_weight
                # loss3 = criterion(heat5, heatmap_var) * heat_weight
                # loss4 = criterion(heat6, heatmap_var) * heat_weight
                # loss5 = criterion(heat, heatmap_var)
                # loss6 = criterion(heat, heatmap_var)

                loss = loss1  # + loss2 + loss3# + loss4# + loss5 + loss6
                losses.update(loss.item(), input.size(0))
                loss_list = [loss1]  # , loss2, loss3, loss4, loss5, loss6
                for cnt, l in enumerate(loss_list):
                    losses_list[cnt].update(l.item(), input.size(0))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                batch_time.update(time.time() - end)
                end = time.time()

                iters += 1
                if iters % config.display == 0:
                    print(
                        'Train Iteration: {0}\t'
                        'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                        'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                        'Learning rate = {2}\n'
                        'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.
                        format(iters,
                               config.display,
                               learning_rate,
                               batch_time=batch_time,
                               data_time=data_time,
                               loss=losses))
                    for cnt in range(0, 1):
                        print(
                            'Loss{0}_1 = {loss1.val:.8f} (ave = {loss1.avg:.8f})'
                            .format(cnt + 1, loss1=losses_list[cnt]))
                    print(
                        time.strftime(
                            '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                            time.localtime()))

                    batch_time.reset()
                    data_time.reset()
                    losses.reset()
                    for cnt in range(12):
                        losses_list[cnt].reset()

                if iters % 1000 == 0:
                    torch.save(
                        {
                            'iter': iters,
                            'state_dict': model.state_dict(),
                        }, modelSaveList[idx] + str(iters) + 'res50.pth.tar')
                    with open('./logLoss2.txt', 'a') as f:
                        f.write(
                            'Train Iteration: {0}\t'
                            'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                            'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                            'Learning rate = {2}\n'
                            'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.
                            format(iters,
                                   config.display,
                                   learning_rate,
                                   batch_time=batch_time,
                                   data_time=data_time,
                                   loss=losses) + '\n')

                if iters == config.max_iter:
                    break

    return
Example #14

highestScore = 0
tsid = 0
name_model = 'parser_model2.pt'
path_save_model = os.path.join('gen', name_model)
for epoch in range(1, args.epochs+1):

    for i, (word_tensor, ext_word_ids, char_ids, pos_tensor, xpos_tensor,
            head_targets, rel_targets, seq_lengths, perm_idx) in enumerate(train_loader):

        start = time.time()
        # switch to train mode
        model.train()
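        # global step index across epochs; the learning rate is adjusted every 5000 steps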
        ts = ((epoch - 1) * train_loader.n_batches) + (i + 1)
        if ts % 5000 == 0:
            adjust_learning_rate(args.lr, optimizer, optimizer_sparse)

        if args.cuda:
            word_tensor = word_tensor.cuda()
            pos_tensor = pos_tensor.cuda()
            xpos_tensor = xpos_tensor.cuda()
            head_targets = head_targets.cuda()
            rel_targets = rel_targets.cuda()

        # compute output
        arc_logits, label_logits = model(word_tensor, ext_word_ids, char_ids,
                                         pos_tensor, xpos_tensor, seq_lengths)
        arc_logits = arc_logits[:, 1:, :]
        label_logits = label_logits[:, 1:, :, :]
        head_targets = head_targets.view(-1)
        rel_targets = rel_targets.view(-1)
        s_arc_scores, s_arc_indices = torch.max(arc_logits, 2)
Example #15
def main():

    # parse the args
    args = parse_option()

    # set the loader
    train_loader, n_data = get_train_loader(args)

    # set the model
    model, contrast, criterion_ab, criterion_l = set_model(args, n_data)

    # set the optimizer
    optimizer = set_optimizer(args, model)

    # set mixed precision
    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        l_loss, l_prob, ab_loss, ab_prob = train(epoch, train_loader, model, contrast, criterion_l, criterion_ab,
                                                 optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('l_loss', l_loss, epoch)
        logger.log_value('l_prob', l_prob, epoch)
        logger.log_value('ab_loss', ab_loss, epoch)
        logger.log_value('ab_prob', ab_prob, epoch)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': model.state_dict(),
                'contrast': contrast.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            if args.amp:
                state['amp'] = amp.state_dict()
            save_file = os.path.join(args.model_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # help release GPU memory
            del state

        torch.cuda.empty_cache()
Example #16
def train(args):

    # Device, save and log configuration

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    save_dir = Path(os.path.join(args.save_dir, args.name))
    save_dir.mkdir(exist_ok=True, parents=True)
    log_dir = Path(os.path.join(args.log_dir, args.name))
    log_dir.mkdir(exist_ok=True, parents=True)
    writer = SummaryWriter(log_dir=str(log_dir))

    # Prepare datasets

    content_dataset = TrainDataset(args.content_dir, args.img_size)
    texture_dataset = TrainDataset(args.texture_dir,
                                   args.img_size,
                                   gray_only=True)
    color_dataset = TrainDataset(args.color_dir, args.img_size)

    content_iter = iter(
        data.DataLoader(content_dataset,
                        batch_size=args.batch_size,
                        sampler=InfiniteSamplerWrapper(content_dataset),
                        num_workers=args.n_threads))
    texture_iter = iter(
        data.DataLoader(texture_dataset,
                        batch_size=args.batch_size,
                        sampler=InfiniteSamplerWrapper(texture_dataset),
                        num_workers=args.n_threads))
    color_iter = iter(
        data.DataLoader(color_dataset,
                        batch_size=args.batch_size,
                        sampler=InfiniteSamplerWrapper(color_dataset),
                        num_workers=args.n_threads))

    # Prepare network

    network = Net(args)
    network.train()
    network.to(device)

    # Training options

    opt_L = torch.optim.Adam(network.L_path.parameters(), lr=args.lr)
    opt_AB = torch.optim.Adam(network.AB_path.parameters(), lr=args.lr)

    opts = [opt_L, opt_AB]

    # Start Training

    for i in tqdm(range(args.max_iter)):
        # S1: Adjust lr and prepare data

        adjust_learning_rate(opts, iteration_count=i, args=args)

        content_l, content_ab = [x.to(device) for x in next(content_iter)]
        texture_l = next(texture_iter).to(device)
        color_l, color_ab = [x.to(device) for x in next(color_iter)]

        # S2: Forward

        l_pred, ab_pred = network(content_l, content_ab, texture_l, color_ab)

        # S3: Calculate loss

        loss_ct, loss_t = network.ct_t_loss(l_pred, content_l, texture_l)
        loss_cr = network.cr_loss(ab_pred, color_ab)

        loss_ctw = args.content_weight * loss_ct
        loss_tw = args.texture_weight * loss_t
        loss_crw = args.color_weight * loss_cr

        loss = loss_ctw + loss_tw + loss_crw

        # S4: Backward

        for opt in opts:
            opt.zero_grad()
        loss.backward()
        for opt in opts:
            opt.step()

        # S5: Summary loss and save subnets

        writer.add_scalar('loss_content', loss_ct.item(), i + 1)
        writer.add_scalar('loss_texture', loss_t.item(), i + 1)
        writer.add_scalar('loss_color', loss_cr.item(), i + 1)

        if (i + 1) % args.save_model_interval == 0 or (i + 1) == args.max_iter:
            state_dict = network.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].to(torch.device('cpu'))
            torch.save(state_dict,
                       save_dir / 'network_iter_{:d}.pth.tar'.format(i + 1))
    writer.close()
Example #17
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    best_acc1 = 0

    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # set the model
    model, classifier, criterion = set_model(args, ngpus_per_node)

    # set optimizer
    optimizer = set_optimizer(args, classifier)

    cudnn.benchmark = True

    # optionally resume linear classifier
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # set the data loader
    train_loader, val_loader, train_sampler = get_train_val_loader(args)

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        if args.distributed:
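            # reshuffle the distributed sampler so each epoch sees a different sample order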
            train_sampler.set_epoch(epoch)

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, model, classifier,
                                      criterion, optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        print("==> testing...")
        test_acc, test_loss = validate(val_loader, model, classifier,
                                       criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc1:
            best_acc1 = test_acc
            if not args.multiprocessing_distributed or (
                    args.multiprocessing_distributed
                    and args.rank % ngpus_per_node == 0):
                state = {
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                save_name = '{}_layer{}.pth'.format(args.model, args.layer)
                save_name = os.path.join(args.save_folder, save_name)
                print('saving model!')
                torch.save(state, save_name)

        # regular save
        if not args.multiprocessing_distributed or \
                (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            if epoch % args.save_freq == 0:
                print('==> Saving...')
                state = {
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                save_file = os.path.join(
                    args.save_folder,
                    'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
                torch.save(state, save_file)

        # tensorboard logger
        pass
Example #18
def main():
    global best_acc1
    best_acc1 = 0

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    train_loader, val_loader, train_sampler = get_train_val_loader(args)

    # set the model
    model, classifier, criterion = set_model(args)

    # set optimizer
    optimizer = set_optimizer(args, classifier)

    cudnn.benchmark = True

    # optionally resume linear classifier
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model,
                                                  classifier, criterion,
                                                  optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_acc5', train_acc5, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        print("==> testing...")
        test_acc, test_acc5, test_loss = validate(val_loader, model,
                                                  classifier, criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc5', test_acc5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc1:
            best_acc1 = test_acc
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            save_name = '{}_layer{}.pth'.format(args.model, args.layer)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving best model!')
            torch.save(state, save_name)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_name = 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving regular model!')
            torch.save(state, save_name)

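# Note: the routine above relies on an adjust_learning_rate() helper that is not
# shown in this example. A minimal sketch is given below, assuming a step-decay
# schedule driven by hypothetical args.learning_rate, args.lr_decay_epochs and
# args.lr_decay_rate fields; the project's actual helper may differ.
def adjust_learning_rate(epoch, args, optimizer):
    """Decay the learning rate by args.lr_decay_rate at each milestone in args.lr_decay_epochs."""
    steps = sum(epoch > milestone for milestone in args.lr_decay_epochs)
    lr = args.learning_rate * (args.lr_decay_rate ** steps)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr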
Example #19
0
def train(current_gpu, args):
    best_acc1 = -1
    model_history = {}
    model_history = util.init_modelhistory(model_history)
    train_start = time.time()

    ## choose model from pytorch model_zoo
    model = util.torch_model(args.model_name, pretrained=True)
    loss_fn = nn.CrossEntropyLoss().cuda()

    ## distributed_setting
    model, args = dis_util.dist_setting(current_gpu, model, loss_fn, args)

    ## cuDNN will benchmark several convolution algorithms and pick the fastest one
    cudnn.benchmark = False if args.seed else True

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.apex:
        model, optimizer = dis_util.apex_init(model, optimizer, args)


#     args.collate_fn = partial(dis_util.fast_collate, memory_format=args.memory_format)

    args = _get_images(args, data_type='train')
    train_loader, train_sampler = _get_train_data_loader(args, **args.kwargs)
    test_loader = _get_test_data_loader(args, **args.kwargs)

    logger.info("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)))

    logger.info("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)))

    for epoch in range(1, args.num_epochs + 1):
        ##
        batch_time = util.AverageMeter('Time', ':6.3f')
        data_time = util.AverageMeter('Data', ':6.3f')
        losses = util.AverageMeter('Loss', ':.4e')
        top1 = util.AverageMeter('Acc@1', ':6.2f')
        top5 = util.AverageMeter('Acc@5', ':6.2f')
        progress = util.ProgressMeter(
            len(train_loader), [batch_time, data_time, losses, top1, top5],
            prefix="Epoch: [{}]".format(epoch))

        trn_loss = []
        model.train()
        end = time.time()
        running_loss = 0.0
        ## Set epoch count for DistributedSampler
        if args.multigpus_distributed:
            train_sampler.set_epoch(epoch)

        prefetcher = util.data_prefetcher(train_loader)
        input, target = prefetcher.next()
        batch_idx = 0
        while input is not None:

            batch_idx += 1

            if args.prof >= 0 and batch_idx == args.prof:
                print("Profiling begun at iteration {}".format(batch_idx))
                torch.cuda.cudart().cudaProfilerStart()

            if args.prof >= 0:
                torch.cuda.nvtx.range_push(
                    "Body of iteration {}".format(batch_idx))

            util.adjust_learning_rate(optimizer, epoch, batch_idx,
                                      len(train_loader), args)

            ##### DATA Processing #####
            targets_gra = target[:, 0]
            targets_vow = target[:, 1]
            targets_con = target[:, 2]

            # With 50% probability, use the original batch as-is (otherwise apply CutMix below)
            if np.random.rand() < 0.5:
                logits = model(input)
                grapheme = logits[:, :168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]

                loss1 = loss_fn(grapheme, targets_gra)
                loss2 = loss_fn(vowel, targets_vow)
                loss3 = loss_fn(cons, targets_con)

            else:

                lam = np.random.beta(1.0, 1.0)
                rand_index = torch.randperm(input.size()[0])
                shuffled_targets_gra = targets_gra[rand_index]
                shuffled_targets_vow = targets_vow[rand_index]
                shuffled_targets_con = targets_con[rand_index]

                bbx1, bby1, bbx2, bby2 = _rand_bbox(input.size(), lam)
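                # _rand_bbox samples the CutMix cut region; a hedged sketch of this helper follows the function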
                input[:, :, bbx1:bbx2, bby1:bby2] = input[rand_index, :,
                                                          bbx1:bbx2, bby1:bby2]
                # adjust lambda so it exactly matches the ratio of replaced pixels
                lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) /
                           (input.size()[-1] * input.size()[-2]))

                logits = model(input)
                grapheme = logits[:, :168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]

                loss1 = loss_fn(grapheme, targets_gra) * lam + loss_fn(
                    grapheme, shuffled_targets_gra) * (1. - lam)
                loss2 = loss_fn(vowel, targets_vow) * lam + loss_fn(
                    vowel, shuffled_targets_vow) * (1. - lam)
                loss3 = loss_fn(cons, targets_con) * lam + loss_fn(
                    cons, shuffled_targets_con) * (1. - lam)

            loss = 0.5 * loss1 + 0.25 * loss2 + 0.25 * loss3
            trn_loss.append(loss.item())
            running_loss += loss.item()

            #########################################################

            # compute gradient and do SGD step
            optimizer.zero_grad()

            if args.apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            # Printing vital information
            if (batch_idx + 1) % args.log_interval == 0:
                s = (f'[Epoch {epoch} Batch {batch_idx+1}/{len(train_loader)}] '
                     f'loss: {running_loss / args.log_interval:.4f}')
                print(s)
                running_loss = 0

            if batch_idx % args.log_interval == 0:
                # Every log_interval iterations, check the loss, accuracy, and speed.
                # For best performance, it doesn't make sense to print these metrics every
                # iteration, since they incur an allreduce and some host<->device syncs.

                # Measure accuracy
                prec1, prec5 = util.accuracy(logits, target, topk=(1, 5))

                # Average loss and accuracy across processes for logging
                if args.multigpus_distributed:
                    reduced_loss = dis_util.reduce_tensor(loss.data, args)
                    prec1 = dis_util.reduce_tensor(prec1, args)
                    prec5 = dis_util.reduce_tensor(prec5, args)
                else:
                    reduced_loss = loss.data

                # to_python_float incurs a host<->device sync
                losses.update(to_python_float(reduced_loss), input.size(0))
                top1.update(to_python_float(prec1), input.size(0))
                top5.update(to_python_float(prec5), input.size(0))

                ## Wait for queued GPU operations to finish (PyTorch launches kernels asynchronously by default)
                torch.cuda.synchronize()
                batch_time.update((time.time() - end) / args.log_interval)
                end = time.time()

                if current_gpu == 0:
                    print(
                        'Epoch: [{0}][{1}/{2}]  '
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})  '
                        'Speed {3:.3f} ({4:.3f})  '
                        'Loss {loss.val:.10f} ({loss.avg:.4f})  '
                        'Prec@1 {top1.val:.3f} ({top1.avg:.3f})  '
                        'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                            epoch,
                            batch_idx,
                            len(train_loader),
                            args.world_size * args.batch_size / batch_time.val,
                            args.world_size * args.batch_size / batch_time.avg,
                            batch_time=batch_time,
                            loss=losses,
                            top1=top1,
                            top5=top5))
                    model_history['epoch'].append(epoch)
                    model_history['batch_idx'].append(batch_idx)
                    model_history['batch_time'].append(batch_time.val)
                    model_history['losses'].append(losses.val)
                    model_history['top1'].append(top1.val)
                    model_history['top5'].append(top5.val)

            input, target = prefetcher.next()

        acc1 = validate(test_loader, model, loss_fn, epoch, model_history,
                        trn_loss, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multigpus_distributed or (args.multigpus_distributed and
                                              args.rank % args.num_gpus == 0):
            util.save_history(
                os.path.join(args.output_data_dir, 'model_history.p'),
                model_history)

            util.save_model(
                {
                    'epoch': epoch + 1,
                    'model_name': args.model_name,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                    #                 'class_to_idx' : train_loader.dataset.class_to_idx,
                },
                is_best,
                args.model_dir)
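# Note: _rand_bbox() used in the CutMix branch above is not defined in this
# example. A minimal sketch of the standard CutMix box sampler is given below;
# it follows the common reference implementation and may differ from the
# project's own helper.
import numpy as np

def _rand_bbox(size, lam):
    """Sample a random box covering roughly a (1 - lam) fraction of the image."""
    W, H = size[2], size[3]
    cut_rat = np.sqrt(1. - lam)
    cut_w, cut_h = int(W * cut_rat), int(H * cut_rat)
    # uniformly sample the box centre, then clip the box to the image borders
    cx, cy = np.random.randint(W), np.random.randint(H)
    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bby1, bbx2, bby2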
Example #20
0
def main():
    best_acc = 0

    opt = parse_option()

    # tensorboard logger
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # dataloader
    train_partition = 'trainval' if opt.use_trainval else 'train'
    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        if opt.distill in ['contrast']:
            train_set = ImageNet(args=opt,
                                 partition=train_partition,
                                 transform=train_trans,
                                 is_sample=True,
                                 k=opt.nce_k)
        else:
            train_set = ImageNet(args=opt,
                                 partition=train_partition,
                                 transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt,
                                         partition='val',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        if opt.distill in ['contrast']:
            train_set = TieredImageNet(args=opt,
                                       partition=train_partition,
                                       transform=train_trans,
                                       is_sample=True,
                                       k=opt.nce_k)
        else:
            train_set = TieredImageNet(args=opt,
                                       partition=train_partition,
                                       transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt,
                                               partition='train_phase_val',
                                               transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='test',
            train_transform=train_trans,
            test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='val',
            train_transform=train_trans,
            test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        train_trans, test_trans = transforms_options['D']
        if opt.distill in ['contrast']:
            train_set = CIFAR100(args=opt,
                                 partition=train_partition,
                                 transform=train_trans,
                                 is_sample=True,
                                 k=opt.nce_k)
        else:
            train_set = CIFAR100(args=opt,
                                 partition=train_partition,
                                 transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(CIFAR100(args=opt,
                                         partition='train',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    else:
        raise NotImplementedError(opt.dataset)

    # model
    model_t = load_teacher(opt.path_t, n_cls, opt.dataset)
    model_s = create_model(opt.model_s, n_cls, opt.dataset)

    data = torch.randn(2, 3, 84, 84)
    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)

    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'contrast':
        criterion_kd = NCELoss(opt, n_data)
        embed_s = Embed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = Embed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
    else:
        raise NotImplementedError(opt.distill)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss

    # optimizer
    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # append teacher after optimizer to avoid weight_decay
    module_list.append(model_t)

    if torch.cuda.is_available():
        module_list.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True

    # validate teacher accuracy
    teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)
    print('teacher accuracy: ', teacher_acc)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
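        # e.g. with learning_rate=0.05 and lr_decay_rate=0.1 (illustrative values,
        # not given in this example), eta_min = 0.05 * 0.1**3 = 5e-5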
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    # routine: supervised model distillation
    for epoch in range(1, opt.epochs + 1):

        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, module_list,
                                      criterion_list, optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s,
                                                      criterion_cls, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # save the last model
    state = {
        'opt': opt,
        'model': model_s.state_dict(),
    }
    save_file = os.path.join(opt.save_folder,
                             '{}_last.pth'.format(opt.model_s))
    torch.save(state, save_file)
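# Note: DistillKL used above as the KD criterion is not defined in this example.
# A minimal sketch, assuming the usual Hinton-style temperature-scaled KL loss:
import torch.nn as nn
import torch.nn.functional as F

class DistillKL(nn.Module):
    """KL divergence between temperature-softened student and teacher logits."""
    def __init__(self, T):
        super(DistillKL, self).__init__()
        self.T = T

    def forward(self, y_s, y_t):
        p_s = F.log_softmax(y_s / self.T, dim=1)
        p_t = F.softmax(y_t / self.T, dim=1)
        # multiply by T^2 so gradient magnitudes stay comparable to the hard CE loss
        return F.kl_div(p_s, p_t, reduction='batchmean') * (self.T ** 2)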
Example #21
0
def main():

    args = parse_option()
    os.makedirs(args.checkpoint_path, exist_ok=True)

    if not args.debug:
        os.environ['PYTHONBREAKPOINT'] = '0'
        logger = get_logger(logpath=os.path.join(args.checkpoint_path, 'logs'),
                            filepath=os.path.abspath(__file__))

        def print_pass(*args):
            logger.info(*args)

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    print(args)

    train_loader = get_train_loader(args)

    isd = ISD(args.arch, K=args.queue_size, m=args.momentum, T=args.temp)
    isd.data_parallel()
    isd = isd.cuda()

    print(isd)

    criterion = KLD().cuda()

    params = [p for p in isd.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.learning_rate,
                                momentum=args.sgd_momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True
    args.start_epoch = 1

    if args.resume:
        print('==> resume from checkpoint: {}'.format(args.resume))
        ckpt = torch.load(args.resume)
        print('==> resume from epoch: {}'.format(ckpt['epoch']))
        isd.load_state_dict(ckpt['state_dict'], strict=True)
        optimizer.load_state_dict(ckpt['optimizer'])
        args.start_epoch = ckpt['epoch'] + 1

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        loss = train_student(epoch, train_loader, isd, criterion, optimizer,
                             args)

        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # saving the model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'state_dict': isd.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }

            save_file = os.path.join(
                args.checkpoint_path,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

            # help release GPU memory
            del state
            torch.cuda.empty_cache()
Example #22
0
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1  # stliu: best accuracy
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    # stliu: add resnet_ttt
    if args.arch == 'resnet_ttt':
        model = moco.builder.MoCo(ResNetCifar,
                                  args.moco_dim,
                                  args.moco_k,
                                  args.moco_m,
                                  args.moco_t,
                                  args.mlp,
                                  width=args.width,
                                  norm=args.norm)
        _, ext, head, ssh = build_model(
            args, model.encoder_q
        )  # stliu: ext, head and ssh share the same parameters as encoder_q
        # stliu: SVM with model_val on single GPU
        norm_layer = get_norm(args.norm)
        model_val = ResNetCifar(num_classes=args.moco_dim,
                                width=args.width,
                                norm_layer=norm_layer)
    else:
        model = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                                  args.moco_k, args.moco_m, args.moco_t,
                                  args.mlp)
    # print(model) # stliu: comment this

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model_val.cuda(args.gpu)  # stliu: for SVM
            ssh = ssh.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            # stliu: add broadcast_buffers=False to use normal BN
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[args.gpu],
                broadcast_buffers=False,
                find_unused_parameters=True)
            ssh = torch.nn.parallel.DistributedDataParallel(
                ssh,
                device_ids=[args.gpu],
                broadcast_buffers=False,
                find_unused_parameters=True)
            # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            # ssh = torch.nn.parallel.DistributedDataParallel(ssh, device_ids=[args.gpu])
        else:
            model.cuda()
            model_val.cuda()  # stliu: for SVM
            ssh = ssh.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(
                model, broadcast_buffers=False, find_unused_parameters=True)
            ssh = torch.nn.parallel.DistributedDataParallel(
                ssh, broadcast_buffers=False, find_unused_parameters=True)
            # model = torch.nn.parallel.DistributedDataParallel(model)
            # ssh = torch.nn.parallel.DistributedDataParallel(ssh)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        model_val = model_val.cuda(args.gpu)  # stliu: for SVM
        ssh = ssh.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    parameters = list(model.parameters()) + list(head.parameters())
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            head.load_state_dict(checkpoint['head'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # stliu: factored the loader construction out into a function
    train_loader, train_sampler, memory_loader, test_loader, teset = get_loader(
        args)

    if args.val:
        state_dict = model.state_dict()
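        # strip the 'module.encoder_q.' prefix so the query-encoder backbone weights
        # can be loaded into the plain single-GPU model used for the SVM evaluation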
        for k in list(state_dict.keys()):
            if k.startswith('module.encoder_q'
                            ) and not k.startswith('module.encoder_q.fc'):
                state_dict[k[len("module.encoder_q."):]] = state_dict[k]
            del state_dict[k]
        model_val.load_state_dict(state_dict, strict=False)
        flag_liblinear = '-s 2 -q -n ' + str(args.workers)
        if args.ttt:
            test_acc_svm = ttt_test(memory_loader, model, model_val,
                                    test_loader, flag_liblinear, args, ssh,
                                    teset, head)
        else:
            test_acc_svm = test(memory_loader, model, model_val, test_loader,
                                flag_liblinear, args, ssh)
        print('#### result ####\n' + args.val + ':', test_acc_svm,
              '\n################')
    else:
        # stliu: tensorboard
        logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

        for epoch in range(args.start_epoch,
                           args.epochs + 1):  # stliu: to save the last one
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch, args)

            # train for one epoch
            loss = train(train_loader, model, criterion, optimizer, epoch,
                         args, ssh)

            # stliu: tensorboard logger
            logger.log_value('loss', loss, epoch)

            if not args.multiprocessing_distributed or (
                    args.multiprocessing_distributed
                    and args.rank % ngpus_per_node == 0):
                if (epoch % args.save_freq == 0 and epoch != 0
                    ) or epoch == args.epochs:  # stliu: ignore the first model
                    print('==> Saving...')
                    save_checkpoint(
                        {
                            'epoch': epoch + 1,
                            'arch': args.arch,
                            'state_dict': model.state_dict(),
                            'head': head.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        },
                        is_best=False,
                        filename=args.model_folder +
                        '/checkpoint_{:04d}.pth.tar'.format(epoch))
            # stliu: test with SVM
            if (epoch + 1) % args.svm_freq == 0:
                state_dict = model.state_dict()
                for k in list(state_dict.keys()):
                    if k.startswith('module.encoder_q') and not k.startswith(
                            'module.encoder_q.fc'):
                        state_dict[
                            k[len("module.encoder_q."):]] = state_dict[k]
                    del state_dict[k]
                model_val.load_state_dict(state_dict, strict=False)
                flag_liblinear = '-s 2 -q -n ' + str(args.workers)
                test_acc_svm = test(memory_loader, model, model_val,
                                    test_loader, flag_liblinear, args, ssh)

                # stliu: save the best model
                is_best = test_acc_svm > best_acc1
                best_acc1 = max(test_acc_svm, best_acc1)
                if is_best:
                    print('==> Saving the Best...')
                    save_checkpoint(
                        {
                            'epoch': epoch + 1,
                            'arch': args.arch,
                            'state_dict': model.state_dict(),
                            'head': head.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        },
                        is_best=True,
                        filename=args.model_folder + '/best.pth.tar')
        print('The Best SVM Accuracy:', best_acc1)
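# Note: save_checkpoint() used above is not shown here. A minimal sketch in the
# style of the PyTorch ImageNet example; the project's actual helper may differ.
import shutil
import torch

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        # keep an extra copy of the best-performing checkpoint
        shutil.copyfile(filename, 'model_best.pth.tar')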
Example #23
0
def main():
    # Make directories if they don't already exist
    util.make_directories()
    # Load model options
    model_options = constants.MAIN_MODEL_OPTIONS

    ########## DATA ##########
    if constants.PRINT_MODEL_STATUS: print("Loading data")

    dataset_map = util.load_dataset_map()
    train_captions, val_captions, test_captions = util.load_text_vec(
        'Data', constants.VEC_OUTPUT_FILE_NAME, dataset_map)
    train_image_dict, val_image_dict, test_image_dict = util.get_images(
        'Data', constants.DIRECTORY_PATH, constants.FLOWERS_DICTS_PATH)

    ########## MODEL ##########
    generator = CondBeganGenerator(model_options)
    discriminator = CondBeganDiscriminator(model_options)

    # Put G and D on cuda if GPU available
    if torch.cuda.is_available():
        if constants.PRINT_MODEL_STATUS: print("CUDA is available")
        generator = generator.cuda()
        discriminator = discriminator.cuda()
        if constants.PRINT_MODEL_STATUS: print("Moved models to GPU")

    # Initialize weights
    generator.apply(util.weights_init)
    discriminator.apply(util.weights_init)

    ########## SAVED VARIABLES #########
    new_epoch = 0
    began_k = 0
    train_losses = {"generator": [], "discriminator": [], "converge": []}
    val_losses = {"generator": [], "discriminator": [], "converge": []}
    losses = {'train': train_losses, 'val': val_losses}

    ########## OPTIMIZER ##########
    g_optimizer = optim.Adam(generator.parameters(),
                             lr=constants.LR,
                             betas=constants.BETAS)
    # Changes the optimizer to SGD if declared in constants
    if constants.D_OPTIMIZER_SGD:
        d_optimizer = optim.SGD(discriminator.parameters(), lr=constants.LR)
    else:
        d_optimizer = optim.Adam(discriminator.parameters(),
                                 lr=constants.LR,
                                 betas=constants.BETAS)
    if constants.PRINT_MODEL_STATUS: print("Added optimizers")

    ########## RESUME OPTION ##########
    if args.resume:
        print("Resuming from epoch " + args.resume)
        checkpoint = torch.load(constants.SAVE_PATH + 'weights/epoch' +
                                str(args.resume))
        new_epoch = checkpoint['epoch'] + 1
        generator.load_state_dict(checkpoint['g_dict'])
        discriminator.load_state_dict(checkpoint['d_dict'])
        began_k = checkpoint['began_k']
        g_optimizer.load_state_dict(checkpoint['g_optimizer'])
        d_optimizer.load_state_dict(checkpoint['d_optimizer'])
        losses = torch.load(constants.SAVE_PATH + 'losses')

    ########## VARIABLES ##########
    noise_vec = torch.FloatTensor(constants.BATCH_SIZE, model_options['z_dim'])
    text_vec = torch.FloatTensor(constants.BATCH_SIZE,
                                 model_options['caption_vec_len'])
    real_img = torch.FloatTensor(constants.BATCH_SIZE,
                                 model_options['image_channels'],
                                 constants.IMAGE_SIZE, constants.IMAGE_SIZE)
    real_caption = torch.FloatTensor(constants.BATCH_SIZE,
                                     model_options['caption_vec_len'])
    if constants.USE_CLS:
        wrong_img = torch.FloatTensor(constants.BATCH_SIZE,
                                      model_options['image_channels'],
                                      constants.IMAGE_SIZE,
                                      constants.IMAGE_SIZE)
        wrong_caption = torch.FloatTensor(constants.BATCH_SIZE,
                                          model_options['caption_vec_len'])

    # Add cuda GPU option
    if torch.cuda.is_available():
        noise_vec = noise_vec.cuda()
        text_vec = text_vec.cuda()
        real_img = real_img.cuda()
        real_caption = real_caption.cuda()
        if constants.USE_CLS: wrong_img = wrong_img.cuda()

    ########## Training ##########
    num_iterations = 0
    for epoch in range(new_epoch, constants.NUM_EPOCHS):
        print("Epoch %d" % (epoch))
        st = time.time()

        for i, batch_iter in enumerate(
                util.grouper(train_captions.keys(), constants.BATCH_SIZE)):
            batch_keys = [x for x in batch_iter if x is not None]
            curr_batch_size = len(batch_keys)

            discriminator.train()
            generator.train()
            discriminator.zero_grad()  # Zero out gradient
            # Re-enable discriminator gradients; they are frozen further below
            # while the generator is updated, which saves computation there
            for p in discriminator.parameters():
                p.requires_grad = True

            ########## BATCH DATA #########
            noise_batch = torch.randn(curr_batch_size, model_options['z_dim'])
            text_vec_batch = torch.Tensor(
                util.get_text_description(train_captions, batch_keys))
            real_caption_batch = torch.Tensor(
                util.get_text_description(train_captions, batch_keys))
            real_img_batch = torch.Tensor(
                util.choose_real_image(train_image_dict, batch_keys))
            if constants.USE_CLS:
                wrong_img_batch = torch.Tensor(
                    util.choose_wrong_image(train_image_dict, batch_keys))
            if torch.cuda.is_available():
                noise_batch = noise_batch.cuda()
                text_vec_batch = text_vec_batch.cuda()
                real_caption_batch = real_caption_batch.cuda()
                real_img_batch = real_img_batch.cuda()
                if constants.USE_CLS: wrong_img_batch = wrong_img_batch.cuda()

            # Fill in tensors with batch data
            noise_vec.resize_as_(noise_batch).copy_(noise_batch)
            text_vec.resize_as_(text_vec_batch).copy_(text_vec_batch)
            real_caption.resize_as_(text_vec_batch).copy_(text_vec_batch)
            real_img.resize_as_(real_img_batch).copy_(real_img_batch)
            if constants.USE_CLS:
                wrong_img.resize_as_(wrong_img_batch).copy_(wrong_img_batch)

            ########## RUN THROUGH GAN ##########
            gen_image = generator.forward(Variable(text_vec),
                                          Variable(noise_vec))

            real_img_passed = discriminator.forward(Variable(real_img),
                                                    Variable(real_caption))
            fake_img_passed = discriminator.forward(gen_image.detach(),
                                                    Variable(real_caption))
            if constants.USE_CLS:
                wrong_img_passed = discriminator.forward(
                    Variable(wrong_img), Variable(real_caption))

            ########## TRAIN DISCRIMINATOR ##########
            if constants.USE_REAL_LS:
                # Real loss sensitivity
                # L_D = L(y_r) - k * (L(y_f) + L(y_f, r))
                # L_G = L(y_f) +  L(y_f, r)
                # k = k + lambda_k * (gamma * L(y_r) + L(y_f) +  L(y_f, r))
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_fake_loss = torch.mean(torch.abs(fake_img_passed -
                                                   gen_image))
                d_real_sensitivity_loss = torch.mean(
                    torch.abs(fake_img_passed - Variable(real_img)))
                d_loss = d_real_loss - began_k * (
                    0.5 * d_fake_loss + 0.5 * d_real_sensitivity_loss)

                # Update began k value
                balance = (model_options['began_gamma'] * d_real_loss -
                           0.5 * d_fake_loss -
                           0.5 * d_real_sensitivity_loss).data[0]
                began_k = min(
                    max(began_k + model_options['began_lambda_k'] * balance,
                        0), 1)
            elif constants.USE_CLS:
                # Cond BEGAN Discriminator Loss with CLS
                # L(y_w) is the caption loss sensitivity CLS (makes sure that captions match the image)
                # L_D = L(y_r) + L(y_f, w) - k * L(y_f)
                # L_G = L(y_f)
                # k = k + lambda_k * (gamma * (L(y_r) + L(y_f, w)) - L(y_f))
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_wrong_loss = torch.mean(
                    torch.abs(fake_img_passed - Variable(wrong_img)))
                d_fake_loss = torch.mean(torch.abs(fake_img_passed -
                                                   gen_image))
                d_loss = 0.5 * d_real_loss + 0.5 * d_wrong_loss - began_k * d_fake_loss

                # Update began k value
                balance = (model_options['began_gamma'] *
                           (0.5 * d_real_loss + 0.5 * d_wrong_loss) -
                           d_fake_loss).data[0]
                began_k = min(
                    max(began_k + model_options['began_lambda_k'] * balance,
                        0), 1)
            # No CLS option
            else:
                # Cond BEGAN Discriminator Loss
                # L_D = L(y_r) - k * L(y_f)
                # k = k + lambda_k * (gamma * L(y_r) + L(y_f))
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_fake_loss = torch.mean(torch.abs(fake_img_passed -
                                                   gen_image))
                d_loss = d_real_loss - began_k * d_fake_loss

                # Update began k value
                balance = (model_options['began_gamma'] * d_real_loss -
                           d_fake_loss).data[0]
                began_k = min(
                    max(began_k + model_options['began_lambda_k'] * balance,
                        0), 1)

            d_loss.backward()
            d_optimizer.step()

            ########## TRAIN GENERATOR ##########
            generator.zero_grad()
            for p in discriminator.parameters():
                p.requires_grad = False

            # Generate image again if you want to
            if constants.REGEN_IMAGE:
                noise_batch = torch.randn(curr_batch_size,
                                          model_options['z_dim'])
                if torch.cuda.is_available():
                    noise_batch = noise_batch.cuda()
                noise_vec.resize_as_(noise_batch).copy_(noise_batch)
                gen_image = generator.forward(Variable(text_vec),
                                              Variable(noise_vec))

            new_fake_img_passed = discriminator.forward(
                gen_image, Variable(real_caption))

            # Generator Loss
            # L_G = L(y_f)
            g_loss = torch.mean(torch.abs(new_fake_img_passed - gen_image))
            if constants.USE_REAL_LS:
                g_loss += torch.mean(
                    torch.abs(new_fake_img_passed - Variable(real_img)))
            elif constants.USE_CLS:
                g_loss -= torch.mean(
                    torch.abs(new_fake_img_passed - Variable(wrong_img)))

            g_loss.backward()
            g_optimizer.step()

            # M = L(y_r) + |gamma * L(y_r) - L(y_f)|
            convergence_val = d_real_loss + abs(balance)

            # learning rate decay
            g_optimizer = util.adjust_learning_rate(g_optimizer,
                                                    num_iterations)
            d_optimizer = util.adjust_learning_rate(d_optimizer,
                                                    num_iterations)

            if i % constants.LOSS_SAVE_IDX == 0:
                losses['train']['generator'].append((g_loss.data[0], epoch, i))
                losses['train']['discriminator'].append(
                    (d_loss.data[0], epoch, i))
                losses['train']['converge'].append(
                    (convergence_val.data[0], epoch, i))

            num_iterations += 1

        print('Total number of iterations: ', num_iterations)
        print('Training G Loss: ', g_loss.data[0])
        print('Training D Loss: ', d_loss.data[0])
        print('Training Convergence: ', convergence_val.data[0])
        print('K value: ', began_k)
        epoch_time = time.time() - st
        print("Time: ", epoch_time)

        if epoch == constants.REPORT_EPOCH:
            with open(constants.SAVE_PATH + 'report.txt', 'w') as f:
                f.write(constants.EXP_REPORT)
                f.write("Time per epoch: " + str(epoch_time))
            print("Saved report")

        ########## DEV SET #########
        # Calculate dev set loss
        # Volatile is true because we are running in inference mode (no need to calculate gradients)
        generator.eval()
        discriminator.eval()
        for i, batch_iter in enumerate(
                util.grouper(val_captions.keys(), constants.BATCH_SIZE)):
            batch_keys = [x for x in batch_iter if x is not None]
            curr_batch_size = len(batch_keys)

            # Gather batch data
            noise_batch = torch.randn(curr_batch_size, model_options['z_dim'])
            text_vec_batch = torch.Tensor(
                util.get_text_description(val_captions, batch_keys))
            real_caption_batch = torch.Tensor(
                util.get_text_description(val_captions, batch_keys))
            real_img_batch = torch.Tensor(
                util.choose_real_image(val_image_dict, batch_keys))
            if constants.USE_CLS:
                wrong_img_batch = torch.Tensor(
                    util.choose_wrong_image(val_image_dict, batch_keys))
            if torch.cuda.is_available():
                noise_batch = noise_batch.cuda()
                text_vec_batch = text_vec_batch.cuda()
                real_caption_batch = real_caption_batch.cuda()
                real_img_batch = real_img_batch.cuda()
                if constants.USE_CLS:
                    wrong_img_batch = wrong_img_batch.cuda()

            # Fill in tensors with batch data
            noise_vec.resize_as_(noise_batch).copy_(noise_batch)
            text_vec.resize_as_(text_vec_batch).copy_(text_vec_batch)
            real_caption.resize_as_(text_vec_batch).copy_(text_vec_batch)
            real_img.resize_as_(real_img_batch).copy_(real_img_batch)
            if constants.USE_CLS:
                wrong_img.resize_as_(wrong_img_batch).copy_(wrong_img_batch)

            # Run through generator
            gen_image = generator.forward(Variable(
                text_vec, volatile=True), Variable(
                    noise_vec,
                    volatile=True))  # Returns tensor variable holding image

            # Run through discriminator
            real_img_passed = discriminator.forward(
                Variable(real_img, volatile=True),
                Variable(real_caption, volatile=True))
            fake_img_passed = discriminator.forward(
                gen_image.detach(), Variable(real_caption, volatile=True))
            if constants.USE_CLS:
                wrong_img_passed = discriminator.forward(
                    Variable(wrong_img, volatile=True),
                    Variable(real_caption, volatile=True))

            # Calculate D loss
            # D LOSS
            if constants.USE_REAL_LS:
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_fake_loss = torch.mean(torch.abs(fake_img_passed -
                                                   gen_image))
                d_real_sensitivity_loss = torch.mean(
                    torch.abs(fake_img_passed - Variable(real_img)))
                d_loss = d_real_loss - began_k * (
                    0.5 * d_fake_loss + 0.5 * d_real_sensitivity_loss)

                balance = (model_options['began_gamma'] * d_real_loss -
                           0.5 * d_fake_loss -
                           0.5 * d_real_sensitivity_loss).data[0]
            elif constants.USE_CLS:
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_wrong_loss = torch.mean(
                    torch.abs(fake_img_passed - Variable(wrong_img)))
                d_fake_loss = torch.mean(torch.abs(fake_img_passed -
                                                   gen_image))
                d_loss = 0.5 * d_real_loss + 0.5 * d_wrong_loss - began_k * d_fake_loss

                balance = (model_options['began_gamma'] *
                           (0.5 * d_real_loss + 0.5 * d_wrong_loss) -
                           d_fake_loss).data[0]
            # No CLS option
            else:
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_fake_loss = torch.mean(torch.abs(fake_img_passed -
                                                   gen_image))
                d_loss = d_real_loss - began_k * d_fake_loss

                # Update began k value
                balance = (model_options['began_gamma'] * d_real_loss -
                           d_fake_loss).data[0]

            # Calculate G loss
            if constants.USE_REAL_LS:
                g_loss = 0.5 * torch.mean(
                    torch.abs(fake_img_passed - gen_image))
                g_loss += 0.5 * torch.mean(
                    torch.abs(fake_img_passed - Variable(real_img)))
            elif constants.USE_CLS:
                g_loss = torch.mean(torch.abs(fake_img_passed - gen_image))
                g_loss -= 0.5 * torch.mean(
                    torch.abs(fake_img_passed - Variable(wrong_img)))
            else:
                # L_G = L(y_f)
                g_loss = torch.mean(torch.abs(fake_img_passed - gen_image))

            # M = L(y_r) + |gamma * L(y_r) - L(y_f)|
            convergence_val = d_real_loss + abs(balance)

            if i % constants.LOSS_SAVE_IDX == 0:
                losses['val']['generator'].append((g_loss.data[0], epoch, i))
                losses['val']['discriminator'].append(
                    (d_loss.data[0], epoch, i))
                losses['val']['converge'].append(
                    (convergence_val.data[0], epoch, i))

        print('Val G Loss: ', g_loss.data[0])
        print('Val D Loss: ', d_loss.data[0])
        print('Val Convergence: ', convergence_val.data[0])

        # Save losses
        torch.save(losses, constants.SAVE_PATH + 'losses')

        # Save images
        vutils.save_image(gen_image[0].data.cpu(),
                          constants.SAVE_PATH + 'images/gen0_epoch' +
                          str(epoch) + '.png',
                          normalize=True)
        vutils.save_image(gen_image[1].data.cpu(),
                          constants.SAVE_PATH + 'images/gen1_epoch' +
                          str(epoch) + '.png',
                          normalize=True)
        vutils.save_image(fake_img_passed[0].data.cpu(),
                          constants.SAVE_PATH + 'images/gen_recon0_epoch' +
                          str(epoch) + '.png',
                          normalize=True)
        vutils.save_image(fake_img_passed[1].data.cpu(),
                          constants.SAVE_PATH + 'images/gen_recon1_epoch' +
                          str(epoch) + '.png',
                          normalize=True)
        # vutils.save_image(real_img_passed[0].data.cpu(),
        #             constants.SAVE_PATH + 'images/real_recon0_epoch' + str(epoch) + '.png',
        #             normalize=True)
        # vutils.save_image(real_img_passed[1].data.cpu(),
        #             constants.SAVE_PATH + 'images/real_recon1_epoch' + str(epoch) + '.png',
        #             normalize=True)

        # Save model
        if epoch % constants.CHECKPOINT_FREQUENCY == 0 and epoch != 0 or epoch == constants.NUM_EPOCHS - 1:
            save_checkpoint = {
                'epoch': epoch,
                'g_dict': generator.state_dict(),
                'd_dict': discriminator.state_dict(),
                'g_optimizer': g_optimizer.state_dict(),
                'd_optimizer': d_optimizer.state_dict(),
                'began_k': began_k
            }
            torch.save(save_checkpoint,
                       constants.SAVE_PATH + 'weights/epoch' + str(epoch))
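# Note: util.weights_init and util.grouper used above are not shown here. Minimal
# sketches under common assumptions (DCGAN-style initialisation and the itertools
# "grouper" recipe); the project's util module may implement them differently.
import itertools

def weights_init(m):
    """Initialise Conv and BatchNorm layers with the usual DCGAN statistics."""
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks, padding the last chunk with fillvalue."""
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)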
Example #24
0
def main():

    opt = parse_option()

    # dataloader
    train_partition = 'trainval' if opt.use_trainval else 'train'
    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(ImageNet(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt,
                                         partition='val',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(TieredImageNet(args=opt,
                                                 partition=train_partition,
                                                 transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt,
                                               partition='train_phase_val',
                                               transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='test',
            train_transform=train_trans,
            test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='val',
            train_transform=train_trans,
            test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        train_trans, test_trans = transforms_options['D']

        train_loader = DataLoader(CIFAR100(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(CIFAR100(args=opt,
                                         partition='train',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    else:
        raise NotImplementedError(opt.dataset)

    # model
    if not opt.load_latest:
        model = create_model(opt.model, n_cls, opt.dataset)
    else:
        latest_file = os.path.join(opt.save_folder, 'latest.pth')
        model = load_teacher(latest_file, n_cls, opt.dataset)

    # optimizer
    if opt.adam:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=opt.learning_rate,
                                     weight_decay=0.0005)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=opt.learning_rate,
                              momentum=opt.momentum,
                              weight_decay=opt.weight_decay)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        if opt.n_gpu > 1:
            model = nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    # routine: supervised pre-training
    for epoch in range(1, opt.epochs + 1):

        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, model, criterion,
                                      optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model,
                                                      criterion, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': (model.state_dict() if opt.n_gpu <= 1
                          else model.module.state_dict()),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            latest_file = os.path.join(opt.save_folder, 'latest.pth')
            # os.symlink fails if the link already exists, so refresh it
            if os.path.lexists(latest_file):
                os.remove(latest_file)
            os.symlink(save_file, latest_file)

    # save the last model
    state = {
        'opt': opt,
        'model': (model.state_dict() if opt.n_gpu <= 1
                  else model.module.state_dict()),
    }
    save_file = os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model))
    torch.save(state, save_file)
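Example #24 (and #25 below) calls adjust_learning_rate(epoch, opt, optimizer) without showing its body. A minimal step-decay sketch follows, assuming opt carries learning_rate, lr_decay_epochs (a list of milestone epochs) and lr_decay_rate; these attribute names are inferred, not taken from the example.
import numpy as np

def adjust_learning_rate(epoch, opt, optimizer):
    # Count how many decay milestones the current epoch has passed and
    # scale the base learning rate by lr_decay_rate once per milestone.
    steps = np.sum(epoch > np.asarray(opt.lr_decay_epochs))
    if steps > 0:
        new_lr = opt.learning_rate * (opt.lr_decay_rate ** steps)
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_lr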
Example #25
0
def main():
    opt = parse_option()

    with open(f"{opt.tb_folder}/config.json", "w") as fo:
        fo.write(json.dumps(vars(opt), indent=4))

    # dataloader
    train_partition = 'trainval' if opt.use_trainval else 'train'
    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(ImageNet(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt,
                                         partition='val',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(TieredImageNet(args=opt,
                                                 partition=train_partition,
                                                 transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt,
                                               partition='train_phase_val',
                                               transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='test',
            train_transform=train_trans,
            test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='val',
            train_transform=train_trans,
            test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        train_trans, test_trans = transforms_options['D']

        train_loader = DataLoader(CIFAR100(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(CIFAR100(args=opt,
                                         partition='train',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    elif opt.dataset == "imagenet":
        train_trans, test_trans = transforms_options["A"]
        train_dataset = ImagenetFolder(root=os.path.join(
            opt.data_root, "train"),
                                       transform=train_trans)
        val_dataset = ImagenetFolder(root=os.path.join(opt.data_root, "val"),
                                     transform=test_trans)
        train_loader = DataLoader(train_dataset,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(val_dataset,
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        n_cls = 1000
    else:
        raise NotImplementedError(opt.dataset)

    # model
    model = create_model(opt.model, n_cls, opt.dataset, use_srl=opt.srl)

    # optimizer
    if opt.adam:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=opt.learning_rate,
                                     weight_decay=0.0005)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=opt.learning_rate,
                              momentum=opt.momentum,
                              weight_decay=opt.weight_decay)

    if opt.label_smoothing:
        criterion = LabelSmoothing(smoothing=opt.smoothing_ratio)
    elif opt.gce:
        criterion = GuidedComplementEntropy(alpha=opt.gce_alpha, classes=n_cls)
    else:
        criterion = nn.CrossEntropyLoss()
    if opt.opl:
        auxiliary_loss = OrthogonalProjectionLoss(use_attention=True)
    elif opt.popl:
        auxiliary_loss = PerpetualOrthogonalProjectionLoss(feat_dim=640)
    else:
        auxiliary_loss = None

    if torch.cuda.is_available():
        if opt.n_gpu > 1:
            model = nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        if auxiliary_loss is not None:
            auxiliary_loss = auxiliary_loss.cuda()
        cudnn.benchmark = True

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)
    else:
        scheduler = None

    # routine: supervised pre-training
    for epoch in range(1, opt.epochs + 1):

        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        if auxiliary_loss is not None:
            train_acc, train_loss, [train_cel, train_opl
                                    ] = train(epoch=epoch,
                                              train_loader=train_loader,
                                              model=model,
                                              criterion=criterion,
                                              optimizer=optimizer,
                                              opt=opt,
                                              auxiliary=auxiliary_loss)
        else:
            train_acc, train_loss = train(epoch=epoch,
                                          train_loader=train_loader,
                                          model=model,
                                          criterion=criterion,
                                          optimizer=optimizer,
                                          opt=opt)

        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('accuracy/train_acc', train_acc, epoch)
        logger.log_value('train_losses/loss', train_loss, epoch)
        if auxiliary_loss is not None:
            logger.log_value('train_losses/cel', train_cel, epoch)
            logger.log_value('train_losses/opl', train_opl, epoch)
        else:
            logger.log_value('train_losses/cel', train_loss, epoch)

        if auxiliary_loss is not None:
            test_acc, test_acc_top5, test_loss, [test_cel, test_opl] = \
                validate(val_loader, model, criterion, opt, auxiliary=auxiliary_loss)
        else:
            test_acc, test_acc_top5, test_loss = validate(
                val_loader, model, criterion, opt)

        logger.log_value('accuracy/test_acc', test_acc, epoch)
        logger.log_value('accuracy/test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_losses/loss', test_loss, epoch)
        if auxiliary_loss is not None:
            logger.log_value('test_losses/cel', test_cel, epoch)
            logger.log_value('test_losses/opl', test_opl, epoch)
        else:
            logger.log_value('test_losses/cel', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': (model.state_dict() if opt.n_gpu <= 1
                          else model.module.state_dict()),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # save the last model
    state = {
        'opt': opt,
        'model': (model.state_dict() if opt.n_gpu <= 1
                  else model.module.state_dict()),
    }
    save_file = os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model))
    torch.save(state, save_file)
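Example #25 optionally swaps the criterion for LabelSmoothing(smoothing=opt.smoothing_ratio). A minimal label-smoothing cross-entropy sketch is given below; the module actually used in the example may differ in detail.
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothing(nn.Module):
    # Cross-entropy where the one-hot target is blended with a uniform
    # distribution over classes, controlled by `smoothing`.
    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, logits, target):
        log_probs = F.log_softmax(logits, dim=-1)
        nll = -log_probs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        uniform = -log_probs.mean(dim=-1)
        loss = (1.0 - self.smoothing) * nll + self.smoothing * uniform
        return loss.mean()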
Example #26
0
def train_net(model, args):

    ann_path = '../FashionAI/data/train/Annotations/trainminusval.csv'
    img_dir = '../FashionAI/data/train/'

    stride = 8
    cudnn.benchmark = True
    config = util.Config('./config.yml')
    train_loader = torch.utils.data.DataLoader(dataset_loader.dataset_loader(
        img_dir,
        ann_path,
        stride,
        Mytransforms.Compose([
            Mytransforms.RandomResized(),
            Mytransforms.RandomRotate(40),
            Mytransforms.RandomCrop(384),
        ]),
        sigma=15),
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)

    criterion = nn.MSELoss().cuda()
    params = []
    for key, value in model.named_parameters():
        if value.requires_grad:
            params.append({'params': value, 'lr': config.base_lr})

    optimizer = torch.optim.SGD(params,
                                config.base_lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)
    # model.train() # only for bn and dropout
    model.eval()

    from matplotlib import pyplot as plt

    iters = 0
    batch_time = util.AverageMeter()
    data_time = util.AverageMeter()
    losses = util.AverageMeter()
    losses_list = [util.AverageMeter() for i in range(12)]
    end = time.time()

    heat_weight = 48 * 48 * 25 / 2.0  # for convenience when comparing with the original code
    # heat_weight = 1

    while iters < config.max_iter:
        for i, (input, heatmap) in enumerate(train_loader):
            learning_rate = util.adjust_learning_rate(optimizer, iters, config.base_lr, policy=config.lr_policy,\
                 policy_parameter=config.policy_parameter)
            data_time.update(time.time() - end)

            input = input.cuda(non_blocking=True)
            heatmap = heatmap.cuda(non_blocking=True)
            input_var = torch.autograd.Variable(input)
            heatmap_var = torch.autograd.Variable(heatmap)

            heat = model(input_var)

            # feat = C4.cpu().data.numpy()
            # for n in range(100):
            # 	plt.subplot(10, 10, n + 1);
            # 	plt.imshow(feat[0, n, :, :], cmap='gray')
            # 	plt.xticks([]);
            # 	plt.yticks([])
            # plt.show()

            loss1 = criterion(heat, heatmap_var) * heat_weight
            # loss2 = criterion(heat4, heatmap_var) * heat_weight
            # loss3 = criterion(heat5, heatmap_var) * heat_weight
            # loss4 = criterion(heat6, heatmap_var) * heat_weight
            # loss5 = criterion(heat, heatmap_var)
            # loss6 = criterion(heat, heatmap_var)

            loss = loss1  # + loss2 + loss3# + loss4# + loss5 + loss6
            losses.update(loss.item(), input.size(0))
            loss_list = [loss1]  #, loss2, loss3]# , loss4 ]# , loss5 , loss6]
            for cnt, l in enumerate(loss_list):
                losses_list[cnt].update(l.item(), input.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            batch_time.update(time.time() - end)
            end = time.time()

            iters += 1
            if iters % config.display == 0:
                print(
                    'Train Iteration: {0}\t'
                    'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                    'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                    'Learning rate = {2}\n'
                    'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
                        iters,
                        config.display,
                        learning_rate,
                        batch_time=batch_time,
                        data_time=data_time,
                        loss=losses))
                for cnt in range(0, 1):
                    print(
                        'Loss{0}_1 = {loss1.val:.8f} (ave = {loss1.avg:.8f})'.
                        format(cnt + 1, loss1=losses_list[cnt]))
                print(
                    time.strftime(
                        '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                        time.localtime()))

                batch_time.reset()
                data_time.reset()
                losses.reset()
                for cnt in range(12):
                    losses_list[cnt].reset()

            if iters % 5000 == 0:
                torch.save({
                    'iter': iters,
                    'state_dict': model.state_dict(),
                },
                           str(iters) + '.pth.tar')

            if iters == config.max_iter:
                break
    return
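Examples #26 and #30 drive the learning rate per iteration through util.adjust_learning_rate(optimizer, iters, base_lr, policy=..., policy_parameter=..., multiple=...). A simplified sketch of such a helper is shown below; the policy names and parameter keys are assumptions, and the real util module may support more policies.
def adjust_learning_rate(optimizer, iters, base_lr, policy='step',
                         policy_parameter=None, multiple=None):
    # Compute the learning rate for the current iteration under a simple
    # policy, then write it into every parameter group (optionally scaled
    # per group by `multiple`). Returns the base value that was applied.
    policy_parameter = policy_parameter or {}
    if policy == 'fixed':
        lr = base_lr
    elif policy == 'step':
        gamma = policy_parameter.get('gamma', 0.1)
        step_size = policy_parameter.get('step_size', 10000)
        lr = base_lr * (gamma ** (iters // step_size))
    else:
        raise NotImplementedError('lr policy not supported: {}'.format(policy))

    for i, param_group in enumerate(optimizer.param_groups):
        scale = multiple[i] if multiple is not None else 1.0
        param_group['lr'] = lr * scale
    return lr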
Example #27
0
        epoch_loss += c_loss

        optimizer.step()

        if ind % 100 == 0:
            print("iter [%d] CLoss: %.4f" % (ind, c_loss))

        if ind > args.max_iter:
            break

    print("Epoch [%d] Loss: %.4f" % (epoch + 1, epoch_loss))
    log_value('loss', epoch_loss, epoch)
    log_value('lr', args.lr, epoch)

    if args.adjust_lr:
        args.lr = adjust_learning_rate(optimizer, args.lr, args.weight_decay,
                                       epoch, args.epochs)

    if args.net == "fcn" or args.net == "psp":
        checkpoint_fn = os.path.join(
            args.pth_dir, "%s-%s-res%s-%s.pth.tar" %
            (args.savename, args.net, args.res, epoch + 1))
    else:
        checkpoint_fn = os.path.join(
            args.pth_dir,
            "%s-%s-%s.pth.tar" % (args.savename, args.net, epoch + 1))

    args.start_epoch = epoch + 1
    save_dic = {
        'args': args,
        'epoch': epoch + 1,
        'g1_state_dict': model_g1.state_dict(),
Example #28
0
def main():

    opt = parse_option()
    wandb.init(project=opt.model_path.split("/")[-1], tags=opt.tags)
    wandb.config.update(opt)
    wandb.save('*.py')
    wandb.run.save()

    train_loader, val_loader, meta_testloader, meta_valloader, n_cls, no_sample = get_dataloaders(
        opt)
    # model
    model = create_model(opt.model,
                         n_cls,
                         opt.dataset,
                         n_trans=opt.trans,
                         embd_sz=opt.memfeature_size)
    wandb.watch(model)

    # optimizer
    if opt.adam:
        print("Adam")
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=opt.learning_rate,
                                     weight_decay=0.0005)
    else:
        print("SGD")
        optimizer = optim.SGD(model.parameters(),
                              lr=opt.learning_rate,
                              momentum=opt.momentum,
                              weight_decay=opt.weight_decay)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        if opt.n_gpu > 1:
            model = nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    MemBank = np.random.randn(no_sample, opt.memfeature_size)
    MemBank = torch.tensor(MemBank, dtype=torch.float).cuda()
    MemBankNorm = torch.norm(MemBank, dim=1, keepdim=True)
    MemBank = MemBank / (MemBankNorm + 1e-6)

    # routine: supervised pre-training
    for epoch in range(1, opt.epochs + 1):
        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss, MemBank = train(epoch, train_loader, model,
                                               criterion, optimizer, opt,
                                               MemBank)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        val_acc, val_acc_top5, val_loss = 0, 0, 0  #validate(val_loader, model, criterion, opt)

        #validate
        start = time.time()
        meta_val_acc, meta_val_std = 0, 0  #meta_test(model, meta_valloader)
        test_time = time.time() - start
        print(
            'Meta Val Acc : {:.4f}, Meta Val std: {:.4f}, Time: {:.1f}'.format(
                meta_val_acc, meta_val_std, test_time))

        #evaluate
        start = time.time()
        meta_test_acc, meta_test_std = 0, 0  #meta_test(model, meta_testloader)
        test_time = time.time() - start
        print('Meta Test Acc: {:.4f}, Meta Test std: {:.4f}, Time: {:.1f}'.
              format(meta_test_acc, meta_test_std, test_time))

        # regular saving
        if epoch % opt.save_freq == 0 or epoch == opt.epochs:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
            }
            save_file = os.path.join(opt.save_folder,
                                     'model_' + str(wandb.run.name) + '.pth')
            torch.save(state, save_file)

            #wandb saving
            torch.save(state, os.path.join(wandb.run.dir, "model.pth"))

        wandb.log({
            'epoch': epoch,
            'Train Acc': train_acc,
            'Train Loss': train_loss,
            'Val Acc': val_acc,
            'Val Loss': val_loss,
            'Meta Test Acc': meta_test_acc,
            'Meta Test std': meta_test_std,
            'Meta Val Acc': meta_val_acc,
            'Meta Val std': meta_val_std
        })

    #final report
    print("GENERATING FINAL REPORT")
    generate_final_report(model, opt, wandb)

    #remove output.txt log file
    output_log_file = os.path.join(wandb.run.dir, "output.log")
    if os.path.isfile(output_log_file):
        os.remove(output_log_file)
    else:  ## Show an error ##
        print("Error: %s file not found" % output_log_file)
Example #29
0
def main():

    args = parse_option()
    os.makedirs(args.checkpoint_path, exist_ok=True)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    train_loader = get_train_loader(args)

    teacher = get_teacher_model(args)
    student = get_student_model(args)

    # Calculate feature dimension of student and teacher
    teacher.eval()
    student.eval()
    tmp_input = torch.randn(2, 3, 224, 224)
    feat_t = teacher.forward(tmp_input, 0)
    feat_s = student(tmp_input)
    student_feats_dim = feat_s.shape[-1]
    teacher_feats_dim = feat_t.shape[-1]

    compress = CompReSS(teacher_feats_dim, student_feats_dim,
                        args.compress_memory_size, args.compress_t)

    student = torch.nn.DataParallel(student).cuda()
    teacher.gpu()

    optimizer = torch.optim.SGD(student.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    args.start_epoch = 1
    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        loss = train_student(epoch, train_loader, teacher, student, compress,
                             optimizer, args)

        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # saving the model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': student.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }

            save_file = os.path.join(
                args.checkpoint_path,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

            # help release GPU memory
            del state
            torch.cuda.empty_cache()
Example #30
0
def train_net(model, args):
	ann_path = '../FashionAI/data/train/Annotations/trainminusval.csv'
	img_dir = '../FashionAI/data/train/'

	stride = 8
	cudnn.benchmark = True
	config = util.Config('./config.yml')

	train_loader = torch.utils.data.DataLoader(
		dataset_loader.dataset_loader(img_dir, ann_path, stride,
		                              transforms.ToTensor()),
		batch_size=config.batch_size, shuffle=True,
		num_workers=config.workers, pin_memory=True)

	criterion = nn.MSELoss().cuda()
	params, multiple = get_parameters(model, config, False)

	optimizer = torch.optim.SGD(params, config.base_lr, momentum=config.momentum,
	                            weight_decay=config.weight_decay)
	model.train()
	iters = 0
	batch_time = util.AverageMeter()
	data_time = util.AverageMeter()
	losses = util.AverageMeter()
	losses_list = [util.AverageMeter() for i in range(12)]
	end = time.time()

	heat_weight = 48 * 48 * 25 / 2.0  # for convenience when comparing with the original code
	# heat_weight = 1

	while iters < config.max_iter:
		for i, (input, heatmap) in enumerate(train_loader):
			learning_rate = util.adjust_learning_rate(optimizer, iters, config.base_lr, policy=config.lr_policy,\
								policy_parameter=config.policy_parameter, multiple=multiple)
			data_time.update(time.time() - end)

			input = input.cuda(non_blocking=True)
			heatmap = heatmap.cuda(non_blocking=True)
			input_var = torch.autograd.Variable(input)
			heatmap_var = torch.autograd.Variable(heatmap)

			heat1, heat2, heat3, heat4, heat5, heat6 = model(input_var)
			loss1 = criterion(heat1, heatmap_var) * heat_weight
			loss2 = criterion(heat2, heatmap_var) * heat_weight
			loss3 = criterion(heat3, heatmap_var) * heat_weight
			loss4 = criterion(heat4, heatmap_var) * heat_weight
			loss5 = criterion(heat5, heatmap_var) * heat_weight
			loss6 = criterion(heat6, heatmap_var) * heat_weight
			loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6
			losses.update(loss.item(), input.size(0))
			loss_list = [loss1, loss2, loss3, loss4, loss5, loss6]
			for cnt, l in enumerate(loss_list):
				losses_list[cnt].update(l.item(), input.size(0))

			optimizer.zero_grad()
			loss.backward()
			optimizer.step()
			batch_time.update(time.time() - end)
			end = time.time()


			iters += 1
			if iters % config.display == 0:
				print('Train Iteration: {0}\t'
				      'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
				      'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
				      'Learning rate = {2}\n'
				      'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
					iters, config.display, learning_rate, batch_time=batch_time,
					data_time=data_time, loss=losses))
				for cnt in range(0, 6):
					print('Loss{0}_1 = {loss1.val:.8f} (ave = {loss1.avg:.8f})'.format(cnt + 1,loss1=losses_list[cnt]))
				print(time.strftime(
					'%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
					time.localtime()))

				batch_time.reset()
				data_time.reset()
				losses.reset()
				for cnt in range(12):
					losses_list[cnt].reset()

			if iters % 5000 == 0:
				torch.save({
					'iter': iters,
					'state_dict': model.state_dict(),
				},  str(iters) + '.pth.tar')

			if iters == config.max_iter:
				break
	return
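Examples #26 and #30 both rely on util.AverageMeter for timing and loss statistics. A minimal sketch of such a meter is included here for reference; the real helper may differ.
class AverageMeter(object):
    # Keeps the latest value plus a running sum, count and average.
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count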