def train(args):
    Arguments.save_args(args, args.args_path)
    train_loader, val_loader, _ = get_dataloaders(args)
    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    optimizer = get_optimizer(args.optimizer, model)
    lr_scheduler = LRScheduler(args.lr_scheduler, optimizer)
    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)
    model_saver = ModelSaver(args.model_path)
    recorder = Recorder(['train_miou', 'train_acc', 'train_loss',
                         'val_miou', 'val_acc', 'val_loss'])

    for epoch in range(args.n_epochs):
        print(f"{args.experim_name} Epoch {epoch+1}:")
        train_loss, train_acc, train_miou, train_ious = train_epoch(
            model=model,
            dataloader=train_loader,
            n_classes=args.n_classes,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            criterion=criterion,
            device=args.device,
        )
        print(f"train | mIoU: {train_miou:.3f} | accuracy: {train_acc:.3f} | loss: {train_loss:.3f}")

        val_loss, val_scores = eval_epoch(
            model=model,
            dataloader=val_loader,
            n_classes=args.n_classes,
            criterion=criterion,
            device=args.device,
        )
        val_miou, val_ious, val_acc = val_scores['mIoU'], val_scores['IoUs'], val_scores['accuracy']
        print(f"valid | mIoU: {val_miou:.3f} | accuracy: {val_acc:.3f} | loss: {val_loss:.3f}")

        recorder.update([train_miou, train_acc, train_loss, val_miou, val_acc, val_loss])
        recorder.save(args.record_path)

        if args.metric.startswith("IoU"):
            metric = val_ious[int(args.metric.split('_')[1])]
        else:
            metric = val_miou
        model_saver.save_models(metric, epoch + 1, model,
                                ious={'train': train_ious, 'val': val_ious})

    print(f"best model at epoch {model_saver.best_epoch} with miou {model_saver.best_score:.5f}")
def main():
    global args, best_prec1
    args = parser.parse_args()
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    for key in config:
        for k, v in config[key].items():
            setattr(args, k, v)

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    # print("=> creating model '{}'".format(args.model))
    # if 'se_resnext50_32x4d_v1_sn' in args.model:
    #     model = models.__dict__[args.model](using_moving_average=args.using_moving_average, last_gamma=args.last_gamma)
    # else:
    #     model = models.__dict__[args.model](using_moving_average=args.using_moving_average)
    # model = resnet18()
    model = ResNet18()
    # model = SENet18()

    if not args.distributed:
        model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # auto resume from a checkpoint
    model_dir = args.model_dir
    start_epoch = 0
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if args.evaluate:
        utils.load_state_ckpt(args.checkpoint_path, model)
    else:
        best_prec1, start_epoch = utils.load_state(model_dir, model, optimizer=optimizer)
    writer = SummaryWriter(model_dir)

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, 0, writer)
        return

    train_dataset_multi_scale = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            ColorAugmentation(),
            normalize,
        ]))
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            ColorAugmentation(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader_multi_scale = torch.utils.data.DataLoader(
        train_dataset_multi_scale, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    if not args.using_moving_average:
        train_dataset_snhelper = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ]))
        train_loader_snhelper = torch.utils.data.DataLoader(
            train_dataset_snhelper,
            batch_size=args.batch_size * torch.cuda.device_count(),
            shuffle=(train_sampler is None),
            # train_dataset_snhelper, batch_size=1, shuffle=(train_sampler is None),
            num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    niters = len(train_loader)
    lr_scheduler = LRScheduler(optimizer, niters, args)

    for epoch in range(start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        if epoch < args.epochs - 5:
            train(train_loader_multi_scale, model, criterion, optimizer, lr_scheduler, epoch, writer)
        else:
            train(train_loader, model, criterion, optimizer, lr_scheduler, epoch, writer)

        if not args.using_moving_average:
            sn_helper(train_loader_snhelper, model)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, epoch, writer)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        utils.save_checkpoint(model_dir, {
            'epoch': epoch + 1,
            'model': args.model,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)
def open_sesemi():
    args = parse_args()
    network = args.network
    nb_extra = args.nb_extra
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

    arg2var = {
        'convnet': convnet,
        'wrn': wrn,
    }

    # Load Tiny Images.
    # Code adapted from https://github.com/smlaine2/tempens
    with open('./datasets/tiny-images/tiny_index.pkl', 'rb') as f:
        tinyimg_index = pickle.load(f, encoding='latin1')

    if nb_extra == 237203:
        print("Using all classes common with CIFAR-100.")
        with open('./datasets/cifar-100/meta', 'rb') as f:
            cifar_labels = pickle.load(f, encoding='latin1')['fine_label_names']
        cifar_to_tinyimg = {'maple_tree': 'maple', 'aquarium_fish': 'fish'}
        cifar_labels = [l if l not in cifar_to_tinyimg else cifar_to_tinyimg[l]
                        for l in cifar_labels]
        load_indices = sum(
            [list(range(*tinyimg_index[label])) for label in cifar_labels], [])
    elif nb_extra == 500000:
        print("Using %d random images." % nb_extra)
        nb_tinyimages = max(e for s, e in tinyimg_index.values())
        load_indices = np.arange(nb_tinyimages)
        rng.shuffle(load_indices)
        load_indices = load_indices[:nb_extra]
        load_indices.sort()  # sorted for faster seeks.
    else:
        raise ValueError('`--extra` must be integer 237203 or 500000.')

    print("Loading %d auxiliary unlabeled Tiny Images." % len(load_indices))
    z_train = load_tinyimages(load_indices)

    # Load CIFAR-100.
    (x_train, y_train), (x_test, y_test) = cifar100.load_data()

    x_test = global_contrast_normalize(x_test)
    x_train = global_contrast_normalize(x_train)
    z_train = global_contrast_normalize(z_train)

    zca_whiten = zca_whitener(np.concatenate([x_train, z_train], axis=0))
    x_test = zca_whiten(x_test)
    x_train = zca_whiten(x_train)
    z_train = zca_whiten(z_train)

    x_test = x_test.reshape((len(x_test), 32, 32, 3))
    x_train = x_train.reshape((len(x_train), 32, 32, 3))
    z_train = z_train.reshape((len(z_train), 32, 32, 3))
    y_train = to_categorical(y_train)

    # Shared training parameters.
    zca = True
    hflip = True
    epochs = 50
    base_lr = 0.05
    batch_size = 8
    nb_classes = 100
    lr_decay_power = 0.5
    super_dropout = 0.2
    in_network_dropout = 0.0
    input_shape = (32, 32, 3)
    max_iter = (len(x_train) // batch_size) * epochs

    # Compile the SESEMI model.
    sesemi_model, inference_model = compile_sesemi(
        arg2var[network], input_shape, nb_classes,
        base_lr, in_network_dropout, super_dropout)
    print(sesemi_model.summary())

    lr_poly_decay = LRScheduler(base_lr, max_iter, lr_decay_power)
    evaluate = DenseEvaluator(inference_model, (x_test, y_test), hflip, oversample=True)

    super_datagen = ImageDataGenerator(
        width_shift_range=[-2, -1, 0, 1, 2],
        height_shift_range=[-2, -1, 0, 1, 2],
        horizontal_flip=hflip,
        preprocessing_function=gaussian_noise,
        fill_mode='reflect',
    )
    self_datagen = ImageDataGenerator(
        width_shift_range=[-2, -1, 0, 1, 2],
        height_shift_range=[-2, -1, 0, 1, 2],
        horizontal_flip=False,
        preprocessing_function=gaussian_noise,
        fill_mode='reflect',
    )

    super_data = super_datagen.flow(x_train, y_train, shuffle=True, batch_size=1, seed=None)
    self_data = self_datagen.flow(x_train, shuffle=True, batch_size=1, seed=None)
    extra_data = self_datagen.flow(z_train, shuffle=True, batch_size=1, seed=None)
    train_data_loader = datagen_tinyimages(super_data, self_data, extra_data, batch_size)

    # Fit the SESEMI model on mini-batches with data augmentation.
    print('Run configuration:')
    print('network=%s,' % network, 'ZCA=%s,' % zca, 'nb_epochs=%d,' % epochs,
          'horizontal_flip=%s,' % hflip, 'nb_extra=%d,' % len(z_train),
          'batch_size=%d,' % batch_size, 'gpu_id=%s' % args.gpu_id)

    sesemi_model.fit_generator(
        train_data_loader,
        epochs=epochs,
        verbose=1,
        steps_per_epoch=len(x_train) // batch_size,
        callbacks=[lr_poly_decay, evaluate],
    )
def main():
    # Device configuration
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Load config
    print(args.config, flush=True)
    with open(args.config, 'r') as f:
        opt = yaml.safe_load(f)
    for k, v in opt.items():
        print('{} : {}'.format(k, v), flush=True)
        setattr(args, k, v)

    if not os.path.exists(args.model_save_path):
        os.mkdir(args.model_save_path)

    # Train
    if not args.evaluate and not args.val:
        # Dataset
        train_dataset = HFDataset(args.data_root, args.train_csv_path, args.data_lens, train=True)
        val_dataset = HFDataset(args.data_root, args.validation_csv_path, args.data_lens, train=False)

        # Data loader
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.num_workers)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                 batch_size=32,
                                                 shuffle=False,
                                                 num_workers=1)

        # Model
        model = getattr(models, args.model)(args.num_classes).to(device)
        if args.load_model_path:
            model.load(args.load_model_path)

        # Loss and optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=args.base_lr,
                                     weight_decay=args.weight_decay)
        if args.loss == 'weighted_binary_crossentropy':
            weight = train_dataset.weight
            weight = torch.Tensor(weight).unsqueeze(0).to(device)
            criterion = getattr(loss, args.loss)(weight)
        else:
            criterion = getattr(loss, args.loss)

        niters = len(train_loader)
        lr_scheduler = LRScheduler(optimizer, niters, args)

        # Run
        train(train_loader, val_loader, model, optimizer, criterion, lr_scheduler, device, args)

    # Val
    elif args.val:
        val_dataset = HFDataset(args.data_root, args.validation_csv_path, args.data_lens, train=False)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=1)
        model = getattr(models, args.model)(args.num_classes).to(device)
        assert args.load_model_path
        model.load(args.load_model_path)
        acc_val = val(val_loader, model, device, flip=args.flip)
        print('Validation Accuracy: {} %'.format(acc_val), flush=True)

    # Test
    elif args.evaluate:
        model = getattr(models, args.model)(args.num_classes).to(device)
        assert args.load_model_path
        model.load(args.load_model_path)
        test(model, args, device)

    return
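Several snippets here build the scheduler as LRScheduler(optimizer, niters, args) and advance it once per iteration inside train(). That implementation is not included in this collection; the following is a minimal, hypothetical sketch of a per-iteration warmup-plus-cosine schedule with a compatible shape. The class name WarmupCosineLR, the warmup_epochs argument, and the update() method are assumptions, not the repository's API.

import math

class WarmupCosineLR:
    """Per-iteration schedule: linear warmup followed by cosine decay.
    Hypothetical sketch; assumes args carries base_lr and epochs."""

    def __init__(self, optimizer, niters_per_epoch, args, warmup_epochs=0):
        self.optimizer = optimizer
        self.base_lr = args.base_lr
        self.warmup_iters = warmup_epochs * niters_per_epoch
        self.total_iters = args.epochs * niters_per_epoch

    def update(self, iteration):
        # Linear warmup for the first warmup_iters steps, cosine decay afterwards.
        if iteration < self.warmup_iters:
            lr = self.base_lr * (iteration + 1) / max(1, self.warmup_iters)
        else:
            progress = (iteration - self.warmup_iters) / max(1, self.total_iters - self.warmup_iters)
            lr = 0.5 * self.base_lr * (1.0 + math.cos(math.pi * progress))
        # Write the new rate into every parameter group.
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        return lr

A train() loop under these assumptions would call lr_scheduler.update(epoch * niters + i) at the top of each iteration before optimizer.step().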
        pred = np.argmax(output.data, axis=1)
        acc = np.sum(pred == labels.data)
        total_acc += acc
        total_num += labels.shape[0]
    acc = total_acc / total_num
    return acc


if __name__ == '__main__':
    freeze_random_seed()

    net = PointNet(n_classes=40)
    optimizer = nn.Adam(net.parameters(), lr=1e-3)
    lr_scheduler = LRScheduler(optimizer)

    batch_size = 32
    train_dataloader = ModelNet40(n_points=4096, batch_size=batch_size, train=True, shuffle=True)
    val_dataloader = ModelNet40(n_points=4096, batch_size=batch_size, train=False, shuffle=False)

    step = 0
    best_acc = 0
    for epoch in range(1000):
        lr_scheduler.step(len(train_dataloader) * batch_size)
        train(net, optimizer, epoch, train_dataloader)
        acc = evaluate(net, epoch, val_dataloader)
        best_acc = max(best_acc, acc)
        print(f'val acc={acc:.4f}, best={best_acc:.4f}')
def open_sesemi():
    args = parse_args()
    network = args.network
    dataset = args.dataset
    nb_labels = args.nb_labels
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

    arg2var = {
        'convnet': convnet,
        'wrn': wrn,
        'nin': nin,
        'svhn': svhn,
        'cifar10': cifar10,
        'cifar100': cifar100,
    }

    # Experiment- and dataset-dependent parameters.
    zca = True
    hflip = True
    epochs = 50
    if dataset in {'svhn', 'cifar10'}:
        if dataset == 'svhn':
            zca = False
            hflip = False
            epochs = 30
        nb_classes = 10
    elif dataset == 'cifar100':
        nb_classes = 100
    else:
        raise ValueError('`dataset` must be "svhn", "cifar10", "cifar100".')

    super_dropout = 0.2
    in_network_dropout = 0.0
    if network == 'convnet' and dataset == 'svhn':
        super_dropout = 0.5
        in_network_dropout = 0.5
    elif network == 'wrn' and dataset == 'svhn':
        super_dropout = 0.5

    # Prepare the dataset.
    (x_train, y_train), (x_test, y_test) = arg2var[dataset].load_data()

    x_test = global_contrast_normalize(x_test)
    x_train = global_contrast_normalize(x_train)

    if zca:
        zca_whiten = zca_whitener(x_train)
        x_train = zca_whiten(x_train)
        x_test = zca_whiten(x_test)

    x_test = x_test.reshape((len(x_test), 32, 32, 3))
    x_train = x_train.reshape((len(x_train), 32, 32, 3))

    if nb_labels in {50000, 73257}:
        x_labeled = x_train
        y_labeled = y_train
    else:
        labels_per_class = nb_labels // nb_classes
        sample_inds = stratified_sample(y_train, labels_per_class)
        x_labeled = x_train[sample_inds]
        y_labeled = y_train[sample_inds]
    y_labeled = to_categorical(y_labeled)

    # Shared training parameters.
    base_lr = 0.05
    batch_size = 16
    lr_decay_power = 0.5
    input_shape = (32, 32, 3)
    max_iter = (len(x_train) // batch_size) * epochs

    # Compile the SESEMI model.
    sesemi_model, inference_model = compile_sesemi(
        arg2var[network], input_shape, nb_classes,
        base_lr, in_network_dropout, super_dropout)
    print(sesemi_model.summary())

    lr_poly_decay = LRScheduler(base_lr, max_iter, lr_decay_power)
    evaluate = DenseEvaluator(inference_model, (x_test, y_test), hflip, oversample=True)

    super_datagen = ImageDataGenerator(
        width_shift_range=[-2, -1, 0, 1, 2],
        height_shift_range=[-2, -1, 0, 1, 2],
        horizontal_flip=hflip,
        preprocessing_function=gaussian_noise,
        fill_mode='reflect',
    )
    self_datagen = ImageDataGenerator(
        width_shift_range=[-2, -1, 0, 1, 2],
        height_shift_range=[-2, -1, 0, 1, 2],
        horizontal_flip=False,
        preprocessing_function=gaussian_noise,
        fill_mode='reflect',
    )

    super_data = super_datagen.flow(x_labeled, y_labeled, shuffle=True, batch_size=1, seed=None)
    self_data = self_datagen.flow(x_train, shuffle=True, batch_size=1, seed=None)
    train_data_loader = datagen(super_data, self_data, batch_size)

    # Fit the SESEMI model on mini-batches with data augmentation.
    print('Run configuration:')
    print('network=%s,' % network, 'dataset=%s,' % dataset,
          'horizontal_flip=%s,' % hflip, 'ZCA=%s,' % zca,
          'nb_epochs=%d,' % epochs, 'batch_size=%d,' % batch_size,
          'nb_labels=%d,' % len(y_labeled), 'gpu_id=%s' % args.gpu_id)

    sesemi_model.fit_generator(
        train_data_loader,
        epochs=epochs,
        verbose=1,
        steps_per_epoch=len(x_train) // batch_size,
        callbacks=[lr_poly_decay, evaluate],
    )
def main():
    args = parse_args()
    network = args.network
    dataset = args.dataset
    nb_labels = args.nb_labels
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)

    arg2var = {
        'convnet': convnet,
        'wrn': wrn,
        'resnet50v2': resnet50v2,
        'svhn': svhn,
        'cifar10': cifar10,
        'cifar100': cifar100,
    }

    # Dataset-specific parameters
    hflip = True
    zca = True
    epochs = 10
    if dataset in ['svhn', 'cifar10']:
        if dataset == 'svhn':
            hflip = False
            zca = False
            epochs = 30
        nb_classes = 10
    elif dataset == 'cifar100':
        nb_classes = 100
    else:
        raise ValueError('`dataset` must be "svhn", "cifar10", "cifar100".')

    (x_train, y_train), (x_test, y_test) = arg2var[dataset].load_data()

    x_train = global_contrast_normalize(x_train)
    x_test = global_contrast_normalize(x_test)

    if zca:
        zca_whiten = zca_whitener(x_train)
        x_train = zca_whiten(x_train)
        x_test = zca_whiten(x_test)

    x_train = x_train.reshape((len(x_train), 32, 32, 3))
    x_test = x_test.reshape((len(x_test), 32, 32, 3))

    labels_per_class = nb_labels // nb_classes
    if nb_labels == 73257:
        labels_per_class = 1000000
    sample_inds = stratified_sample(y_train, labels_per_class)
    x_labeled = x_train[sample_inds]
    y_labeled = y_train[sample_inds]
    y_labeled = to_categorical(y_labeled)

    # Training parameters
    input_shape = (32, 32, 3)
    batch_size = 32
    base_lr = 0.05
    lr_decay_power = 0.5
    dropout_rate = 0.2
    max_iter = (len(x_train) // batch_size) * epochs

    sesemi_model, inference_model = open_sesemi(
        arg2var[network], input_shape, nb_classes, base_lr, dropout_rate)
    print(sesemi_model.summary())

    super_datagen = ImageDataGenerator(
        width_shift_range=3,
        height_shift_range=3,
        horizontal_flip=hflip,
        preprocessing_function=gaussian_noise,
        fill_mode='reflect',
    )
    self_datagen = ImageDataGenerator(
        width_shift_range=3,
        height_shift_range=3,
        horizontal_flip=False,
        preprocessing_function=gaussian_noise,
        fill_mode='reflect',
    )

    super_data = super_datagen.flow(
        x_labeled, y_labeled, shuffle=True, batch_size=1, seed=None)
    self_data = self_datagen.flow(
        x_train, shuffle=True, batch_size=1, seed=None)
    train_data_loader = datagen(super_data, self_data, batch_size)

    lr_poly_decay = LRScheduler(base_lr, max_iter, lr_decay_power)
    evaluate = DenseEvaluator(inference_model, (x_test, y_test), hflip)

    # Fit the SESEMI model on mini-batches with data augmentation
    print('Run configuration:')
    print('network=%s,' % network, 'dataset=%s,' % dataset,
          'horizontal_flip=%s,' % hflip, 'ZCA=%s,' % zca,
          'nb_epochs=%d,' % epochs, 'batch_size=%d,' % batch_size,
          'nb_labels=%d,' % len(x_labeled), 'gpu_id=%d' % args.gpu_id)

    sesemi_model.fit_generator(
        train_data_loader,
        epochs=epochs,
        verbose=1,
        steps_per_epoch=len(x_train) // batch_size,
        callbacks=[lr_poly_decay, evaluate],
    )
    return
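The Keras variants above pass LRScheduler(base_lr, max_iter, lr_decay_power) to fit_generator as a callback, but the callback itself is not shown in this collection. A minimal sketch of a polynomial-decay callback with that call shape might look like the following; the class name PolyDecayScheduler and the per-batch update rule are assumptions rather than the original implementation.

from keras import backend as K
from keras.callbacks import Callback

class PolyDecayScheduler(Callback):
    """Sets lr = base_lr * (1 - iteration / max_iter) ** power before each batch.
    Hypothetical sketch of a poly-decay schedule, not the source repository's class."""

    def __init__(self, base_lr, max_iter, power):
        super(PolyDecayScheduler, self).__init__()
        self.base_lr = base_lr
        self.max_iter = max_iter
        self.power = power
        self.iteration = 0

    def on_batch_begin(self, batch, logs=None):
        # Polynomial decay over the total number of training iterations.
        lr = self.base_lr * (1.0 - self.iteration / float(self.max_iter)) ** self.power
        K.set_value(self.model.optimizer.lr, max(lr, 0.0))
        self.iteration += 1

Under this sketch the usage in the snippets stays the same: instantiate it with (base_lr, max_iter, lr_decay_power) and pass it in the callbacks list alongside the evaluator.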
def main():
    global args, best_acc

    with open(args.config) as f:
        try:
            config = yaml.load(f, Loader=yaml.FullLoader)
        except Exception:
            print('###################################################')
            print('Please update pyyaml >= 5.1')
            print('###################################################')
            config = yaml.load(f)
    for k, v in config['common'].items():
        setattr(args, k, v)

    if args.val_freq is None:
        args.val_freq = args.save_freq

    if not os.path.exists(args.save_path):
        print('Create {}.'.format(args.save_path))
        os.makedirs(args.save_path)

    print('###################################################')
    print('Parameters')
    print(args)
    print('###################################################')

    # Data loading code
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    if args.dataset == 'cifar10':
        dataloader = CIFAR10
    else:
        dataloader = CIFAR100

    train_dataset = dataloader(root=args.train_root, train=True, transform=transform_train)
    train_loader = data.DataLoader(train_dataset, batch_size=args.train_batch,
                                   shuffle=True, num_workers=args.workers)
    testset = dataloader(root=args.val_root, train=False, transform=transform_test)
    val_loader = data.DataLoader(testset, batch_size=args.test_batch,
                                 shuffle=False, num_workers=args.workers)

    if '18' in args.arch:
        model = ResNet18()
        print('Using ResNet18 for training')
    elif '50' in args.arch:
        model = ResNet50()
        print('Using ResNet50 for training')
    else:
        model = resnet20()
        print('Using resnet20 for training')
    model.cuda()
    cudnn.benchmark = True
    print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = AdaXW(model.parameters(), lr=args.base_lr,
                      weight_decay=args.weight_decay)

    # Resume
    ckpt_path = os.path.join(args.save_path, 'ckpt')
    if not os.path.exists(ckpt_path):
        print('Create {}.'.format(ckpt_path))
        os.makedirs(ckpt_path)
    weight_file = get_last_weights(ckpt_path + '/ckpt_step*.pth')
    last_step = 0
    if len(weight_file) == 0:
        print('No ckpt for resuming.')
    else:
        # Load checkpoint.
        print('==> Resuming from {}.'.format(weight_file))
        assert os.path.isfile(weight_file), 'Error: no checkpoint directory found!'
        # checkpoint = torch.load(weight_file)
        checkpoint = load_state(weight_file, model)
        model.load_state_dict(checkpoint['state_dict'])
        if not args.evaluate:
            last_step = checkpoint['step']
            optimizer.load_state_dict(checkpoint['optimizer'])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_top1, test_top5 = validate(val_loader, model, criterion)
        print(' Test Loss: %.8f, Test Acc.top1: %.2f, Test Acc.top5: %.2f'
              % (test_loss, test_top1, test_top5))
        return

    # Tensorboard
    tb_logger = SummaryWriter(args.save_path + '/tf_log')
    if not os.path.exists(args.save_path + '/tf_log'):
        print('Create {}.'.format(args.save_path + '/tf_log'))
        os.makedirs(args.save_path + '/tf_log')

    T_max = int(len(train_dataset) * args.epochs / args.train_batch)
    try:
        T_max = max(T_max, args.max_iter)
    except AttributeError:
        pass
    if args.lr_multi:
        lr_multi = float(8) * args.train_batch / (8. * 32)
    else:
        lr_multi = 1.0
    print('Totally train {} steps.'.format(T_max))

    lr_scheduler = LRScheduler(max_steps=T_max, optimizer=optimizer,
                               lr_mult=lr_multi, args=args)

    # Train and val
    _ = train(train_loader, val_loader, model, criterion, optimizer,
              last_step, T_max, lr_scheduler, tb_logger)
def main(**kwargs):
    # 1. Parse command line arguments.
    opt._parse(kwargs)

    # 2. Visdom
    # vis = Visualizer(env=opt.env)

    # 3. GPU settings
    # n_gpu = utils.set_gpu('0,1')

    # 4. Configure model
    logging.info('==> Training model for clothing type: {}'.format(opt.category))
    cudnn.benchmark = True
    net = getattr(models, opt.model)(opt)

    # 5. Initialize logger
    cur_time = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime())
    initialize_logger(f'{opt.category}_{opt.model}_{cur_time}')

    # 6. Initialize checkpoints directory
    lr = opt.lr
    start_epoch = 1
    best_val_loss = float('inf')

    if opt.load_checkpoint_path:
        logging.info('==> Resuming from checkpoint...')
        checkpoint = torch.load(opt.load_checkpoint_path)
        start_epoch = checkpoint['epoch'] + 1
        lr = checkpoint['lr']
        best_val_loss = checkpoint['best_val_loss']
        net.load_state_dict(checkpoint['state_dict'])

    # 7. Data setup
    train_dataset = FashionAIKeypoints(opt, phase='train')
    logging.info('Train sample number: {}'.format(len(train_dataset)))
    train_loader = DataLoader(train_dataset,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              num_workers=opt.num_workers,
                              collate_fn=train_dataset.collate_fn,
                              pin_memory=True)

    val_dataset = FashionAIKeypoints(opt, phase='val')
    logging.info('Val sample number: {}'.format(len(val_dataset)))
    val_loader = DataLoader(val_dataset,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            collate_fn=val_dataset.collate_fn,
                            pin_memory=True)

    net = net.cuda()
    # net = DataParallel(net)
    loss = CPNLoss()
    loss = loss.cuda()

    # 8. Loss, optimizer and LR scheduler
    optimizer = torch.optim.SGD(net.parameters(), lr,
                                momentum=0.9,
                                weight_decay=1e-4)
    lrs = LRScheduler(lr, patience=3, factor=0.1, min_lr=0.01 * lr,
                      best_loss=best_val_loss)

    # 9. Training loop
    for epoch in range(start_epoch, opt.max_epochs + 1):
        # Training
        logging.info("Start training loop...")
        train_metrics, train_time = train(train_loader, net, loss, optimizer, lr)

        # Validating
        logging.info("Start validating loop...")
        with torch.no_grad():
            val_metrics, val_time = validate(val_loader, net, loss)

        log_model(epoch, lr, train_metrics, train_time, val_metrics, val_time)

        val_loss = np.mean(val_metrics[:, 0])
        lr = lrs.update_by_rule(val_loss)

        # Save checkpoints
        if val_loss < best_val_loss or epoch % 10 == 0 or lr is None:
            if val_loss < best_val_loss:
                best_val_loss = val_loss
            # Note: assumes net is wrapped in DataParallel (commented out above).
            state_dict = net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()
            torch.save({
                'epoch': epoch,
                'save_dir': opt.checkpoint_path,
                'state_dict': state_dict,
                'lr': lr,
                'best_val_loss': best_val_loss
            }, opt.checkpoint_path / 'kpt_{}_{:03d}.ckpt'.format(opt.category, epoch))

        if lr is None:
            logging.info('Training is early-stopped')
            break
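In the snippet above, LRScheduler acts as a reduce-on-plateau rule: lrs.update_by_rule(val_loss) returns the (possibly reduced) learning rate, and the loop treats a None return as the early-stopping signal. The class itself is not shown here; the following is a hypothetical reconstruction consistent with that usage, with the class name PlateauLR and its internal attributes being assumptions.

class PlateauLR:
    """Reduce-on-plateau rule. update_by_rule(val_loss) returns the current
    learning rate, or None once a further reduction would fall below min_lr.
    Hypothetical sketch matching the call site above, not the original class."""

    def __init__(self, lr, patience=3, factor=0.1, min_lr=1e-4, best_loss=float('inf')):
        self.lr = lr
        self.patience = patience
        self.factor = factor
        self.min_lr = min_lr
        self.best_loss = best_loss
        self.num_bad_epochs = 0

    def update_by_rule(self, val_loss):
        # Track the best validation loss and count epochs without improvement.
        if val_loss < self.best_loss:
            self.best_loss = val_loss
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1
        # After `patience` bad epochs, reduce the rate; signal early stop below min_lr.
        if self.num_bad_epochs > self.patience:
            new_lr = self.lr * self.factor
            if new_lr < self.min_lr:
                return None
            self.lr = new_lr
            self.num_bad_epochs = 0
        return self.lr

Note that, as used above, the scheduler only returns the rate; applying it to the optimizer's parameter groups is left to the train() function that receives lr each epoch.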
def main():
    global args, best_prec1
    args = parser.parse_args()
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    for key in config:
        for k, v in config[key].items():
            setattr(args, k, v)

    proc = subprocess.Popen(
        'mmdownload -f ' + args.f + ' -n ' + args.n + ' -o ./models/' + args.f + '_' + args.n + '/',
        shell=True, executable='/bin/bash')
    proc.communicate()
    proc = subprocess.Popen(
        'mmconvert -sf ' + args.srcFramework + ' --inputShape 224,224,3'
        + ' -in models/' + args.f + '_' + args.n + '/imagenet_' + args.n + '.ckpt.meta'
        + ' -iw models/' + args.f + '_' + args.n + '/imagenet_' + args.n + '.ckpt'
        + ' --dstNodeName MMdnn_Output -df ' + args.dstFramework
        + ' -om models/' + args.n + '.pth',
        shell=True, executable='/bin/bash')
    proc.communicate()

    MainModel = imp.load_source('MainModel', "models/" + args.model + ".py")
    model = torch.load("models/" + args.n + ".pth")

    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        model = model.cpu()
        criterion = nn.CrossEntropyLoss().cpu()

    # define loss function (criterion) and optimizer
    optimizer = torch.optim.SGD(model.parameters(), args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # auto resume from a checkpoint
    model_dir = args.model_dir
    start_epoch = 0
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if args.evaluate:
        pass
    else:
        best_prec1, start_epoch = utils.load_state(model_dir, model, optimizer=optimizer)
    writer = SummaryWriter(model_dir)

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, 0, writer)
        return

    train_dataset_multi_scale = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            # ColorAugmentation(),
            normalize,
        ]))
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            # ColorAugmentation(),
            normalize,
        ]))

    train_sampler = None
    train_loader_multi_scale = torch.utils.data.DataLoader(
        train_dataset_multi_scale, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    niters = len(train_loader)
    lr_scheduler = LRScheduler(optimizer, niters, args)

    for epoch in range(start_epoch, args.epochs):
        # train for one epoch
        if epoch < args.epochs - 5:
            train(train_loader_multi_scale, model, criterion, optimizer, lr_scheduler, epoch, writer)
        else:
            train(train_loader, model, criterion, optimizer, lr_scheduler, epoch, writer)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, epoch, writer)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        utils.save_checkpoint(model_dir, {
            'epoch': epoch + 1,
            'model': args.model,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)