def train():
    global GLOBAL_STEP, reduction_arc, cell_arc
    # Dataset
    dataset = cifarDataset(batchSize=args.batch_size,
                           dataPath=args.data_path,
                           numOfWorkers=args.data_nums_workers,
                           noise_rate=args.nr,
                           is_cifar100=args.train_cifar100,
                           filename=args.fn)
    dataLoader = dataset.getDataLoader()

    if args.train_cifar100:
        num_classes = 100
        fixed_cnn = ResNet34(num_classes=num_classes)
    else:
        num_classes = 10
        fixed_cnn = SCEModel()

    if args.loss == 'SCE':
        if args.train_cifar100:
            criterion = SCELoss(alpha=6.0, beta=0.1, num_classes=num_classes)
        else:
            criterion = SCELoss(alpha=0.1, beta=1.0, num_classes=num_classes)
    elif args.loss == 'CE':
        criterion = torch.nn.CrossEntropyLoss()
    else:
        raise ValueError("Unknown loss: %s" % args.loss)

    print(criterion.__class__.__name__)
    print("Number of Trainable Parameters %.4f" % count_parameters_in_MB(fixed_cnn))

    fixed_cnn = torch.nn.DataParallel(fixed_cnn)
    fixed_cnn.to(device)

    fixed_cnn_optimizer = torch.optim.SGD(params=fixed_cnn.parameters(),
                                          lr=args.lr,
                                          momentum=0.9,
                                          nesterov=True,
                                          weight_decay=args.l2_reg)

    if args.train_cifar100:
        milestone = [80, 120]
    else:
        milestone = [40, 80]
    fixed_cnn_scheduler = MultiStepLR(fixed_cnn_optimizer, milestone, gamma=0.1)

    utilHelper = TrainUtil(checkpoint_path=args.checkpoint_path, version=args.version)
    starting_epoch = 0
    train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion,
                fixed_cnn_optimizer, fixed_cnn_scheduler, utilHelper)
def train():
    global GLOBAL_STEP, reduction_arc, cell_arc
    # Dataset
    dataset = DatasetGenerator(batchSize=args.batch_size,
                               dataPath=args.data_path,
                               numOfWorkers=args.data_nums_workers,
                               noise_rate=args.nr,
                               asym=args.asym,
                               seed=args.seed,
                               dataset_type=args.dataset_type)
    dataLoader = dataset.getDataLoader()

    if args.dataset_type == 'cifar100':
        num_classes = 100
        args.epoch = 150
        fixed_cnn = ResNet34(num_classes=num_classes)
    elif args.dataset_type == 'cifar10':
        num_classes = 10
        args.epoch = 120
        fixed_cnn = SCEModel()
    else:
        raise NotImplementedError('Unimplemented dataset type: %s' % args.dataset_type)

    if args.loss == 'SCE':
        criterion = SCELoss(alpha=args.alpha, beta=args.beta, num_classes=num_classes)
    elif args.loss == 'CE':
        criterion = torch.nn.CrossEntropyLoss()
    else:
        raise ValueError("Unknown loss: %s" % args.loss)

    logger.info(criterion.__class__.__name__)
    logger.info("Number of Trainable Parameters %.4f" % count_parameters_in_MB(fixed_cnn))

    fixed_cnn = torch.nn.DataParallel(fixed_cnn)
    fixed_cnn.to(device)

    fixed_cnn_optimizer = torch.optim.SGD(params=fixed_cnn.parameters(),
                                          lr=args.lr,
                                          momentum=0.9,
                                          nesterov=True,
                                          weight_decay=args.l2_reg)
    fixed_cnn_scheduler = torch.optim.lr_scheduler.StepLR(fixed_cnn_optimizer, 1, gamma=0.97)

    utilHelper = TrainUtil(checkpoint_path=args.checkpoint_path, version=args.version)
    starting_epoch = 0
    train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion,
                fixed_cnn_optimizer, fixed_cnn_scheduler, utilHelper)
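# For reference, a minimal sketch of what the SCELoss used above is assumed to
# compute (Symmetric Cross Entropy, Wang et al. 2019): an alpha-weighted
# standard CE term plus a beta-weighted reverse CE term in which log(0) on the
# one-hot labels is clamped to a finite constant. The class name and
# constructor signature follow the usage above; the body is an illustrative
# assumption, not the repository's implementation.
import torch
import torch.nn.functional as F

class SCELossSketch(torch.nn.Module):
    def __init__(self, alpha, beta, num_classes, label_clamp=1e-4):
        super().__init__()
        self.alpha, self.beta = alpha, beta
        self.num_classes = num_classes
        self.label_clamp = label_clamp  # stands in for log(0) in the reverse term

    def forward(self, logits, target):
        # Standard cross entropy H(q, p).
        ce = F.cross_entropy(logits, target)
        # Reverse cross entropy H(p, q): clamp the one-hot labels so log stays finite.
        pred = F.softmax(logits, dim=1).clamp(min=1e-7, max=1.0)
        one_hot = F.one_hot(target, self.num_classes).float()
        one_hot = one_hot.clamp(min=self.label_clamp, max=1.0)
        rce = (-pred * one_hot.log()).sum(dim=1).mean()
        return self.alpha * ce + self.beta * rce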
def build_BSD_500(model_state_dict, optimizer_state_dict, **kwargs):
    # epoch = kwargs.pop('epoch')
    # i_iter = kwargs.pop('i_iter')
    root = "./data/HED-BSDS"
    train_data = dataloader_BSD_Pascal.BSD_loader(root=root, split='train',
                                                  normalisation=False, keep_size=False)
    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              pin_memory=True,
                                              num_workers=16,
                                              shuffle=True)

    # model = DeepLab(output_stride=16, class_num=2, pretrained=False, freeze_bn=False)
    # model = NASUNetBSD(args, args.classes, depth=args.layers, c=args.channels,
    #                    keep_prob=args.keep_prob, nodes=args.nodes,
    #                    use_aux_head=args.use_aux_head, arch=args.arch,
    #                    double_down_channel=args.double_down_channel)
    model = NAOMSCBC(args, args.classes, args.arch, channels=42, pretrained=True, res='101')

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    if model_state_dict is not None:
        model.load_state_dict(model_state_dict)

    if torch.cuda.device_count() > 1:
        logging.info("Use %d %s", torch.cuda.device_count(), "GPUs !")
        model = nn.DataParallel(model)
    model = model.cuda()

    # optimizer = torch.optim.SGD(
    #     model.parameters(),
    #     lr=args.lr_max,
    #     momentum=0.9,
    #     weight_decay=args.l2_reg,
    # )
    # if optimizer_state_dict is not None:
    #     optimizer.load_state_dict(optimizer_state_dict)
    # return train_queue, model, optimizer
    return train_queue, model
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers, args.auxiliary, genotype)
    model = model.cuda()
    utils.load(model, args.model_path)

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    _, test_transform = utils._data_transforms_cifar10(args)
    test_data = dset.CIFAR10(root=args.data, train=False, download=True, transform=test_transform)
    test_queue = torch.utils.data.DataLoader(test_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=2)

    model.drop_path_prob = args.drop_path_prob
    test_acc, test_obj = infer(test_queue, model, criterion)
    logging.info('test_acc %f', test_acc)
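# A minimal sketch of the infer() evaluation loop assumed above: switch the
# model to eval mode, then accumulate top-1 accuracy and the average loss over
# the test queue. This is a self-contained stand-in written from the call site,
# not the repository's exact helper (DARTS-style eval networks may return a
# (logits, logits_aux) tuple, which is handled below).
import torch

def infer_sketch(test_queue, model, criterion):
    model.eval()
    total_loss, total_correct, total_seen = 0.0, 0, 0
    with torch.no_grad():
        for x, y in test_queue:
            x, y = x.cuda(), y.cuda()
            logits = model(x)
            if isinstance(logits, tuple):  # drop the auxiliary head output
                logits = logits[0]
            loss = criterion(logits, y)
            total_loss += loss.item() * y.size(0)
            total_correct += (logits.argmax(dim=1) == y).sum().item()
            total_seen += y.size(0)
    return 100.0 * total_correct / total_seen, total_loss / total_seen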
def main():
    args = cfg.parse_args()
    torch.cuda.manual_seed(args.random_seed)

    # set visible GPU ids
    if len(args.gpu_ids) > 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_ids

    # set TensorFlow environment for evaluation (calculate IS and FID)
    _init_inception()
    inception_path = check_or_download_inception('./tmp/imagenet/')
    create_inception_graph(inception_path)

    # the first GPU in visible GPUs is dedicated for evaluation (running Inception model)
    str_ids = args.gpu_ids.split(',')
    args.gpu_ids = []
    for id in range(len(str_ids)):
        if id >= 0:
            args.gpu_ids.append(id)
    if len(args.gpu_ids) > 1:
        args.gpu_ids = args.gpu_ids[1:]

    # genotype G
    genotypes_root = os.path.join('exps', args.genotypes_exp, 'Genotypes')
    genotype_G = np.load(os.path.join(genotypes_root, 'latest_G.npy'))

    # import network from genotype
    basemodel_gen = eval('archs.' + args.arch + '.Generator')(args, genotype_G)
    gen_net = torch.nn.DataParallel(basemodel_gen, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])
    basemodel_dis = eval('archs.' + args.arch + '.Discriminator')(args)
    dis_net = torch.nn.DataParallel(basemodel_dis, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])

    # weight init
    def weights_init(m):
        classname = m.__class__.__name__
        if classname.find('Conv2d') != -1:
            if args.init_type == 'normal':
                nn.init.normal_(m.weight.data, 0.0, 0.02)
            elif args.init_type == 'orth':
                nn.init.orthogonal_(m.weight.data)
            elif args.init_type == 'xavier_uniform':
                nn.init.xavier_uniform_(m.weight.data, 1.)
            else:
                raise NotImplementedError('{} unknown initial type'.format(args.init_type))
        elif classname.find('BatchNorm2d') != -1:
            nn.init.normal_(m.weight.data, 1.0, 0.02)
            nn.init.constant_(m.bias.data, 0.0)

    gen_net.apply(weights_init)
    dis_net.apply(weights_init)

    # set up data_loader
    dataset = datasets.ImageDataset(args)
    train_loader = dataset.train

    # epoch number for dis_net
    args.max_epoch_D = args.max_epoch_G * args.n_critic
    if args.max_iter_G:
        args.max_epoch_D = np.ceil(args.max_iter_G * args.n_critic / len(train_loader))
    max_iter_D = args.max_epoch_D * len(train_loader)

    # set optimizer
    gen_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, gen_net.parameters()),
                                     args.g_lr, (args.beta1, args.beta2))
    dis_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, dis_net.parameters()),
                                     args.d_lr, (args.beta1, args.beta2))
    gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, max_iter_D)
    dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, max_iter_D)

    # fid stat
    if args.dataset.lower() == 'cifar10':
        fid_stat = 'fid_stat/fid_stats_cifar10_train.npz'
    elif args.dataset.lower() == 'stl10':
        fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz'
    else:
        raise NotImplementedError(f'no fid stat for {args.dataset.lower()}')
    assert os.path.exists(fid_stat)

    # initial
    gen_avg_param = copy_params(gen_net)
    start_epoch = 0
    best_fid = 1e4

    # set writer
    if args.checkpoint:
        # resuming
        print(f'=> resuming from {args.checkpoint}')
        assert os.path.exists(os.path.join('exps', args.checkpoint))
        checkpoint_file = os.path.join('exps', args.checkpoint, 'Model', 'checkpoint_best.pth')
        assert os.path.exists(checkpoint_file)
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint['epoch']
        best_fid = checkpoint['best_fid']
        gen_net.load_state_dict(checkpoint['gen_state_dict'])
        dis_net.load_state_dict(checkpoint['dis_state_dict'])
        gen_optimizer.load_state_dict(checkpoint['gen_optimizer'])
        dis_optimizer.load_state_dict(checkpoint['dis_optimizer'])
        avg_gen_net = deepcopy(gen_net)
        avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict'])
        gen_avg_param = copy_params(avg_gen_net)
        del avg_gen_net
        args.path_helper = checkpoint['path_helper']
        logger = create_logger(args.path_helper['log_path'])
        logger.info(f'=> loaded checkpoint {checkpoint_file} (epoch {start_epoch})')
    else:
        # create new log dir
        assert args.exp_name
        args.path_helper = set_log_dir('exps', args.exp_name)
        logger = create_logger(args.path_helper['log_path'])

    logger.info(args)
    writer_dict = {
        'writer': SummaryWriter(args.path_helper['log_path']),
        'train_global_steps': start_epoch * len(train_loader),
        'valid_global_steps': start_epoch // args.val_freq,
    }

    # model size
    logger.info('Param size of G = %fMB', count_parameters_in_MB(gen_net))
    logger.info('Param size of D = %fMB', count_parameters_in_MB(dis_net))
    print_FLOPs(basemodel_gen, (1, args.latent_dim), logger)
    print_FLOPs(basemodel_dis, (1, 3, args.img_size, args.img_size), logger)

    # for visualization
    if args.draw_arch:
        from utils.genotype import draw_graph_G
        draw_graph_G(genotype_G, save=True,
                     file_path=os.path.join(args.path_helper['graph_vis_path'], 'latest_G'))
    fixed_z = torch.cuda.FloatTensor(np.random.normal(0, 1, (100, args.latent_dim)))

    # train loop
    for epoch in tqdm(range(int(start_epoch), int(args.max_epoch_D)), desc='total progress'):
        lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None
        train(args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param,
              train_loader, epoch, writer_dict, lr_schedulers)

        if epoch % args.val_freq == 0 or epoch == int(args.max_epoch_D) - 1:
            # evaluate with the averaged generator weights, then restore
            backup_param = copy_params(gen_net)
            load_params(gen_net, gen_avg_param)
            inception_score, std, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict)
            logger.info(f'Inception score mean: {inception_score}, Inception score std: {std}, '
                        f'FID score: {fid_score} || @ epoch {epoch}.')
            load_params(gen_net, backup_param)
            if fid_score < best_fid:
                best_fid = fid_score
                is_best = True
            else:
                is_best = False
        else:
            is_best = False

        # save model
        avg_gen_net = deepcopy(gen_net)
        load_params(avg_gen_net, gen_avg_param)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': args.arch,
            'gen_state_dict': gen_net.state_dict(),
            'dis_state_dict': dis_net.state_dict(),
            'avg_gen_state_dict': avg_gen_net.state_dict(),
            'gen_optimizer': gen_optimizer.state_dict(),
            'dis_optimizer': dis_optimizer.state_dict(),
            'best_fid': best_fid,
            'path_helper': args.path_helper
        }, is_best, args.path_helper['ckpt_path'])
        del avg_gen_net
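# The training loop above keeps a running average of the generator weights
# (gen_avg_param) and swaps it in for evaluation via copy_params / load_params.
# A minimal sketch of how such helpers typically work; the function names match
# the usage above, but these bodies are assumptions, not the repository's code.
from copy import deepcopy

def copy_params_sketch(model):
    # Snapshot the current parameter tensors.
    return deepcopy([p.data for p in model.parameters()])

def load_params_sketch(model, new_params):
    # Overwrite the model's parameters in place with a stored snapshot.
    for p, new_p in zip(model.parameters(), new_params):
        p.data.copy_(new_p)

# Inside the training step, the average is usually updated after each generator
# step as an exponential moving average, e.g. with decay beta = 0.999:
#   for avg_p, p in zip(gen_avg_param, gen_net.parameters()):
#       avg_p.mul_(beta).add_(p.data, alpha=1 - beta)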
def main():
    args = cfg.parse_args()
    torch.cuda.manual_seed(args.random_seed)

    # set visible GPU ids
    if len(args.gpu_ids) > 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_ids

    # set TensorFlow environment for evaluation (calculate IS and FID)
    _init_inception()
    inception_path = check_or_download_inception('./tmp/imagenet/')
    create_inception_graph(inception_path)

    # the first GPU in visible GPUs is dedicated for evaluation (running Inception model)
    str_ids = args.gpu_ids.split(',')
    args.gpu_ids = []
    for id in range(len(str_ids)):
        if id >= 0:
            args.gpu_ids.append(id)
    if len(args.gpu_ids) > 1:
        args.gpu_ids = args.gpu_ids[1:]

    # genotype G
    genotypes_root = os.path.join('exps', args.genotypes_exp, 'Genotypes')
    genotype_G = np.load(os.path.join(genotypes_root, 'latest_G.npy'))

    # import network from genotype
    basemodel_gen = eval('archs.' + args.arch + '.Generator')(args, genotype_G)
    gen_net = torch.nn.DataParallel(basemodel_gen, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])
    basemodel_dis = eval('archs.' + args.arch + '.Discriminator')(args)
    dis_net = torch.nn.DataParallel(basemodel_dis, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])

    # fid stat
    if args.dataset.lower() == 'cifar10':
        fid_stat = 'fid_stat/fid_stats_cifar10_train.npz'
    elif args.dataset.lower() == 'stl10':
        fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz'
    else:
        raise NotImplementedError(f'no fid stat for {args.dataset.lower()}')
    assert os.path.exists(fid_stat)

    # load checkpoint
    print(f'=> resuming from {args.checkpoint}')
    assert os.path.exists(os.path.join('exps', args.checkpoint))
    checkpoint_file = os.path.join('exps', args.checkpoint, 'Model', 'checkpoint_best.pth')
    assert os.path.exists(checkpoint_file)
    checkpoint = torch.load(checkpoint_file)
    epoch = checkpoint['epoch'] - 1
    gen_net.load_state_dict(checkpoint['gen_state_dict'])
    dis_net.load_state_dict(checkpoint['dis_state_dict'])
    avg_gen_net = deepcopy(gen_net)
    avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict'])
    gen_avg_param = copy_params(avg_gen_net)
    del avg_gen_net

    # set writer
    assert args.exp_name
    args.path_helper = set_log_dir('exps', args.exp_name)
    logger = create_logger(args.path_helper['log_path'])
    logger.info(f'=> loaded checkpoint {checkpoint_file} (epoch {epoch})')
    logger.info(args)
    writer_dict = {
        'writer': SummaryWriter(args.path_helper['log_path']),
        'valid_global_steps': epoch // args.val_freq,
    }

    # model size
    logger.info('Param size of G = %fMB', count_parameters_in_MB(gen_net))
    logger.info('Param size of D = %fMB', count_parameters_in_MB(dis_net))
    print_FLOPs(basemodel_gen, (1, args.latent_dim), logger)
    print_FLOPs(basemodel_dis, (1, 3, args.img_size, args.img_size), logger)

    # for visualization
    if args.draw_arch:
        from utils.genotype import draw_graph_G
        draw_graph_G(genotype_G, save=True,
                     file_path=os.path.join(args.path_helper['graph_vis_path'], 'latest_G'))
    fixed_z = torch.cuda.FloatTensor(np.random.normal(0, 1, (100, args.latent_dim)))

    # test
    load_params(gen_net, gen_avg_param)
    inception_score, std, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict)
    logger.info(f'Inception score mean: {inception_score}, Inception score std: {std}, '
                f'FID score: {fid_score} || @ epoch {epoch}.')
def train():
    # Dataset
    if args.dataset_type == 'clothing1m':
        dataset = Clothing1MDatasetLoader(batchSize=args.batch_size,
                                          dataPath=args.data_path,
                                          numOfWorkers=args.data_nums_workers)
    elif args.dataset_type == 'imagenet':
        dataset = ImageNetDatasetLoader(batchSize=args.batch_size,
                                        dataPath=args.data_path,
                                        seed=args.seed,
                                        target_class_num=200,
                                        nosiy_rate=0.4,
                                        numOfWorkers=args.data_nums_workers)
    else:
        dataset = DatasetGenerator(batchSize=args.batch_size,
                                   dataPath=args.data_path,
                                   numOfWorkers=args.data_nums_workers,
                                   noise_rate=args.nr,
                                   asym=args.asym,
                                   seed=args.seed,
                                   dataset_type=args.dataset_type)
    dataLoader = dataset.getDataLoader()

    eta_min = 0
    ln_neg = 1

    if args.dataset_type == 'clothing1m':
        # Train Clothing1M
        args.epoch = 20
        args.l2_reg = 1e-3
        num_classes = 14
        fixed_cnn = torchvision.models.resnet50(num_classes=14)
        # fixed_cnn.fc = torch.nn.Linear(2048, 14)
    elif args.dataset_type == 'cifar100':
        # Train CIFAR100
        args.lr = 0.1
        args.epoch = 200
        num_classes = 100
        fixed_cnn = ResNet34(num_classes=num_classes)
        # NLNL
        if args.loss == 'NLNL':
            args.epoch = 2000
            ln_neg = 110
    elif args.dataset_type == 'cifar10':
        # Train CIFAR10
        args.epoch = 120
        num_classes = 10
        fixed_cnn = SCEModel(type='cifar10')
        # NLNL
        if args.loss == 'NLNL':
            args.epoch = 1000
    elif args.dataset_type == 'mnist':
        # Train MNIST
        args.epoch = 50
        num_classes = 10
        fixed_cnn = SCEModel(type='mnist')
        eta_min = 0.001
        args.l2_reg = 1e-3
        # NLNL
        if args.loss == 'NLNL':
            args.epoch = 720
    elif args.dataset_type == 'imagenet':
        args.epoch = 100
        args.l2_reg = 3e-5
        num_classes = 200
        fixed_cnn = torchvision.models.resnet50(num_classes=num_classes)

    logger.info("num_classes: %s" % num_classes)

    loss_options = {
        'SCE': SCELoss(alpha=args.alpha, beta=args.beta, num_classes=num_classes),
        'CE': torch.nn.CrossEntropyLoss(),
        'NCE': NormalizedCrossEntropy(scale=args.alpha, num_classes=num_classes),
        'MAE': MeanAbsoluteError(scale=args.alpha, num_classes=num_classes),
        'NMAE': NormalizedMeanAbsoluteError(scale=args.alpha, num_classes=num_classes),
        'GCE': GeneralizedCrossEntropy(num_classes=num_classes, q=args.q),
        'RCE': ReverseCrossEntropy(scale=args.alpha, num_classes=num_classes),
        'NRCE': NormalizedReverseCrossEntropy(scale=args.alpha, num_classes=num_classes),
        'NGCE': NormalizedGeneralizedCrossEntropy(scale=args.alpha, num_classes=num_classes, q=args.q),
        'NCEandRCE': NCEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes),
        'NCEandMAE': NCEandMAE(alpha=args.alpha, beta=args.beta, num_classes=num_classes),
        'GCEandMAE': GCEandMAE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'GCEandRCE': GCEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'GCEandNCE': GCEandNCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'MAEandRCE': MAEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes),
        'NGCEandNCE': NGCEandNCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'NGCEandMAE': NGCEandMAE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'NGCEandRCE': NGCEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'FocalLoss': FocalLoss(gamma=args.gamma),
        'NFL': NormalizedFocalLoss(scale=args.alpha, gamma=args.gamma, num_classes=num_classes),
        'NLNL': NLNL(num_classes=num_classes, train_loader=dataLoader['train_dataset'], ln_neg=ln_neg),
        'NFLandNCE': NFLandNCE(alpha=args.alpha, beta=args.beta, gamma=args.gamma, num_classes=num_classes),
        'NFLandMAE': NFLandMAE(alpha=args.alpha, beta=args.beta, gamma=args.gamma, num_classes=num_classes),
        'NFLandRCE': NFLandRCE(alpha=args.alpha, beta=args.beta, gamma=args.gamma, num_classes=num_classes),
        'DMI': DMILoss(num_classes=num_classes)
    }

    if args.loss in loss_options:
        criterion = loss_options[args.loss]
    else:
        raise ValueError("Unknown loss: %s" % args.loss)

    logger.info(criterion.__class__.__name__)
    logger.info("Number of Trainable Parameters %.4f" % count_parameters_in_MB(fixed_cnn))

    fixed_cnn.to(device)

    # DMI is trained in two stages: warm up with CE first, then fine-tune with DMI below.
    if args.loss == 'DMI':
        criterion = loss_options['CE']

    fixed_cnn_optimizer = torch.optim.SGD(params=fixed_cnn.parameters(),
                                          lr=args.lr,
                                          momentum=0.9,
                                          weight_decay=args.l2_reg)

    fixed_cnn_scheduler = CosineAnnealingLR(fixed_cnn_optimizer, float(args.epoch), eta_min=eta_min)
    if args.dataset_type == 'clothing1m':
        fixed_cnn_scheduler = MultiStepLR(fixed_cnn_optimizer, milestones=[5, 10], gamma=0.1)
    elif args.dataset_type == 'imagenet':
        fixed_cnn_scheduler = MultiStepLR(fixed_cnn_optimizer, milestones=[30, 60, 80], gamma=0.1)

    utilHelper = TrainUtil(checkpoint_path=args.checkpoint_path, version=args.version)
    starting_epoch = 0

    for arg in vars(args):
        logger.info("%s: %s" % (arg, getattr(args, arg)))

    train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion,
                fixed_cnn_optimizer, fixed_cnn_scheduler, utilHelper)

    if args.loss == 'DMI':
        # Stage two: switch to the DMI loss with a very small learning rate.
        criterion = loss_options['DMI']
        fixed_cnn_optimizer = torch.optim.SGD(params=fixed_cnn.parameters(),
                                              lr=1e-6,
                                              momentum=0.9,
                                              weight_decay=args.l2_reg)
        starting_epoch = 0
        fixed_cnn_scheduler = None
        train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion,
                    fixed_cnn_optimizer, fixed_cnn_scheduler, utilHelper)
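# Many of the loss options above come from the normalized-loss family (Ma et
# al. 2020, "Normalized Loss Functions for Deep Learning with Noisy Labels"),
# where a loss is divided by its sum over all possible label assignments. A
# minimal sketch of NormalizedCrossEntropy under that definition; the class
# name matches the usage above, but the body is an illustrative assumption,
# not the repository's implementation.
import torch
import torch.nn.functional as F

class NormalizedCrossEntropySketch(torch.nn.Module):
    def __init__(self, scale, num_classes):
        super().__init__()
        self.scale = scale
        self.num_classes = num_classes

    def forward(self, logits, target):
        log_probs = F.log_softmax(logits, dim=1)
        # Numerator: standard CE for the observed label.
        numer = -log_probs.gather(1, target.unsqueeze(1)).squeeze(1)
        # Denominator: CE summed over every possible label, i.e. -sum_k log p_k.
        denom = -log_probs.sum(dim=1)
        return self.scale * (numer / denom).mean()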
def main():
    args = parse_args()
    reset_config(config, args)

    # tensorboard
    logger, final_output_dir, tb_log_dir = create_logger(config, args.cfg, 'train', 'train')

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED
    torch.backends.cudnn.benchmark = True

    model = Network(config, gt.DARTS)
    model.init_weights()

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    logger.info("param size = %fMB", count_parameters_in_MB(model))
    # dump_input = torch.rand(
    #     (1, 3, config.MODEL.IMAGE_SIZE[1], config.MODEL.IMAGE_SIZE[0])
    # )
    # logger.info(get_model_summary(model, dump_input))

    gpus = [int(i) for i in config.GPUS.split(',')]
    criterion = JointsMSELoss(use_target_weight=config.LOSS.USE_TARGET_WEIGHT).to(device)
    model = nn.DataParallel(model, device_ids=gpus).to(device)
    logger.info("Logger is set - training start")

    # weights optimizer
    optimizer = torch.optim.Adam(model.parameters(), config.TRAIN.LR)

    # prepare dataloader
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.TRAIN.TRAIN_SET, True,
        transforms.Compose([transforms.ToTensor(), normalize]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.TRAIN.TEST_SET, False,
        transforms.Compose([transforms.ToTensor(), normalize]))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
                                               shuffle=True,
                                               num_workers=config.WORKERS,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
                                               shuffle=False,
                                               num_workers=config.WORKERS,
                                               pin_memory=True)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.TRAIN.LR_STEP,
                                                        config.TRAIN.LR_FACTOR)

    # training loop
    best_top1 = 0.
    best_model = False
    for epoch in range(config.TRAIN.EPOCHS):
        # training
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        # validation
        top1 = validate(config, valid_loader, valid_dataset, model, criterion,
                        final_output_dir, tb_log_dir, writer_dict)

        # save
        if best_top1 < top1:
            best_top1 = top1
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_state_dict': model.module.state_dict(),
            'perf': best_top1,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir)

        lr_scheduler.step()

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info('=> saving final model state to {}'.format(final_model_state_file))
    logger.info('=> best accuracy is {}'.format(best_top1))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
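# JointsMSELoss above is the standard heatmap-regression criterion used in
# pose-estimation codebases (e.g. HRNet-style): per-joint MSE between predicted
# and ground-truth heatmaps, optionally scaled by a per-joint visibility
# weight. A minimal sketch under those assumptions; not necessarily this
# repository's exact implementation.
import torch
import torch.nn as nn

class JointsMSELossSketch(nn.Module):
    def __init__(self, use_target_weight):
        super().__init__()
        self.criterion = nn.MSELoss()
        self.use_target_weight = use_target_weight

    def forward(self, output, target, target_weight):
        # output/target: (batch, num_joints, H, W); target_weight: (batch, num_joints, 1)
        batch_size, num_joints = output.size(0), output.size(1)
        heatmaps_pred = output.reshape(batch_size, num_joints, -1)
        heatmaps_gt = target.reshape(batch_size, num_joints, -1)
        loss = 0.0
        for j in range(num_joints):
            pred, gt = heatmaps_pred[:, j], heatmaps_gt[:, j]
            if self.use_target_weight:
                # Zero out joints that are not annotated/visible.
                w = target_weight[:, j]
                loss = loss + self.criterion(pred * w, gt * w)
            else:
                loss = loss + self.criterion(pred, gt)
        return loss / num_joints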
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)

    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)

    num_gpus = torch.cuda.device_count()
    genotype = eval("core.genotypes.%s" % args.arch)
    print('---------Genotype---------')
    logging.info(genotype)
    print('--------------------------')

    model = Network(args.init_channels, args.input_channels, num_classes,
                    args.layers, args.auxiliary, genotype)
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.cuda()

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(num_classes, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(
        model.parameters(),
        args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay
    )

    data_augmentations = args.data_aug
    if data_augmentations is None:
        data_augmentations = transforms.ToTensor()
    elif isinstance(data_augmentations, list):
        data_augmentations = transforms.Compose(data_augmentations)
    elif not isinstance(data_augmentations, transforms.Compose):
        raise NotImplementedError

    # Dataset
    train_data = K49(args.data_dir, True, data_augmentations)
    test_data = K49(args.data_dir, False, data_augmentations)

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size, shuffle=True,
        pin_memory=True, num_workers=2)
    valid_queue = torch.utils.data.DataLoader(
        test_data, batch_size=args.batch_size, shuffle=False,
        pin_memory=True, num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs))

    best_acc_top1 = 0
    for epoch in range(args.epochs):
        if args.lr_scheduler == 'cosine':
            scheduler.step()
            current_lr = scheduler.get_lr()[0]
        elif args.lr_scheduler == 'linear':
            current_lr = adjust_lr(optimizer, epoch)
        else:
            print('Wrong lr type, exit')
            sys.exit(1)
        logging.info('Epoch: %d lr %e', epoch, current_lr)

        # Linear learning-rate warm-up over the first five epochs for large batches.
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            logging.info('Warming-up Epoch: %d, LR: %e', epoch, current_lr * (epoch + 1) / 5.0)

        if num_gpus > 1:
            model.module.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        else:
            model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        epoch_start = time.time()
        train_acc, train_obj = train(train_queue, model, criterion_smooth, optimizer)
        logging.info('Train_acc: %f', train_acc)

        valid_acc_top1, valid_obj = infer(valid_queue, model, criterion)
        logging.info('Valid_acc_top1: %f', valid_acc_top1)

        epoch_duration = time.time() - epoch_start
        logging.info('Epoch time: %ds.', epoch_duration)

        is_best = False
        if valid_acc_top1 > best_acc_top1:
            best_acc_top1 = valid_acc_top1
            is_best = True

        utils.save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_acc_top1': best_acc_top1,
            'optimizer': optimizer.state_dict(),
        }, is_best, log_path)
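# CrossEntropyLabelSmooth above is the label-smoothing criterion commonly used
# in DARTS-style training scripts: the one-hot target is mixed with a uniform
# distribution over classes before taking the cross entropy. A minimal sketch
# under that assumption; the constructor signature follows the usage above.
import torch
import torch.nn as nn

class CrossEntropyLabelSmoothSketch(nn.Module):
    def __init__(self, num_classes, epsilon):
        super().__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets):
        log_probs = self.logsoftmax(inputs)
        # One-hot targets smoothed toward the uniform distribution.
        targets = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1)
        targets = (1 - self.epsilon) * targets + self.epsilon / self.num_classes
        return (-targets * log_probs).mean(0).sum()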
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    genotype = eval("core.genotypes.%s" % args.arch)
    print('---------Genotype---------')
    logging.info(genotype)
    print('--------------------------')

    # if args.set == "KMNIST":
    #     model = NetworkKMNIST(args.init_channels, args.input_channels, num_classes,
    #                           args.layers, args.auxiliary, genotype)
    # elif args.set == "K49":
    model = Network(args.init_channels, args.input_channels, num_classes,
                    args.layers, args.auxiliary, genotype)
    model = model.cuda()

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Data augmentations
    train_transform, valid_transform = utils.data_transforms_Kuzushiji(args)

    # Dataset
    if args.set == "KMNIST":
        train_data = KMNIST(args.data_dir, True, train_transform)
        test_data = KMNIST(args.data_dir, False, valid_transform)
    elif args.set == "K49":
        train_data = K49(args.data_dir, True, train_transform)
        test_data = K49(args.data_dir, False, valid_transform)
    else:
        raise ValueError("Unknown Dataset %s" % args.set)

    train_queue = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                              shuffle=True, pin_memory=True, num_workers=2)
    valid_queue = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size,
                                              shuffle=False, pin_memory=True, num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs))

    best_acc = 0.0
    for epoch in range(args.epochs):
        scheduler.step()
        logging.info('epoch %d/%d lr %e', epoch, args.epochs, scheduler.get_lr()[0])

        model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer)
        logging.info('train_acc %f', train_acc)

        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        if valid_acc > best_acc:
            best_acc = valid_acc
        logging.info('valid_acc %f, best_acc %f', valid_acc, best_acc)

        utils.save(model, os.path.join(log_path, 'weights.pt'))
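# The two loops above anneal model.drop_path_prob linearly from 0 up to
# args.drop_path_prob over training. In DARTS-style networks, drop-path
# randomly zeroes an entire path's output per sample during training and
# rescales the survivors. A minimal sketch of that operation; the helper name
# drop_path follows the convention in DARTS codebases, but this body is an
# illustrative assumption, not necessarily these repositories' exact code.
import torch

def drop_path_sketch(x, drop_prob):
    if drop_prob > 0.:
        keep_prob = 1. - drop_prob
        # One Bernoulli draw per sample: keep or drop the whole path.
        mask = torch.bernoulli(x.new_full((x.size(0), 1, 1, 1), keep_prob))
        x = x / keep_prob * mask
    return x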