def main_worker(local_rank, args): args.local_rank = local_rank # prepare dist environment dist.init_process_group(backend='nccl', rank=args.local_rank, world_size=args.world_size) torch.cuda.set_device(args.local_rank) network = ShuffleNetV1(input_size=cfg.image_size, n_class=cfg.num_classes, model_size='2.0x', group=3) network = network.cuda() criterion = CrossEntropyLabelSmooth(cfg.num_classes, 0.1) optimizer = optim.SGD(network.parameters(), lr=cfg.lr_init, momentum=cfg.SGD_momentum, weight_decay=cfg.SGD_weight_decay) dataloader_train = create_dataset_pytorch_imagenet_dist_train( data_path=args.data_path+'train', local_rank=local_rank, n_workers=cfg.n_workers) dataloader_test = create_dataset_pytorch_imagenet(data_path=args.data_path+'val', is_train=False, n_workers=cfg.n_workers) step_per_epoch = len(dataloader_train) total_iters = step_per_epoch * cfg.epoch_size scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda step : (1.0 - step * 1.0 / total_iters) if step <= total_iters else 0, last_epoch=-1) summary_writer = None if local_rank == 0: summary_writer = SummaryWriter(log_dir='./summary') trainer = Trainer(network=network, criterion=criterion, optimizer=optimizer, scheduler=scheduler, dataloader_train=dataloader_train, dataloader_test=dataloader_test, summary_writer=summary_writer, epoch_size=cfg.epoch_size, ckpt_path=args.ckpt_path, local_rank=local_rank) for epoch_id in range(cfg.epoch_size): trainer.step() if local_rank == 0: summary_writer.close()
def __init__(self, context: det.TrialContext) -> None:
    """Keep the trial context and build the label-smoothing loss from hparams."""
    self.context = context
    self.data_config = context.get_data_config()
    num_classes = context.get_hparam("num_classes")
    smoothing = context.get_hparam("label_smoothing_rate")
    self.criterion = CrossEntropyLabelSmooth(num_classes, smoothing)
    # no epoch has been processed yet
    self.last_epoch_idx = -1
def __init__(self, context: PyTorchTrialContext) -> None:
    """Set up loss, model, optimizer and LR scheduler, each wrapped by Determined."""
    self.context = context
    self.data_config = context.get_data_config()

    hparam = context.get_hparam  # shorthand for repeated hyper-parameter lookups
    self.criterion = CrossEntropyLabelSmooth(
        hparam("num_classes"),
        hparam("label_smoothing_rate"),
    )
    # no epoch has been processed yet
    self.last_epoch_idx = -1

    # The model is wrapped first so the optimizer can be built on its parameters.
    self.model = self.context.wrap_model(self.build_model_from_config())
    sgd = torch.optim.SGD(
        self.model.parameters(),
        lr=hparam("learning_rate"),
        momentum=hparam("momentum"),
        weight_decay=hparam("weight_decay"),
    )
    self.optimizer = self.context.wrap_optimizer(sgd)
    self.lr_scheduler = self.context.wrap_lr_scheduler(
        self.build_lr_scheduler_from_config(self.optimizer),
        step_mode=LRScheduler.StepMode.STEP_EVERY_EPOCH,
    )
def train(self):
    """Fine-tune an ImageNet-pretrained ResNet-101 source model on a 12-class dataset.

    Builds the model, a per-layer-group optimizer (full lr for the new head,
    0.1x for pretrained layers), train/val loaders, then runs the epoch loop,
    saving the best/last checkpoints under ./model_source and logging metrics.
    """
    torch.multiprocessing.set_sharing_strategy('file_system')
    args = arg_parser()
    logger = log()
    model_root = './model_source'
    if not os.path.exists(model_root):
        os.mkdir(model_root)
    time_stamp_launch = time.strftime('%Y%m%d') + '-' + time.strftime('%H%M')
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    n_gpus = len(args.gpu.split(','))

    # set parameters
    path = args.data_root
    label_file = args.label_file
    batch_size = args.batchsize
    epochs = args.max_epoch
    best_acc = 0
    dataset_name = path.split('/')[-2]
    logger.info(dataset_name + '_' + time_stamp_launch +
                'model : resnet101 lr: %s' % args.lr)
    logger.info('dataset is: ' + dataset_name)

    # ImageNet-pretrained backbone with a weight-normalized 12-way classifier head
    net = resnet101(pretrained=True)
    input_dim = net.fc.in_features
    net.fc = weightNorm(nn.Linear(input_dim, 12), name="weight")
    net = net.cuda()

    # new head trains at the full lr; pretrained layers at 0.1x lr
    param_group = []
    for k, v in net.named_parameters():
        if k[:2] == 'fc':
            param_group += [{'params': v, 'lr': args.lr}]
        else:
            param_group += [{'params': v, 'lr': args.lr * 0.1}]
    loss = CrossEntropyLabelSmooth(num_classes=12).cuda()
    optimizer = optim.SGD(param_group, momentum=0.9, weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer, milestones=args.MultiStepLR, gamma=0.1)

    # training dataset
    transform_train = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),  # ImageNet RGB mean/std
    ])
    train_dataset = visDataset(path, label_file, train=True,
                               transform=transform_train)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        num_workers=2 * n_gpus if n_gpus <= 2 else 2)
    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),  # ImageNet RGB mean/std
    ])
    val_dataset = visDataset(path, label_file, train=False,
                             transform=transform_test)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False,
        num_workers=2 * n_gpus if n_gpus <= 2 else 2)

    for i in range(epochs):
        accnum = 0.0
        total = 0.0
        running_loss = []
        net.train()
        for j, (img_data, img_label, ind) in enumerate(train_loader):
            img_data = img_data.cuda()
            img_label = img_label.cuda()
            r_loss, correct_num, bs_num = self.train_process(
                net, optimizer, img_data, img_label, loss)
            running_loss += [r_loss]
            total += bs_num
            accnum += correct_num
        scheduler.step()
        avg_loss = np.mean(running_loss)
        # fix: np.float was removed in NumPy >= 1.24; builtin float() is equivalent
        temp_acc = 100 * float(accnum) / float(total)
        logger.info("Epoch %d running_loss=%.3f" % (i + 1, avg_loss))
        logger.info(
            "Accuracy of the prediction on the train dataset : %f %%" % (temp_acc))

        # valuate the model
        acc = val_source(net, val_loader)
        if acc >= best_acc:
            logger.info('saving the best model!')
            # NOTE(review): filenames say "resnet50" although the backbone is
            # resnet101 — kept as-is so downstream loaders don't break; confirm.
            torch.save(
                net, './model_source/' + time_stamp_launch + '-' +
                dataset_name + '9_1_resnet50_best.pkl')
            best_acc = acc
        else:
            torch.save(
                net, './model_source/' + time_stamp_launch + '-' +
                dataset_name + '9_1_resnet50_last.pkl')
        logger.info('best acc is : %.04f, acc is : %.04f' % (best_acc, acc))
        logger.info('================================================')
    logger.info("Finished Training")
def main():
    """Evolutionary NAS search driver over NAS-Bench-201 cell space.

    Loads CIFAR-10/100 or ImageNet16-120 with a fixed train/valid split file,
    builds a TinyNetwork supernet, then alternates supernet training with
    evolutionary selection, dumping the population to ./record_*/NNNNNN-ep.txt.
    """
    #LOAD CONFIGS################################################################
    args = get_args()
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_no
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(os.path.join('log/train-{}{:02}{}'.format(
        local_time.tm_year % 2000, local_time.tm_mon, t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
        # cudnn.enabled=True
        # NOTE(review): manual_seed is given a str; it only works because torch
        # int()-converts the argument internally — confirm and prefer the int.
        torch.cuda.manual_seed(str(args.rand_seed))
    random.seed(args.rand_seed)

    #LOAD DATA###################################################################
    def convert_param(original_lists):
        # Decode one [ctype, value] pair from the split file into typed value(s);
        # scalar inputs come back as scalars, list inputs as lists.
        ctype, value = original_lists[0], original_lists[1]
        is_list = isinstance(value, list)
        if not is_list:
            value = [value]
        outs = []
        for x in value:
            if ctype == 'int':
                x = int(x)
            elif ctype == 'str':
                x = str(x)
            elif ctype == 'bool':
                x = bool(int(x))
            elif ctype == 'float':
                x = float(x)
            elif ctype == 'none':
                if x.lower() != 'none':
                    raise ValueError('For the none type, the value must be none instead of {:}'.format(x))
                x = None
            else:
                raise TypeError('Does not know this type : {:}'.format(ctype))
            outs.append(x)
        if not is_list:
            outs = outs[0]
        return outs

    if args.dataset == 'cifar100':
        mean = [x / 255 for x in [129.3, 124.1, 112.4]]
        std = [x / 255 for x in [68.2, 65.4, 70.4]]
        lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4),
                 transforms.ToTensor(), transforms.Normalize(mean, std)]
        transform_train = transforms.Compose(lists)
        transform_test = transforms.Compose([transforms.ToTensor(),
                                             transforms.Normalize(mean, std)])
        # fixed train/valid index split shared across runs
        with open('../data/cifar-split.txt', 'r') as f:
            data = json.load(f)
        content = {k: convert_param(v) for k, v in data.items()}
        Arguments = namedtuple('Configure', ' '.join(content.keys()))
        content = Arguments(**content)
        cifar_split = content
        train_split, valid_split = cifar_split.train, cifar_split.valid
        print(len(train_split), len(valid_split))
        train_dataset = datasets.CIFAR100(root='../data', train=True,
                                          download=True, transform=transform_train)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size, shuffle=False,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(train_split),
            num_workers=4, pin_memory=use_gpu)
        train_dataprovider = DataIterator(train_loader)
        # validation also draws from the train=True set via the valid index split
        val_loader = torch.utils.data.DataLoader(
            datasets.CIFAR100(root='../data', train=True, download=True,
                              transform=transform_test),
            batch_size=250, shuffle=False,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
            num_workers=4, pin_memory=use_gpu
        )
        val_dataprovider = DataIterator(val_loader)
        print('load data successfully')
        CLASS = 100
    elif args.dataset == 'cifar10':
        mean = [x / 255 for x in [125.3, 123.0, 113.9]]
        std = [x / 255 for x in [63.0, 62.1, 66.7]]
        lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4),
                 transforms.ToTensor(), transforms.Normalize(mean, std)]
        transform_train = transforms.Compose(lists)
        transform_test = transforms.Compose([transforms.ToTensor(),
                                             transforms.Normalize(mean, std)])
        with open('../data/cifar-split.txt', 'r') as f:
            data = json.load(f)
        content = {k: convert_param(v) for k, v in data.items()}
        Arguments = namedtuple('Configure', ' '.join(content.keys()))
        content = Arguments(**content)
        cifar_split = content
        train_split, valid_split = cifar_split.train, cifar_split.valid
        print(len(train_split), len(valid_split))
        train_dataset = datasets.CIFAR10(root='../data', train=True,
                                         download=True, transform=transform_train)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size, shuffle=False,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(train_split),
            num_workers=4, pin_memory=use_gpu)
        train_dataprovider = DataIterator(train_loader)
        val_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(root='../data', train=True, download=True,
                             transform=transform_test),
            batch_size=250, shuffle=False,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
            num_workers=4, pin_memory=use_gpu
        )
        val_dataprovider = DataIterator(val_loader)
        print('load data successfully')
        CLASS = 10
    elif args.dataset == 'image16':
        mean = [x / 255 for x in [122.68, 116.66, 104.01]]
        std = [x / 255 for x in [63.22, 61.26, 65.09]]
        transform_test = transforms.Compose([transforms.ToTensor(),
                                             transforms.Normalize(mean, std)])
        with open('../data/ImageNet16-120-split.txt', 'r') as f:
            data = json.load(f)
        content = {k: convert_param(v) for k, v in data.items()}
        Arguments = namedtuple('Configure', ' '.join(content.keys()))
        content = Arguments(**content)
        img_split = content
        train_split, valid_split = img_split.train, img_split.valid
        # truncate splits to whole batches (train: batch_size, valid: 250)
        train_split = train_split[:len(train_split) // args.batch_size * args.batch_size]
        valid_split = valid_split[:len(valid_split) // 250 * 250]
        print(len(train_split), len(valid_split))
        # NOTE(review): both loaders use the training portion (True) with the
        # test transform — confirm this mirrors the reference implementation.
        train_dataset = ImageNet16('../data', True, transform_test, 120)
        test_dataset = ImageNet16('../data', True, transform_test, 120)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size, shuffle=False,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(train_split),
            num_workers=4, pin_memory=use_gpu)
        train_dataprovider = DataIterator(train_loader)
        val_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=250, shuffle=False,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
            num_workers=4, pin_memory=use_gpu
        )
        val_dataprovider = DataIterator(val_loader)
        print('load data successfully')
        CLASS = 120
    print(CLASS)
    print(args.init_channels, args.stacks // 3)
    # NAS-Bench-201 style supernet: N cells per stage, 4 nodes per cell
    model = TinyNetwork(C=args.init_channels, N=args.stacks // 3, max_nodes=4,
                        num_classes=CLASS, search_space=NAS_BENCH_201,
                        affine=False, track_running_stats=False).cuda()
    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    criterion_smooth = CrossEntropyLabelSmooth(CLASS, 0.1)
    if use_gpu:
        loss_function = criterion_smooth.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion_smooth
        device = torch.device("cpu")
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.total_iters)
    model = model.to(device)
    all_iters = 0
    if args.auto_continue:
        # resume the latest checkpoint and fast-forward the LR schedule
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            print('load from checkpoint')
            for i in range(iters):
                scheduler.step()
    # stash shared training state on args for train()/validate()
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_dataprovider = train_dataprovider
    args.val_dataprovider = val_dataprovider
    args.evo_controller = evolutionary(args.max_population, args.select_number,
                                       args.mutation_len, args.mutation_number,
                                       args.p_opwise, args.evo_momentum)
    path = './record_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(
        args.dataset, args.stacks, args.init_channels, args.total_iters,
        args.warmup_iters, args.max_population, args.select_number,
        args.mutation_len, args.mutation_number, args.val_interval,
        args.p_opwise, args.evo_momentum, args.rand_seed)
    logging.info(path)
    model.current_N = 1
    while all_iters < args.total_iters:
        # progressively enable deeper parts of the supernet at fixed milestones
        if all_iters in [15000, 30000, 45000, 60000]:
            # if all_iters in [50,100,150,200]:
            #     print("----------")
            model.current_N += 1
        if all_iters > 1 and all_iters % args.val_interval == 0:
            # snapshot the current population to disk
            results = []
            for structure_father in args.evo_controller.group:
                results.append([structure_father.structure,
                                structure_father.loss,
                                structure_father.count])
            if not os.path.exists(path):
                os.mkdir(path)
            with open(path + '/%06d-ep.txt' % all_iters, 'w') as tt:
                json.dump(results, tt)
            if all_iters >= args.warmup_iters:  # warmup
                args.evo_controller.select()
        all_iters = train(model, device, args,
                          val_interval=args.val_interval, bn_process=False,
                          all_iters=all_iters)
    validate(model, device, args, all_iters=all_iters)
    # final population dump
    results = []
    for structure_father in args.evo_controller.group:
        results.append([structure_father.structure,
                        structure_father.loss,
                        structure_father.count])
    with open(path + '/%06d-ep.txt' % all_iters, 'w') as tt:
        json.dump(results, tt)
parser.add_argument("--test-batch-size", type=int, default=512) parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() # use a fixed set of image will improve the performance torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) np.random.seed(args.seed) random.seed(args.seed) torch.backends.cudnn.deterministic = True assert torch.cuda.is_available() model = ShuffleNetV2OneShot() criterion = CrossEntropyLabelSmooth(1000, 0.1) get_and_apply_next_architecture(model) model.load_state_dict(load_and_parse_state_dict(filepath=args.checkpoint)) model.cuda() train_loader = get_imagenet_iter_dali( "train", args.imagenet_dir, args.train_batch_size, args.workers, spos_preprocessing=args.spos_preprocessing, seed=args.seed, device_id=0) val_loader = get_imagenet_iter_dali( "val", args.imagenet_dir,
type=str, default="./checkpoint", help='path where the checkpoint to be saved') parser.add_argument('--device_id', type=int, default=0, help='device id of GPU. (Default: 0)') args = parser.parse_args() device = torch.device('cuda:' + str(args.device_id)) network = ShuffleNetV1(input_size=cfg.image_size, n_class=cfg.num_classes, model_size='2.0x', group=3) network.to(device) criterion = CrossEntropyLabelSmooth(cfg.num_classes, 0.1) optimizer = optim.SGD(network.parameters(), lr=cfg.lr_init, momentum=cfg.SGD_momentum, weight_decay=cfg.SGD_weight_decay) dataloader_train = create_dataset_pytorch_cifar10(args.data_path) dataloader_test = create_dataset_pytorch_cifar10(args.data_path, is_train=False) total_iters = len(dataloader_train) * cfg.epoch_size scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda step: (1.0 - step * 1.0 / total_iters) if step <= total_iters else 0, last_epoch=-1) summary_writer = SummaryWriter(log_dir='./summary')
def prepare(args, RCV_CONFIG):
    """Apply an HPO-received config onto args, then build data, model, optimizer,
    loss and LR scheduler for one tuning trial.

    Returns (model, device, train_step, valid_step); interface unchanged.
    """
    # copy tuned hyper-parameters from the received config onto args
    args.momentum = RCV_CONFIG['momentum']
    args.bn_process = RCV_CONFIG['bn_process'] == 'True'  # string flag -> bool
    args.learning_rate = RCV_CONFIG['learning_rate']
    args.weight_decay = RCV_CONFIG['weight_decay']
    args.label_smooth = RCV_CONFIG['label_smooth']
    args.lr_scheduler = RCV_CONFIG['lr_scheduler']
    args.randAugment = RCV_CONFIG['randAugment'] == 'True'

    use_gpu = torch.cuda.is_available()

    if args.cifar100:
        # NOTE(review): despite the cifar100 flag this loads "cifar10" (the
        # cifar100 call was commented out in the original) — confirm intent.
        train_dataprovider, val_dataprovider, train_step, valid_step = dataset_cifar.get_dataset(
            "cifar10", batch_size=args.batch_size, RandA=args.randAugment)
        print('load data successfully')
    else:
        # ImageNet-style folder datasets with BGR preprocessing.
        # NOTE(review): this branch never defines train_step/valid_step, so the
        # final return would raise NameError — confirm this path is unused.
        assert os.path.exists(args.train_dir)
        from dataset import DataIterator, SubsetSampler, OpencvResize, ToBGRTensor
        train_dataset = datasets.ImageFolder(
            args.train_dir,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
                transforms.RandomHorizontalFlip(0.5),
                ToBGRTensor(),
            ]))
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size, shuffle=True,
            num_workers=1, pin_memory=use_gpu)
        train_dataprovider = DataIterator(train_loader)
        assert os.path.exists(args.val_dir)
        val_loader = torch.utils.data.DataLoader(
            datasets.ImageFolder(
                args.val_dir,
                transforms.Compose([
                    OpencvResize(256),
                    transforms.CenterCrop(224),
                    ToBGRTensor(),
                ])),
            batch_size=200, shuffle=False, num_workers=1, pin_memory=use_gpu)
        val_dataprovider = DataIterator(val_loader)
        print('load data successfully')

    # Special for cifar
    from network_origin import cifar_fast
    model = cifar_fast(input_size=32, n_class=100)

    # Optimizer
    optimizer = get_optim(args, model)

    # Label Smooth: only when a positive smoothing rate was supplied
    if args.label_smooth > 0:
        criterion = CrossEntropyLabelSmooth(100, args.label_smooth)
    else:
        criterion = nn.CrossEntropyLoss()

    if args.lr_scheduler == 'Lambda':
        # linear decay to 0 over epochs * steps-per-epoch
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lambda step: (1.0 - step / (args.epochs * train_step))
            if step <= (args.epochs * train_step) else 0,
            last_epoch=-1)
    elif args.lr_scheduler == 'Cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs, eta_min=1e-8, last_epoch=-1)
    else:
        # fix: an unknown value previously left `scheduler` unbound and crashed
        # later with a NameError; fail fast with a clear message instead.
        raise ValueError('unknown lr_scheduler: %s' % args.lr_scheduler)

    if use_gpu:
        model = nn.DataParallel(model)
        cudnn.benchmark = True
        loss_function = criterion.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion
        device = torch.device("cpu")
    model = model.to(device)

    # stash shared training state on args for the train/validate loop
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_dataprovider = train_dataprovider
    args.val_dataprovider = val_dataprovider
    args.best_acc = 0.0
    args.all_iters = 1
    return model, device, train_step, valid_step
def get_cand_err(model, cand, args):
    """Evaluate one candidate architecture `cand` on the shared supernet `model`.

    Lazily creates the global train/val data providers, runs forward passes on
    training batches (BN sanitize), then measures top-1/top-5 on validation
    batches. Returns (top1_err, top5_err) as fractions in [0, 1].
    """
    global train_dataprovider, val_dataprovider
    if train_dataprovider is None:
        use_gpu = False  # NOTE(review): unused — providers below hard-code use_gpu=True
        train_dataprovider = get_train_dataprovider(
            args.train_batch_size, use_gpu=True, num_workers=32)
        val_dataprovider = get_val_dataprovider(
            args.test_batch_size, use_gpu=True, num_workers=32)
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    max_train_iters = args.max_train_iters
    max_test_iters = args.max_test_iters
    print('clear bn statics....')
    # for m in model.modules():
    #     if isinstance(m, torch.nn.BatchNorm2d):
    #         m.running_mean = torch.zeros_like(m.running_mean)
    #         m.running_var = torch.ones_like(m.running_var)
    print('train bn with training set (BN sanitize) ....')
    # meta_model = ShuffleNetV2_OneShot()
    # meta_model = nn.DataParallel(meta_model)
    # meta_model = meta_model.to(device)
    # for p, q in zip(model.parameters(), meta_model.parameters()):
    #     if p is not None:
    #         q = p.clone()
    # NOTE(review): optimizer/loss are built but never stepped — the backward
    # pass below is commented out, so weights are not updated here.
    optimizer = torch.optim.SGD(get_parameters(model), lr=0.001)
    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)
    loss_function = criterion_smooth.cuda()
    model.train()
    for step in tqdm.tqdm(range(max_train_iters)):
        data, target = train_dataprovider.next()
        target = target.type(torch.LongTensor)
        data, target = data.to(device), target.to(device)
        # forward in train mode only: refreshes BN running statistics for `cand`
        output = model(data, cand)
        # loss = loss_function(output, target)
        # optimizer.zero_grad()
        # loss.backward()
        # optimizer.step()
        del data, target, output  # free GPU memory eagerly between batches
    top1 = 0
    top5 = 0
    total = 0
    print('starting test....')
    model.eval()
    for step in tqdm.tqdm(range(max_test_iters)):
        data, target = val_dataprovider.next()
        batchsize = data.shape[0]
        target = target.type(torch.LongTensor)
        data, target = data.to(device), target.to(device)
        logits = model(data, cand)
        prec1, prec5 = accuracy(logits, target, topk=(1, 5))
        # accumulate batch-weighted accuracy percentages
        top1 += prec1.item() * batchsize
        top5 += prec5.item() * batchsize
        total += batchsize
        del data, target, logits, prec1, prec5
    # convert mean accuracy (percent) to error rate (fraction)
    top1, top5 = top1 / total, top5 / total
    top1, top5 = 1 - top1 / 100, 1 - top5 / 100
    print('top1: {:.2f} top5: {:.2f}'.format(top1 * 100, top5 * 100))
    return top1, top5
def _main():
    """Run SPOS evolutionary architecture search via an NNI Retiarii experiment.

    Parses CLI options, seeds all RNGs for reproducibility, builds the one-shot
    supernet and a regularized-evolution strategy (optionally latency-filtered),
    runs the experiment, and dumps the exported top architectures to JSON files.
    """
    parser = argparse.ArgumentParser("SPOS Evolutional Search")
    parser.add_argument("--port", type=int, default=8084)
    parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet")
    parser.add_argument("--checkpoint", type=str,
                        default="./data/checkpoint-150000.pth.tar")
    parser.add_argument(
        "--spos-preprocessing", action="store_true", default=False,
        help="When true, image values will range from 0 to 255 and use BGR "
        "(as in original repo).")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--workers", type=int, default=6)
    parser.add_argument("--train-batch-size", type=int, default=128)
    parser.add_argument("--train-iters", type=int, default=200)
    parser.add_argument("--test-batch-size", type=int, default=512)
    parser.add_argument("--log-frequency", type=int, default=10)
    parser.add_argument("--label-smoothing", type=float, default=0.1)
    parser.add_argument("--evolution-sample-size", type=int, default=10)
    parser.add_argument("--evolution-population-size", type=int, default=50)
    parser.add_argument("--evolution-cycles", type=int, default=10)
    parser.add_argument(
        "--latency-filter", type=str, default=None,
        help="Apply latency filter by calling the name of the applied hardware.")
    parser.add_argument("--latency-threshold", type=float, default=100)
    args = parser.parse_args()

    # fixed seeds across torch/numpy/random for reproducible search
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    assert torch.cuda.is_available()
    base_model = ShuffleNetV2OneShot()
    criterion = CrossEntropyLabelSmooth(1000, args.label_smoothing)
    if args.latency_filter:
        # filter candidates whose predicted latency on the named hardware
        # exceeds the threshold
        latency_filter = LatencyFilter(threshold=args.latency_threshold,
                                       predictor=args.latency_filter)
    else:
        latency_filter = None
    evaluator = FunctionalEvaluator(evaluate_acc, criterion=criterion, args=args)
    evolution_strategy = strategy.RegularizedEvolution(
        model_filter=latency_filter,
        sample_size=args.evolution_sample_size,
        population_size=args.evolution_population_size,
        cycles=args.evolution_cycles)
    exp = RetiariiExperiment(base_model, evaluator, strategy=evolution_strategy)
    exp_config = RetiariiExeConfig('local')
    exp_config.trial_concurrency = 2
    exp_config.trial_gpu_number = 1
    exp_config.max_trial_number = args.evolution_cycles
    exp_config.training_service.use_active_gpu = False
    exp_config.execution_engine = 'base'
    exp_config.dummy_input = [1, 3, 224, 224]  # NCHW shape used for tracing
    exp.run(exp_config, args.port)
    print('Exported models:')
    for i, model in enumerate(exp.export_top_models(formatter='dict')):
        print(model)
        with open(f'architecture_final_{i}.json', 'w') as f:
            json.dump(get_archchoice_by_model(model), f, indent=4)
def pipeline(args, reporter):
    """Train one supernet for a single HPO task and report accuracy via `reporter`.

    Sets up per-task logging, CIFAR data, a block-count-dependent supernet, an
    optionally per-parameter-group-randomized SGD optimizer, and runs the
    train loop until args.total_iters.

    NOTE(review): `args` is accessed both as attributes (args.signal) and as a
    mapping (args['signal']) — presumably an AutoGluon-style dict-args hybrid;
    confirm. `arg` (no s) is also referenced below — likely a module-level
    global or a typo for `args`; verify before touching.
    """
    # Log for one Supernet
    floder = '{}/task_id_{}'.format(args.signal, args.task_id)
    path = os.path.join(arg.local, 'save', floder)  # NOTE(review): `arg` vs `args`
    if not os.path.isdir(path):
        os.makedirs(path)
    args.path = path
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('{}/log'.format(path)):
        os.mkdir('{}/log'.format(path))
    fh = logging.FileHandler(os.path.join(
        '{}/log/{}-task_id{}-train-{}{:02}{}'.format(
            path, args['signal'], args['task_id'],
            local_time.tm_year % 2000, local_time.tm_mon, t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    logging.info('{}-task_id: {}'.format(args.signal, args.task_id))
    # resource
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
    # load dataset (RandAugment parameters come from args)
    if args.num_classes == 10:
        dataset_train, dataset_valid = dataset_cifar.get_dataset(
            "cifar10", N=args.randaug_n, M=args.randaug_m, RandA=args.RandA)
    elif args.num_classes == 100:
        dataset_train, dataset_valid = dataset_cifar.get_dataset(
            "cifar100", N=args.randaug_n, M=args.randaug_m, RandA=args.RandA)
    # split=0.0 means: no stratified carve-out, use the full training set
    split = 0.0
    split_idx = 0
    train_sampler = None
    if split > 0.0:
        sss = StratifiedShuffleSplit(n_splits=5, test_size=split, random_state=0)
        sss = sss.split(list(range(len(dataset_train))), dataset_train.targets)
        for _ in range(split_idx + 1):
            train_idx, valid_idx = next(sss)
        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetSampler(valid_idx)
    else:
        valid_sampler = SubsetSampler([])
    train_loader = torch.utils.data.DataLoader(
        dataset_train, batch_size=args.batch_size,
        shuffle=True if train_sampler is None else False,
        num_workers=32, pin_memory=True,
        sampler=train_sampler, drop_last=True)
    # valid_loader = torch.utils.data.DataLoader(
    #     dataset_train, batch_size=args.batch_size, shuffle=False, num_workers=16, pin_memory=True,
    #     sampler=valid_sampler, drop_last=False)
    valid_loader = torch.utils.data.DataLoader(
        dataset_valid, batch_size=args.batch_size, shuffle=False,
        num_workers=16, pin_memory=True, drop_last=False)
    train_dataprovider = DataIterator(train_loader)
    val_dataprovider = DataIterator(valid_loader)
    args.test_interval = len(valid_loader)
    args.val_interval = int(len(dataset_train) / args.batch_size)  # step
    print('load data successfully')
    # network: architecture family is selected by the block count
    if args.block == 5:
        model = ShuffleNetV2_OneShot_cifar(block=args['block'],
                                           n_class=args.num_classes)
    elif args.block == 12:
        model = SuperNetwork(shadow_bn=True, layers=args['block'],
                             classes=args.num_classes)
        print("param size = %fMB" % count_parameters_in_MB(model))
    elif args.block == 4:
        model = Network(num_classes=args.num_classes)
        # model = Network(net()).to(device).half()
    elif args.block == 3:
        model = Network_cifar(num_classes=args.num_classes)
    if args.different_hpo:
        # per-parameter-group random learning rates: one lr per (stage, choice)
        # plus one base lr; the sampling range depends on the architecture
        if args['block'] == 5:
            nums_lr_group = args['block'] * args['choice'] + 1
            lr_group = list(np.random.uniform(0.4, 0.8)
                            for i in range(nums_lr_group))
            optimizer = torch.optim.SGD(
                shuffle_dif_lr_parameters(model, lr_group),
                momentum=args.momentum, weight_decay=args.weight_decay)
        elif args['block'] == 12:
            nums_lr_group = 145  # 12 stage * 12 choice + 1 base lr
            lr_group = list(np.random.uniform(0.1, 0.3)
                            for i in range(nums_lr_group))
            optimizer = torch.optim.SGD(
                mobile_dif_lr_parameters(model, lr_group),
                momentum=args.momentum, weight_decay=args.weight_decay)
        elif args['block'] == 4:
            nums_lr_group = 4
            lr_l, lr_r = float(arg.lr_range.split(',')[0]), float(arg.lr_range.split(',')[1])
            lr_group = list(np.random.uniform(lr_l, lr_r)
                            for i in range(nums_lr_group))
            optimizer = torch.optim.SGD(
                fast_dif_lr_parameters(model, lr_group),
                momentum=args.momentum, weight_decay=args.weight_decay)
        elif args['block'] == 3:
            nums_lr_group = 19  # 9 * 2 + 1
            lr_l, lr_r = float(arg.lr_range.split(',')[0]), float(arg.lr_range.split(',')[1])
            lr_group = list(np.random.uniform(lr_l, lr_r)
                            for i in range(nums_lr_group))
            optimizer = torch.optim.SGD(
                fast_19_lr_parameters(model, lr_group),
                momentum=args.momentum, weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.learning_rate,  # without hpo / glboal hpo
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    # loss func, ls=0.1
    # NOTE(review): num_classes is hard-coded to 10 here even when
    # args.num_classes == 100 — confirm against the cifar100 path.
    criterion_smooth = CrossEntropyLabelSmooth(10, args['label_smooth'])
    # lr_scheduler is related to total_iters: linear decay to 0
    scheduler = torch.optim.lr_scheduler.LambdaLR \
        (optimizer,
         lambda step: (1.0 - step / args.total_iters) if step <= args.total_iters else 0,
         last_epoch=-1)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    #     optimizer, float(args.total_iters / args.val_interval), eta_min=1e-8, last_epoch=-1)
    if use_gpu:
        model = nn.DataParallel(model)
        loss_function = criterion_smooth.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion_smooth
        device = torch.device("cpu")
    model = model.to(device)
    all_iters = 0
    if args.auto_continue:
        # load model and fast-forward the LR schedule to the resumed iteration
        lastest_model, iters = get_lastest_model(args.path)
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            print('load from checkpoint')
            for i in range(iters):
                scheduler.step()  # lr Align
    # stash shared training state on args for train()/validate()
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_dataprovider = train_dataprovider
    args.val_dataprovider = val_dataprovider
    if args.eval:
        # evaluation-only mode: load the requested checkpoint, validate, exit
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            # model.load_state_dict(checkpoint, strict=True)
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            validate(model, device, args, all_iters=all_iters)
        exit(0)
    # according to total_iters
    while all_iters < args.total_iters:
        all_iters, Top1_acc = \
            train(model, device, args, bn_process=True,
                  all_iters=all_iters, reporter=reporter)
def prepare():
    """Build data, model, optimizer and scheduler from CLI args, then run the
    full train/validate epoch loop (NNI-style entry point).

    Everything is wired onto the ``args`` namespace so the ``train_nni`` /
    ``test_nni`` helpers can pull what they need from it.
    """
    args = get_args()
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
    if args.cifar100:
        # Project-local CIFAR-100 pipeline; also yields the per-epoch step counts
        # used by the LambdaLR schedule and the epoch loop below.
        train_dataprovider, val_dataprovider, train_step, valid_step = dataset_cifar.get_dataset(
            "cifar100", batch_size=args.batch_size, RandA=args.randAugment)
        print('load data successfully')
    else:
        # ImageNet-style folder pipeline (BGR tensors, project-local transforms).
        # NOTE(review): this branch never defines train_step/valid_step, which the
        # epoch loop at the bottom uses — looks like it would raise NameError; confirm.
        assert os.path.exists(args.train_dir)
        from dataset import DataIterator, SubsetSampler, OpencvResize, ToBGRTensor
        train_dataset = datasets.ImageFolder(
            args.train_dir,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
                transforms.RandomHorizontalFlip(0.5),
                ToBGRTensor(),
            ]))
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=1,
                                                   pin_memory=use_gpu)
        train_dataprovider = DataIterator(train_loader)
        assert os.path.exists(args.val_dir)
        val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
            args.val_dir,
            transforms.Compose([
                OpencvResize(256),
                transforms.CenterCrop(224),
                ToBGRTensor(),
            ])),
            batch_size=200,
            shuffle=False,
            num_workers=1,
            pin_memory=use_gpu)
        val_dataprovider = DataIterator(val_loader)
        print('load data successfully')
    # Imagenet
    # from network import ShuffleNetV2_OneShot
    # model = ShuffleNetV2_OneShot(n_class=1000)
    # Special for cifar
    from network_origin import cifar_fast
    model = cifar_fast(input_size=32, n_class=100)
    # Optimizer
    optimizer = get_optim(args, model)
    # Label Smooth
    if args.criterion_smooth:
        criterion = CrossEntropyLabelSmooth(100, 0.1)
    else:
        criterion = nn.CrossEntropyLoss()
    # Stepped per-iteration: decays linearly to zero over epochs * steps-per-epoch.
    # NOTE(review): if args.lr_scheduler is neither 'Lambda' nor 'Cosine',
    # `scheduler` is never bound and the assignment below raises — confirm intent.
    if args.lr_scheduler == 'Lambda':
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lambda step: (1.0 - step / (args.epochs * train_step))
            if step <= (args.epochs * train_step) else 0,
            last_epoch=-1)
    elif args.lr_scheduler == 'Cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs, eta_min=1e-8, last_epoch=-1)
    if use_gpu:
        model = nn.DataParallel(model)
        cudnn.benchmark = True
        loss_function = criterion.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion
        device = torch.device("cpu")
    model = model.to(device)
    # Hang everything off args so train_nni/test_nni can reach it.
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_dataprovider = train_dataprovider
    args.val_dataprovider = val_dataprovider
    args.best_acc = 0.0
    args.all_iters = 1
    start_epoch = 1
    for epoch in range(start_epoch, start_epoch + args.epochs):
        loss_output, train_acc = train_nni(args, model, device, epoch, train_step)
        acc, best_acc = test_nni(args, model, device, epoch, valid_step)
        print(
            'Epoch {}, loss/train acc = {:.2f}/{:.2f}, val acc/best acc = {:.2f}/{:.2f},'
            .format(epoch, loss_output, train_acc, acc, best_acc))
def main():
    """Multi-node distributed (NCCL/DDP) ImageNet training entry point.

    Computes the process's global rank from (gpus-per-node, node index,
    local rank), builds the sampler/loader pair, initializes the process
    group, wraps the model in DistributedDataParallel and runs the
    iteration-driven train/validate loop.
    """
    args = get_args()
    # Global topology: world size across all nodes, and this process's
    # global rank (node offset + local GPU index).
    args.world_size = args.gpus * args.nodes
    args.rank = args.gpus * args.nr + args.local_rank
    print("RANK: " + str(args.rank) + ", LOCAL RANK: " + str(args.local_rank))
    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('/home/admin/aihub/SinglePathOneShot/log'):
        os.mkdir('/home/admin/aihub/SinglePathOneShot/log')
    fh = logging.FileHandler(
        os.path.join(
            '/home/admin/aihub/SinglePathOneShot/log/train-{}{:02}{}'.format(
                local_time.tm_year % 2000, local_time.tm_mon, t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
    assert os.path.exists(args.train_dir)
    train_dataset = datasets.ImageFolder(
        args.train_dir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
            transforms.RandomHorizontalFlip(0.5),
            ToBGRTensor(),
        ]))
    # Shard the training set by *global* rank so every process sees a
    # disjoint subset; shuffle must stay False because the sampler shuffles.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=args.rank)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=32,
                                               pin_memory=True,
                                               sampler=train_sampler)
    train_dataprovider = DataIterator(train_loader)
    assert os.path.exists(args.val_dir)
    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        args.val_dir,
        transforms.Compose([
            OpencvResize(256),
            transforms.CenterCrop(224),
            ToBGRTensor(),
        ])),
        batch_size=200,
        shuffle=False,
        num_workers=32,
        pin_memory=use_gpu)
    val_dataprovider = DataIterator(val_loader)
    print('load data successfully')
    # FIX: the global rank (args.rank) must be passed here, not the local
    # rank — with nodes > 1 every node would otherwise claim ranks
    # 0..gpus-1 and the group could never assemble.  args.rank is computed
    # above and is already what the DistributedSampler uses.
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.world_size,
                            rank=args.rank)
    # dist.init_process_group(backend='nccl', init_method='tcp://'+args.ip+':'+str(args.port), world_size=args.world_size, rank=args.rank)
    # dist.init_process_group(backend='nccl', init_method="file:///mnt/nas1/share_file", world_size=args.world_size, rank=args.rank)
    torch.cuda.set_device(args.local_rank)
    channels_scales = (1.0, ) * 20
    model = ShuffleNetV2_OneShot(architecture=list(args.arch),
                                 channels_scales=channels_scales)
    device = torch.device(args.local_rank)
    model = model.cuda(args.local_rank)
    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)
    # Linear decay to zero over total_iters (scheduler stepped per iteration).
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda step: (1.0 - step / args.total_iters)
        if step <= args.total_iters else 0,
        last_epoch=-1)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank],
        find_unused_parameters=False)  #,output_device=args.local_rank) # ,
    loss_function = criterion_smooth.cuda()
    all_iters = 0
    # Hang everything off args so train/validate helpers can reach it.
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_dataprovider = train_dataprovider
    args.val_dataprovider = val_dataprovider
    if args.eval:
        # Evaluation-only mode: optionally load a checkpoint, validate, exit.
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model, device, args, all_iters=all_iters)
        exit(0)
    validate(model, device, args, all_iters=all_iters)
    # Iteration-driven loop: train() advances all_iters by val_interval.
    while all_iters < args.total_iters:
        all_iters = train(model, device, args,
                          val_interval=args.val_interval,
                          bn_process=False,
                          all_iters=all_iters)
        validate(model, device, args, all_iters=all_iters)
    # Final pass with BN recalibration over one epoch's worth of samples.
    all_iters = train(model, device, args,
                      val_interval=int(1280000 / args.val_batch_size),
                      bn_process=True,
                      all_iters=all_iters)
    validate(model, device, args, all_iters=all_iters)
def main():
    """Single-machine CIFAR-100 supernet training entry point.

    Builds loaders, a mutable ResNet-20 supernet, SGD + linear-decay LR,
    optionally resumes from the latest checkpoint, then runs the
    iteration-driven training loop drawing architectures from ArchLoader.
    """
    args = get_args()
    # archLoader
    arch_loader = ArchLoader(args.path)
    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}{:02}{}'.format(local_time.tm_year % 2000,
                                                  local_time.tm_mon, t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
    train_dataset, val_dataset = get_dataset('cifar100')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=16,
                                               pin_memory=True)
    # train_dataprovider = DataIterator(train_loader)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=200,
                                             shuffle=False,
                                             num_workers=12,
                                             pin_memory=True)
    # val_dataprovider = DataIterator(val_loader)
    print('load data successfully')
    model = mutableResNet20()
    print('load model successfully')
    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # NOTE(review): smoothing is configured for 1000 classes but the data is
    # CIFAR-100 — presumably this skews the smoothing mass; confirm intent.
    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)
    if use_gpu:
        model = nn.DataParallel(model)
        loss_function = criterion_smooth.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion_smooth
        device = torch.device("cpu")
    # Linear decay to zero over total_iters (stepped per iteration).
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda step: (1.0 - step / args.total_iters)
        if step <= args.total_iters else 0,
        last_epoch=-1)
    model = model.to(device)
    # dp_model = torch.nn.parallel.DistributedDataParallel(model)
    all_iters = 0
    if args.auto_continue:  # resume automatically?
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            print('load from checkpoint')
            # Fast-forward the scheduler to match the resumed iteration count.
            for i in range(iters):
                scheduler.step()
    # Parameter wiring: hang everything off args for the train/validate helpers.
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_loader = train_loader
    args.val_loader = val_loader
    # args.train_dataprovider = train_dataprovider
    # args.val_dataprovider = val_dataprovider
    if args.eval:
        # Evaluation-only mode: optionally load a checkpoint, validate, exit.
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model, device, args,
                     all_iters=all_iters, arch_loader=arch_loader)
        exit(0)
    # Iteration-driven training; train() advances all_iters by val_interval.
    while all_iters < args.total_iters:
        all_iters = train(model, device, args,
                          val_interval=args.val_interval,
                          bn_process=False,
                          all_iters=all_iters,
                          arch_loader=arch_loader,
                          arch_batch=args.arch_batch)
def main():
    """MNIST supernet entry point: optional warmup of the full supernet,
    then subnet training against a frozen deep-copied base model, with
    periodic validation over architectures supplied by ArchLoader.
    """
    args = get_args()
    # archLoader
    arch_loader = ArchLoader(args.path)
    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m-%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}-{:02}-{:02}-{:.3f}'.format(
            local_time.tm_year % 2000, local_time.tm_mon, local_time.tm_mday,
            t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
    kwargs = {'num_workers': 4, 'pin_memory': True}
    # MNIST resized to 32x32 so the ResNet-20-style stem fits.
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        root="./data",
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.Resize(32),
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])), batch_size=args.batch_size, shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(datasets.MNIST(
        root="./data",
        train=False,
        transform=transforms.Compose([
            transforms.Resize(32),
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])), batch_size=args.batch_size, shuffle=False, **kwargs)
    model = mutableResNet20(num_classes=10)
    # Keep an untouched copy; train_subnet() receives it alongside the live model.
    base_model = copy.deepcopy(model)
    logging.info('load model successfully')
    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # NOTE(review): smoothing configured for 1000 classes but the model has
    # num_classes=10 — presumably skews the smoothing mass; confirm intent.
    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)
    if use_gpu:
        model = nn.DataParallel(model)
        loss_function = criterion_smooth.cuda()
        device = torch.device("cuda")
        base_model.cuda()
    else:
        loss_function = criterion_smooth
        device = torch.device("cpu")
    # scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
    #     lambda step: (1.0-step/args.total_iters) if step <= args.total_iters else 0, last_epoch=-1)
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    #     optimizer, T_max=200)
    model = model.to(device)
    all_iters = 0
    if args.auto_continue:
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info('load from checkpoint')
            # Fast-forward the scheduler to the resumed iteration count.
            for i in range(iters):
                scheduler.step()
    # Parameter wiring: hang everything off args for the helpers below.
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_loader = train_loader
    args.val_loader = val_loader
    if args.eval:
        # Evaluation-only mode: optionally load a checkpoint, validate, exit.
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model, device, args,
                     all_iters=all_iters, arch_loader=arch_loader)
        exit(0)
    # warmup weights
    if args.warmup is not None:
        logging.info("begin warmup weights")
        while all_iters < args.warmup:
            all_iters = train_supernet(model, device, args,
                                       bn_process=False,
                                       all_iters=all_iters)
        validate(model, device, args,
                 all_iters=all_iters, arch_loader=arch_loader)
    while all_iters < args.total_iters:
        all_iters = train_subnet(model, base_model, device, args,
                                 bn_process=False,
                                 all_iters=all_iters,
                                 arch_loader=arch_loader)
        logging.info("validate iter {}".format(all_iters))
        # Validate only every 9th iteration count to limit overhead.
        if all_iters % 9 == 0:
            validate(model, device, args,
                     all_iters=all_iters, arch_loader=arch_loader)
    validate(model, device, args, all_iters=all_iters, arch_loader=arch_loader)
def main(): args = get_args() # archLoader arch_loader = ArchLoader(args.path) # Log log_format = '[%(asctime)s] %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%d %I:%M:%S') t = time.time() local_time = time.localtime(t) if not os.path.exists('./log'): os.mkdir('./log') fh = logging.FileHandler( os.path.join('log/train-{}{:02}{}'.format(local_time.tm_year % 2000, local_time.tm_mon, t))) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) use_gpu = False if torch.cuda.is_available(): use_gpu = True val_loader = torch.utils.data.DataLoader(datasets.MNIST( root="./data", train=False, transform=transforms.Compose([ transforms.Resize(32), transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ])), batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True) print('load data successfully') model = mutableResNet20(10) criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1) if use_gpu: model = nn.DataParallel(model) loss_function = criterion_smooth.cuda() device = torch.device("cuda") else: loss_function = criterion_smooth device = torch.device("cpu") model = model.to(device) print("load model successfully") all_iters = 0 print('load from latest checkpoint') lastest_model, iters = get_lastest_model() if lastest_model is not None: all_iters = iters checkpoint = torch.load(lastest_model, map_location=None if use_gpu else 'cpu') model.load_state_dict(checkpoint['state_dict'], strict=True) # 参数设置 args.loss_function = loss_function args.val_dataloader = val_loader print("start to validate model") validate(model, device, args, all_iters=all_iters, arch_loader=arch_loader)
def main():
    """Train a single fixed architecture (loaded from ``arch.pkl``) of the
    ShuffleNetV2 one-shot search space from scratch, then validate and
    checkpoint it with BN statistics recalibrated.
    """
    args = get_args()
    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(os.path.join('log/train-{}{:02}{}'.format(
        local_time.tm_year % 2000, local_time.tm_mon, t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
    assert os.path.exists(args.train_dir)
    train_dataset = datasets.ImageFolder(
        args.train_dir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.im_size),
            transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
            transforms.RandomHorizontalFlip(0.5),
            ToBGRTensor(),
        ])
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=8, pin_memory=use_gpu)
    train_dataprovider = DataIterator(train_loader)
    assert os.path.exists(args.val_dir)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(args.val_dir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(args.im_size),
            ToBGRTensor(),
        ])),
        batch_size=200, shuffle=False, num_workers=8, pin_memory=use_gpu
    )
    val_dataprovider = DataIterator(val_loader)
    print('load data successfully')
    # The architecture to train must have been exported beforehand.
    arch_path = 'arch.pkl'
    if os.path.exists(arch_path):
        with open(arch_path, 'rb') as f:
            architecture = pickle.load(f)
    else:
        raise NotImplementedError
    channels_scales = (1.0,) * 20
    model = ShuffleNetV2_OneShot(architecture=architecture,
                                 channels_scales=channels_scales,
                                 n_class=args.num_classes,
                                 input_size=args.im_size)
    print('flops:', get_flops(model))
    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    criterion_smooth = CrossEntropyLabelSmooth(args.num_classes, 0.1)
    if use_gpu:
        # model = nn.DataParallel(model)
        loss_function = criterion_smooth.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion_smooth
        device = torch.device("cpu")
    # Linear decay to zero over total_iters (stepped per iteration).
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda step: (1.0 - step / args.total_iters)
        if step <= args.total_iters else 0,
        last_epoch=-1)
    # model = model.to(device)
    # NOTE(review): .cuda() is unconditional even though a CPU `device` path is
    # prepared above — presumably this script is GPU-only; confirm.
    model = model.cuda()
    all_iters = 0
    if args.auto_continue:
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            print('load from checkpoint')
            # Fast-forward the scheduler to the resumed iteration count.
            for i in range(iters):
                scheduler.step()
    # Hang everything off args so the train/validate helpers can reach it.
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_dataprovider = train_dataprovider
    args.val_dataprovider = val_dataprovider
    if args.eval:
        # Evaluation-only mode: optionally load a checkpoint, validate, exit.
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model, device, args, all_iters=all_iters)
        exit(0)
    t = time.time()
    while all_iters < args.total_iters:
        all_iters = train(model, device, args,
                          val_interval=args.val_interval,
                          bn_process=False,
                          all_iters=all_iters)
        validate(model, device, args, all_iters=all_iters)
    # all_iters = train(model, device, args, val_interval=int(1280000/args.batch_size), bn_process=True, all_iters=all_iters)
    validate(model, device, args, all_iters=all_iters)
    save_checkpoint({'state_dict': model.state_dict(),},
                    args.total_iters, tag='bnps-')
    print("Finished {} iters in {:.3f} seconds".format(all_iters,
                                                       time.time() - t))
def main():
    """Train-from-scratch evaluation of one 10-architecture slice of a
    400-candidate pool (``cl400.p``).

    The pool is split into 40 slices of 10; ``args.split_num`` picks which
    slice this process trains.  Each architecture is trained to
    ``args.total_iters``, BN-recalibrated, validated, and its top-1/top-5
    accuracies accumulated into ``cands`` and pickled after every arch.
    """
    args = get_args()
    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}{:02}{}'.format(local_time.tm_year % 2000,
                                                  local_time.tm_mon, t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
    assert os.path.exists(args.train_dir)
    train_dataset = datasets.ImageFolder(
        args.train_dir,
        transforms.Compose([
            transforms.RandomResizedCrop(96),
            transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
            transforms.RandomHorizontalFlip(0.5),
            ToBGRTensor(),
        ]))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=4,
                                               pin_memory=use_gpu)
    train_dataprovider = DataIterator(train_loader)
    assert os.path.exists(args.val_dir)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            args.val_dir,
            transforms.Compose([
                OpencvResize(96),
                # transforms.CenterCrop(96),
                ToBGRTensor(),
            ])),
        batch_size=200,
        shuffle=False,
        num_workers=4,
        pin_memory=use_gpu)
    val_dataprovider = DataIterator(val_loader)
    # Candidate pool of 400 architectures, exported beforehand.
    arch_path = 'cl400.p'
    if os.path.exists(arch_path):
        with open(arch_path, 'rb') as f:
            architectures = pickle.load(f)
    else:
        raise NotImplementedError
    channels_scales = (1.0, ) * 20
    cands = {}
    # 40 contiguous slices of 10 architectures each.
    splits = [(i, 10 + i) for i in range(0, 400, 10)]
    architectures = np.array(architectures)
    architectures = architectures[
        splits[args.split_num][0]:splits[args.split_num][1]]
    print(len(architectures))
    logging.info("Training and Validating arch: " + str(splits[args.split_num]))
    for architecture in architectures:
        # Tuple form so the arch can be used as a dict key in `cands`.
        architecture = tuple(architecture.tolist())
        model = ShuffleNetV2_OneShot(architecture=architecture,
                                     channels_scales=channels_scales,
                                     n_class=10,
                                     input_size=96)
        print('flops:', get_flops(model))
        optimizer = torch.optim.SGD(get_parameters(model),
                                    lr=args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        # NOTE(review): smoothing configured for 1000 classes but n_class=10 —
        # presumably skews the smoothing mass; confirm intent.
        criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)
        if use_gpu:
            model = nn.DataParallel(model)
            loss_function = criterion_smooth.cuda()
            device = torch.device("cuda")
        else:
            loss_function = criterion_smooth
            device = torch.device("cpu")
        # Linear decay to zero over total_iters (stepped per iteration).
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lambda step: (1.0 - step / args.total_iters)
            if step <= args.total_iters else 0,
            last_epoch=-1)
        model = model.to(device)
        all_iters = 0
        if args.auto_continue:
            lastest_model, iters = get_lastest_model()
            if lastest_model is not None:
                all_iters = iters
                checkpoint = torch.load(
                    lastest_model, map_location=None if use_gpu else 'cpu')
                model.load_state_dict(checkpoint['state_dict'], strict=True)
                print('load from checkpoint')
                # Fast-forward the scheduler to the resumed iteration count.
                for i in range(iters):
                    scheduler.step()
        # Hang everything off args so the train/validate helpers can reach it.
        args.optimizer = optimizer
        args.loss_function = loss_function
        args.scheduler = scheduler
        args.train_dataprovider = train_dataprovider
        args.val_dataprovider = val_dataprovider
        # print("BEGIN VALDATE: ", args.eval, args.eval_resume)
        if args.eval:
            # Evaluation-only mode: optionally load a checkpoint, validate, exit.
            if args.eval_resume is not None:
                checkpoint = torch.load(
                    args.eval_resume, map_location=None if use_gpu else 'cpu')
                model.load_state_dict(checkpoint, strict=True)
                validate(model, device, args, all_iters=all_iters)
            exit(0)
        # t1,t5 = validate(model, device, args, all_iters=all_iters)
        # print("VALDATE: ", t1, " ", t5)
        while all_iters < args.total_iters:
            all_iters = train(model, device, args,
                              val_interval=args.val_interval,
                              bn_process=False,
                              all_iters=all_iters)
            validate(model, device, args, all_iters=all_iters)
        # Final pass recalibrating BN statistics before the scored validation.
        all_iters = train(model, device, args,
                          val_interval=int(1280000 / args.batch_size),
                          bn_process=True,
                          all_iters=all_iters)
        top1, top5 = validate(model, device, args, all_iters=all_iters)
        save_checkpoint({
            'state_dict': model.state_dict(),
        }, args.total_iters, tag='bnps-')
        cands[architecture] = [top1, top5]
        # Persist partial results after every arch so a crash loses at most one.
        # NOTE(review): the file handle from open() is never closed explicitly —
        # consider a `with` block.
        pickle.dump(
            cands,
            open("from_scratch_split_{}.pkl".format(args.split_num), 'wb'))
logger.warning( "You might want to use SPOS preprocessing if you are loading their checkpoints." ) model.load_state_dict(load_and_parse_state_dict()) model.cuda() if torch.cuda.device_count( ) > 1: # exclude last gpu, saving for data preprocessing on gpu model = nn.DataParallel(model, device_ids=list( range(0, torch.cuda.device_count() - 1))) mutator = SPOSSupernetTrainingMutator(model, flops_func=flops_func, flops_lb=290E6, flops_ub=360E6) criterion = CrossEntropyLabelSmooth(1000, args.label_smoothing) optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lambda step: (1.0 - step / args.epochs) if step <= args.epochs else 0, last_epoch=-1) train_loader = get_imagenet_iter_dali( "train", args.imagenet_dir, args.batch_size, args.workers, spos_preprocessing=args.spos_preprocessing) valid_loader = get_imagenet_iter_dali(
def main():
    """DistributedDataParallel (env:// NCCL) supernet/subnet training entry
    point for the mutable ResNet-20: optional supernet warmup, then subnet
    training with periodic validation on rank 0 only.
    """
    args = get_args()
    num_gpus = torch.cuda.device_count()
    # Map this process onto a GPU on its node.
    args.gpu = args.local_rank % num_gpus
    torch.cuda.set_device(args.gpu)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    args.world_size = torch.distributed.get_world_size()
    # Per-process batch size: the CLI value is the global batch.
    args.batch_size = args.batch_size // args.world_size
    # archLoader
    arch_loader = ArchLoader(args.path)
    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m-%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}-{:02}-{:02}-{:.3f}'.format(
            local_time.tm_year % 2000, local_time.tm_mon, local_time.tm_mday,
            t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
    train_loader = get_train_loader(args.batch_size, args.local_rank,
                                    args.num_workers, args.total_iters)
    val_loader = get_val_loader(args.batch_size, args.num_workers)
    model = mutableResNet20()
    logging.info('load model successfully')
    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # NOTE(review): smoothing configured for 1000 classes — confirm against the
    # actual class count of mutableResNet20's dataset.
    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)
    if use_gpu:
        # model = nn.DataParallel(model)
        model = model.cuda(args.gpu)
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
        loss_function = criterion_smooth.cuda()
    else:
        loss_function = criterion_smooth
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)
    all_iters = 0
    if args.auto_continue:  # resume automatically?
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info('load from checkpoint')
            # Fast-forward the scheduler to the resumed iteration count.
            for i in range(iters):
                scheduler.step()
    # Parameter wiring: hang everything off args for the helpers below.
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_loader = train_loader
    args.val_loader = val_loader
    if args.eval:
        # Evaluation-only mode: optionally load a checkpoint, validate, exit.
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model, args, all_iters=all_iters, arch_loader=arch_loader)
        exit(0)
    # warmup weights
    if args.warmup > 0:
        logging.info("begin warmup weights")
        while all_iters < args.warmup:
            all_iters = train_supernet(model, args,
                                       bn_process=False,
                                       all_iters=all_iters)
        validate(model, args, all_iters=all_iters, arch_loader=arch_loader)
    while all_iters < args.total_iters:
        logging.info("=" * 50)
        all_iters = train_subnet(model, args,
                                 bn_process=False,
                                 all_iters=all_iters,
                                 arch_loader=arch_loader)
        # Periodic validation, rank 0 only, to avoid duplicate work.
        if all_iters % 200 == 0 and args.local_rank == 0:
            logging.info("validate iter {}".format(all_iters))
            validate(model, args, all_iters=all_iters, arch_loader=arch_loader)
def main():
    """Supernet training entry point that selects between an ImageNet-style
    folder dataset and CIFAR-10 based on ``args.cifar10``, then runs the
    iteration-driven training loop on a ShuffleNetV2 one-shot supernet.
    """
    args = get_args()
    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}{:02}{}'.format(local_time.tm_year % 2000,
                                                  local_time.tm_mon, t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True
    # FIX: idiomatic truthiness test instead of `== False` (PEP 8 forbids
    # equality comparison against booleans/singletons).
    if not args.cifar10:
        assert os.path.exists(args.train_dir)
        train_dataset = datasets.ImageFolder(
            args.train_dir,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.ColorJitter(brightness=0.4, contrast=0.4,
                                       saturation=0.4),
                transforms.RandomHorizontalFlip(0.5),
                ToBGRTensor(),
            ]))
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=1,
                                                   pin_memory=use_gpu)
        train_dataprovider = DataIterator(train_loader)
        assert os.path.exists(args.val_dir)
        val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
            args.val_dir,
            transforms.Compose([
                OpencvResize(256),
                transforms.CenterCrop(224),
                ToBGRTensor(),
            ])),
            batch_size=200,
            shuffle=False,
            num_workers=1,
            pin_memory=use_gpu)
        val_dataprovider = DataIterator(val_loader)
        print('load imagenet data successfully')
    else:
        train_transform, valid_transform = data_transforms(args)
        trainset = torchvision.datasets.CIFAR10(root=os.path.join(
            args.data_dir, 'cifar'),
            train=True,
            download=True,
            transform=train_transform)
        train_loader = torch.utils.data.DataLoader(trainset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   pin_memory=True,
                                                   num_workers=8)
        train_dataprovider = DataIterator(train_loader)
        valset = torchvision.datasets.CIFAR10(root=os.path.join(
            args.data_dir, 'cifar'),
            train=False,
            download=True,
            transform=valid_transform)
        val_loader = torch.utils.data.DataLoader(valset,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 pin_memory=True,
                                                 num_workers=8)
        val_dataprovider = DataIterator(val_loader)
        print('load cifar10 data successfully')
    model = ShuffleNetV2_OneShot()
    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # NOTE(review): smoothing configured for 1000 classes even on the CIFAR-10
    # path — confirm against ShuffleNetV2_OneShot's default head size.
    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)
    if use_gpu:
        model = nn.DataParallel(model)
        loss_function = criterion_smooth.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion_smooth
        device = torch.device("cpu")
    # Linear decay to zero over total_iters (stepped per iteration).
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda step: (1.0 - step / args.total_iters)
        if step <= args.total_iters else 0,
        last_epoch=-1)
    model = model.to(device)
    all_iters = 0
    if args.auto_continue:
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            print('load from checkpoint')
            # Fast-forward the scheduler to the resumed iteration count.
            for i in range(iters):
                scheduler.step()
    # Hang everything off args so the train/validate helpers can reach it.
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_dataprovider = train_dataprovider
    args.val_dataprovider = val_dataprovider
    if args.eval:
        # Evaluation-only mode: optionally load a checkpoint, validate, exit.
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model, device, args, all_iters=all_iters)
        exit(0)
    while all_iters < args.total_iters:
        all_iters = train(model, device, args,
                          val_interval=args.val_interval,
                          bn_process=False,
                          all_iters=all_iters)
def train(model, optimizer, scheduler, train_loader, dev, epoch):
    """Run one training epoch on a point-cloud classifier.

    Splits each sample into xyz (first 3 channels) and rgb (rest), applies a
    linear warm-up factor to the loss during the first ``opt.warm_epoch``
    epochs, tracks running loss/accuracy on a tqdm bar, appends the epoch
    averages to the module-level ``y_loss``/``y_err`` histories, and steps
    the scheduler once at epoch end.

    Relies on module-level globals: ``opt`` (config), ``dataset_sizes``,
    ``y_loss``, ``y_err``.
    """
    model.train()
    total_loss = 0
    num_batches = 0
    total_correct = 0
    count = 0
    criterion = nn.CrossEntropyLoss()
    if opt.labelsmooth:
        criterion = CrossEntropyLabelSmooth()
    # Starting warm-up factor for this epoch; refined per-iteration below.
    warm_up = min(1.0, 0.1 + 0.9 * epoch / opt.warm_epoch)
    warm_iteration = round(dataset_sizes['train'] /
                           opt.batch_size) * opt.warm_epoch  # first 5 epoch
    # NOTE(review): total_iteration is computed but never used below.
    total_iteration = round(
        dataset_sizes['train'] / opt.batch_size) * opt.num_epochs
    with tqdm.tqdm(train_loader, ascii=True) as tq:
        for data, label in tq:
            num_examples = label.shape[0]
            data, label = data.to(dev), label.to(dev).squeeze().long()
            optimizer.zero_grad()
            # Channels 0-2 are coordinates, the rest are color features.
            xyz = data[:, :, 0:3].contiguous()
            rgb = data[:, :, 3:].contiguous()
            logits = model(xyz.detach(), rgb.detach(), istrain=True)
            #loss = compute_loss(logits, label)
            if opt.npart > 1:
                # Multi-part head: sum the loss over every part's logits.
                loss = criterion(logits[0], label)
                for i in range(1, opt.npart):
                    loss += criterion(logits[i], label)
            else:
                loss = criterion(logits, label)
            if epoch < opt.warm_epoch:
                # Ramp the factor a little each iteration, capped at 1.0,
                # then scale the loss down during warm-up.
                warm_up = min(1.0, warm_up + 0.9 / warm_iteration)
                loss *= warm_up
            loss.backward()
            optimizer.step()
            if opt.npart > 1:
                # Predictions from the summed (detached) part logits.
                logit_sum = logits[0].detach()
                for i in range(1, opt.npart):
                    logit_sum += logits[i].detach()
                _, preds = logit_sum.max(1)
            else:
                _, preds = logits.max(1)
            num_batches += 1
            count += num_examples
            correct = (preds == label).sum().item()
            total_loss += loss.item()
            total_correct += correct
            tq.set_postfix({
                #'Loss': '%.5f' % loss,
                'AvgLoss': '%.4f' % (total_loss / num_batches),
                #'Acc': '%.5f' % (correct / num_examples),
                'AvgAcc': '%.4f' % (total_correct / count)
            })
    # Record epoch-level averages for external plotting/tracking.
    y_loss['train'].append(total_loss / num_batches)
    y_err['train'].append(1.0 - total_correct / count)
    scheduler.step()