def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    set_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.gpu = 0
    args.world_size = 1
    if args.distributed:
        set_seed(args.local_rank)
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    if args.local_rank == 0:
        logging.info("args = {}".format(args))
        logging.info("unparsed_args = {}".format(unparsed))
        logging.info("distributed = {}".format(args.distributed))
        logging.info("opt_level = {}".format(args.opt_level))
        logging.info("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32))
        logging.info("loss_scale = {}".format(args.loss_scale))
        logging.info("CUDNN VERSION: {}".format(torch.backends.cudnn.version()))

    # create model
    if args.local_rank == 0:
        logging.info('parsing the architecture')
    if args.model_path and os.path.isfile(args.model_path):
        op_weights, depth_weights = get_op_and_depth_weights(args.model_path)
        parsed_arch = parse_architecture(op_weights, depth_weights)
        mc_mask_dddict = torch.load(args.model_path)['mc_mask_dddict']
        mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
        model = Network(args.num_classes, parsed_arch, mc_num_dddict, None,
                        args.dropout_rate, args.drop_connect_rate)
    elif args.config_path and os.path.isfile(args.config_path):
        with open(args.config_path, 'r') as f:
            model_config = json.load(f)
        model = NetworkCfg(args.num_classes, model_config, None,
                           args.dropout_rate, args.drop_connect_rate)
    else:
        raise Exception('invalid --model_path and --config_path')

    if args.sync_bn:
        if args.local_rank == 0:
            logging.info("using apex synced BN")
        model = parallel.convert_syncbn_model(model)
    model = model.cuda().to(memory_format=memory_format) \
        if memory_format is not None else model.cuda()

    config = model.config
    if args.local_rank == 0:
        with open(os.path.join(args.save, 'model.config'), 'w') as f:
            json.dump(config, f, indent=4)
        # logging.info(config)
        logging.info("param size = %fMB", count_parameters_in_MB(model))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(args.num_classes, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Initialize Amp
    if args.opt_level is not None:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.opt_level,
                                          keep_batchnorm_fp32=args.keep_batchnorm_fp32,
                                          loss_scale=args.loss_scale)

    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize.
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication
        # with computation in the backward pass.
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)
    else:
        model = nn.DataParallel(model)

    # define transform and initialize dataloader
    batch_size = args.batch_size // args.world_size
    workers = args.workers // args.world_size
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4,
                               contrast=0.4,
                               saturation=0.4,
                               hue=0.2),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    train_dataset = ImageList(root=args.train_root,
                              list_path=args.train_list,
                              transform=train_transform)
    val_dataset = ImageList(root=args.val_root,
                            list_path=args.val_list,
                            transform=val_transform)
    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               shuffle=(train_sampler is None))
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             num_workers=workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             shuffle=False)

    # define learning rate scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs))
    best_acc_top1 = 0
    best_acc_top5 = 0
    start_epoch = 0

    # restart from snapshot
    if args.snapshot and os.path.isfile(args.snapshot):
        if args.local_rank == 0:
            logging.info('loading snapshot from {}'.format(args.snapshot))
        checkpoint = torch.load(
            args.snapshot,
            map_location=lambda storage, loc: storage.cuda(args.gpu))
        start_epoch = checkpoint['epoch']
        best_acc_top1 = checkpoint['best_acc_top1']
        best_acc_top5 = checkpoint['best_acc_top5']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.opt_level is not None:
            amp.load_state_dict(checkpoint['amp'])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(args.epochs), last_epoch=0)
        # replay the schedule (including the warm-up overwrites) up to start_epoch
        for epoch in range(start_epoch):
            current_lr = scheduler.get_lr()[0]
            if args.local_rank == 0:
                logging.info('Epoch: %d lr %e', epoch, current_lr)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr * (epoch + 1) / 5.0
                if args.local_rank == 0:
                    logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                                 current_lr * (epoch + 1) / 5.0)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr
            scheduler.step()

    # the main loop
    for epoch in range(start_epoch, args.epochs):
        current_lr = scheduler.get_lr()[0]
        if args.local_rank == 0:
            logging.info('Epoch: %d lr %e', epoch, current_lr)
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            if args.local_rank == 0:
                logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                             current_lr * (epoch + 1) / 5.0)
        if args.distributed:
            train_sampler.set_epoch(epoch)

        epoch_start = time.time()
        train_acc, train_obj = train(train_loader, model, criterion_smooth, optimizer)
        if args.local_rank == 0:
            logging.info('Train_acc: %f', train_acc)

        val_acc_top1, val_acc_top5, val_obj = validate(val_loader, model, criterion)
        if args.local_rank == 0:
            logging.info('Val_acc_top1: %f', val_acc_top1)
            logging.info('Val_acc_top5: %f', val_acc_top5)
            logging.info('Epoch time: %ds.', time.time() - epoch_start)

        if args.local_rank == 0:
            is_best = False
            if val_acc_top1 > best_acc_top1:
                best_acc_top1 = val_acc_top1
                best_acc_top5 = val_acc_top5
                is_best = True
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc_top1': best_acc_top1,
                'best_acc_top5': best_acc_top5,
                'optimizer': optimizer.state_dict(),
                'amp': amp.state_dict() if args.opt_level is not None else None,
            }, is_best, args.save)

        # restore the cosine lr before stepping the scheduler after a warm-up epoch
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr
        scheduler.step()
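# ---------------------------------------------------------------------------
# CrossEntropyLabelSmooth is used above as criterion_smooth but defined
# elsewhere in the repo. A minimal sketch consistent with its call signature
# (num_classes, epsilon), following the standard label-smoothing formulation;
# the repo's own implementation is the authority and may differ in detail.
import torch
import torch.nn as nn

class CrossEntropyLabelSmooth(nn.Module):
    def __init__(self, num_classes, epsilon):
        super(CrossEntropyLabelSmooth, self).__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets):
        log_probs = self.logsoftmax(inputs)
        # one-hot targets, smoothed towards the uniform distribution
        targets = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1)
        targets = (1 - self.epsilon) * targets + self.epsilon / self.num_classes
        loss = (-targets * log_probs).mean(0).sum()
        return loss
# ---------------------------------------------------------------------------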
    logging.getLogger().addHandler(fh)
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    cudnn.benchmark = True
    cudnn.enabled = True

    SearchSpace = importlib.import_module('models.search_space_' +
                                          config.net_type).Network
    super_model = SearchSpace(config.optim.init_dim, config.data.dataset, config)
    super_model.eval()
    logging.info("Params = %fMB" % utils.count_parameters_in_MB(super_model))
    if args.device == 'gpu':
        super_model = super_model.cuda()

    latency_list, total_latency = super_model.get_cost_list(
        args.input_size,
        cost_type='latency',
        use_gpu=(args.device == 'gpu'),
        meas_times=args.meas_times)
    logging.info('latency_list:\n' + str(latency_list))
    logging.info('total latency: ' + str(total_latency) + 'ms')

    with open(os.path.join(args.save, args.list_name), 'w') as f:
        f.write(str(latency_list))
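# ---------------------------------------------------------------------------
# get_cost_list() above measures per-block latency inside the search space and
# averages over meas_times runs. A minimal sketch of the kind of timing loop
# such a measurement presumably relies on (measure_latency_ms is a
# hypothetical helper, not the repo's API): warm up first, then synchronize
# the GPU around the timed region so kernel launches are actually counted.
import time
import torch

def measure_latency_ms(module, input_shape, meas_times=1000, use_gpu=True):
    device = 'cuda' if use_gpu else 'cpu'
    module = module.to(device).eval()
    x = torch.randn(input_shape, device=device)
    with torch.no_grad():
        for _ in range(10):  # warm-up iterations, not timed
            module(x)
        if use_gpu:
            torch.cuda.synchronize()
        start = time.time()
        for _ in range(meas_times):
            module(x)
        if use_gpu:
            torch.cuda.synchronize()
    return (time.time() - start) / meas_times * 1000.0  # ms per forward pass
# ---------------------------------------------------------------------------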
    model.eval()
    if hasattr(model, 'net_config'):
        logging.info("Network Structure: \n" +
                     '|\n'.join(map(str, model.net_config)))
    if args.meas_lat:
        latency_cpu = utils.latency_measure(model, (3, 224, 224), 1, 2000, mode='cpu')
        logging.info('latency_cpu (batch 1): %.2fms' % latency_cpu)
        latency_gpu = utils.latency_measure(model, (3, 224, 224), 32, 5000, mode='gpu')
        logging.info('latency_gpu (batch 32): %.2fms' % latency_gpu)

    params = utils.count_parameters_in_MB(model)
    logging.info("Params = %.2fMB" % params)
    mult_adds = comp_multadds(model, input_size=config.data.input_size)
    logging.info("Mult-Adds = %.2fM" % mult_adds)
    model = nn.DataParallel(model)

    # whether to resume from a checkpoint
    if config.optim.if_resume:
        utils.load_model(model, config.optim.resume.load_path)
        start_epoch = config.optim.resume.load_epoch + 1
    else:
        start_epoch = 0
    model = model.cuda()
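# ---------------------------------------------------------------------------
# utils.count_parameters_in_MB is imported from the repo's utilities. A sketch
# in the style of the DARTS codebase (auxiliary-head weights excluded, size
# reported in millions of parameters); the repo's version may differ.
import numpy as np

def count_parameters_in_MB(model):
    return np.sum(
        np.prod(v.size()) for name, v in model.named_parameters()
        if 'auxiliary' not in name) / 1e6
# ---------------------------------------------------------------------------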
    parsed_arch = parse_architecture(op_weights, depth_weights)
    with open(args.lookup_path, 'rb') as f:
        lat_lookup = pickle.load(f)
    mc_mask_dddict = torch.load(args.model_path)['mc_mask_dddict']
    mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
    model = Network(1000, parsed_arch, mc_num_dddict, lat_lookup, 0.0, 0.0)
    model = model.cuda()
    x = torch.randn((1, 3, 224, 224))
    x = x.cuda()

    config = model.config
    with open(args.save_path, 'w') as f:
        json.dump(config, f, indent=4)

    params = count_parameters_in_MB(model)
    print('Params: \t{:.4f}MB'.format(params))
    flops = calculate_FLOPs_in_M(model, (1, 3, 224, 224))
    print('FLOPs: \t{:.4f}M'.format(flops))
    if args.print_lat:
        # latency from the lookup table
        lat_lut = model.get_lookup_latency(x)
        print('Lat_LUT:\t{:.4f}ms'.format(lat_lut))
        lat_gpu = measure_latency_in_ms(model, (32, 3, 224, 224), is_cuda=True)
        print('Lat_GPU bs=32:\t{:.4f}ms'.format(lat_gpu))
        lat_gpu = measure_latency_in_ms(model, (1, 3, 224, 224), is_cuda=True)
        print('Lat_GPU bs=1:\t{:.4f}ms'.format(lat_gpu))
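# ---------------------------------------------------------------------------
# calculate_FLOPs_in_M counts multiply-adds over one forward pass. A minimal
# hook-based sketch of the usual technique (hypothetical helper, shown only to
# illustrate the bookkeeping for Conv2d and Linear layers).
import torch
import torch.nn as nn

def calc_multadds_in_M(model, input_shape):
    counts = []

    def conv_hook(m, inp, out):
        # mult-adds = output elements * per-element kernel volume
        counts.append(out.numel() * (m.in_channels // m.groups) *
                      m.kernel_size[0] * m.kernel_size[1])

    def linear_hook(m, inp, out):
        counts.append(m.in_features * m.out_features)

    handles = []
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            handles.append(m.register_forward_hook(conv_hook))
        elif isinstance(m, nn.Linear):
            handles.append(m.register_forward_hook(linear_hook))
    with torch.no_grad():
        device = next(model.parameters()).device
        model(torch.randn(input_shape).to(device))
    for h in handles:
        h.remove()
    return sum(counts) / 1e6
# ---------------------------------------------------------------------------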
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    set_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True
    logging.info("args = %s", args)
    logging.info("unparsed_args = %s", unparsed)

    # create model
    logging.info('parsing the architecture')
    if args.model_path and os.path.isfile(args.model_path):
        op_weights, depth_weights = get_op_and_depth_weights(args.model_path)
        parsed_arch = parse_architecture(op_weights, depth_weights)
        mc_mask_dddict = torch.load(args.model_path)['mc_mask_dddict']
        mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
        model = Network(args.num_classes, parsed_arch, mc_num_dddict, None,
                        args.dropout_rate, args.drop_connect_rate)
    elif args.config_path and os.path.isfile(args.config_path):
        with open(args.config_path, 'r') as f:
            model_config = json.load(f)
        model = NetworkCfg(args.num_classes, model_config, None,
                           args.dropout_rate, args.drop_connect_rate)
    else:
        raise Exception('invalid --model_path and --config_path')
    model = nn.DataParallel(model).cuda()

    config = model.module.config
    with open(os.path.join(args.save, 'model.config'), 'w') as f:
        json.dump(config, f, indent=4)
    # logging.info(config)
    logging.info("param size = %fMB", count_parameters_in_MB(model))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(args.num_classes, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # define transform and initialize dataloader
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4,
                               contrast=0.4,
                               saturation=0.4,
                               hue=0.2),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    train_queue = torch.utils.data.DataLoader(
        ImageList(root=args.train_root,
                  list_path=args.train_list,
                  transform=train_transform),
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=args.workers)
    val_queue = torch.utils.data.DataLoader(
        ImageList(root=args.val_root,
                  list_path=args.val_list,
                  transform=val_transform),
        batch_size=args.batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=args.workers)

    # define learning rate scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs))
    best_acc_top1 = 0
    best_acc_top5 = 0
    start_epoch = 0

    # restart from snapshot
    if args.snapshot:
        logging.info('loading snapshot from {}'.format(args.snapshot))
        checkpoint = torch.load(args.snapshot)
        start_epoch = checkpoint['epoch']
        best_acc_top1 = checkpoint['best_acc_top1']
        best_acc_top5 = checkpoint['best_acc_top5']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(args.epochs), last_epoch=0)
        # replay the schedule (including the warm-up overwrites) up to start_epoch
        for epoch in range(start_epoch):
            current_lr = scheduler.get_lr()[0]
            logging.info('Epoch: %d lr %e', epoch, current_lr)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr * (epoch + 1) / 5.0
                logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                             current_lr * (epoch + 1) / 5.0)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr
            scheduler.step()

    # the main loop
    for epoch in range(start_epoch, args.epochs):
        current_lr = scheduler.get_lr()[0]
        logging.info('Epoch: %d lr %e', epoch, current_lr)
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                         current_lr * (epoch + 1) / 5.0)

        epoch_start = time.time()
        train_acc, train_obj = train(train_queue, model, criterion_smooth, optimizer)
        logging.info('Train_acc: %f', train_acc)

        val_acc_top1, val_acc_top5, val_obj = validate(val_queue, model, criterion)
        logging.info('Val_acc_top1: %f', val_acc_top1)
        logging.info('Val_acc_top5: %f', val_acc_top5)
        logging.info('Epoch time: %ds.', time.time() - epoch_start)

        is_best = False
        if val_acc_top1 > best_acc_top1:
            best_acc_top1 = val_acc_top1
            best_acc_top5 = val_acc_top5
            is_best = True
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_acc_top1': best_acc_top1,
            'best_acc_top5': best_acc_top5,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.save)

        # restore the cosine lr before stepping the scheduler after a warm-up epoch
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr
        scheduler.step()
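# ---------------------------------------------------------------------------
# save_checkpoint is shared by both training scripts above but defined
# elsewhere. A common-pattern sketch (the filenames are assumptions): always
# write the latest checkpoint, and copy it to a 'best' file when is_best holds.
import os
import shutil
import torch

def save_checkpoint(state, is_best, save_dir):
    filename = os.path.join(save_dir, 'checkpoint.pth.tar')
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(save_dir, 'model_best.pth.tar'))
# ---------------------------------------------------------------------------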
    cudnn.enabled = True
    utils.set_seed(config.data.seed)
    logging.info("args = %s", args)
    logging.info('Training with config:')
    logging.info(pprint.pformat(config))

    config.net_config, net_type = utils.load_net_config(
        os.path.join(args.load_path, 'net_config'))
    derivedNetwork = getattr(model_derived, '%s_Net' % net_type.upper())
    model = derivedNetwork(config.net_config, config=config, num_classes=1000)

    logging.info("Network Structure: \n" + '\n'.join(map(str, model.net_config)))
    logging.info("Params = %.2fMB" % utils.count_parameters_in_MB(model))
    logging.info("Mult-Adds = %.2fM" %
                 comp_multadds(model, input_size=config.data.input_size))

    model = model.cuda()
    model = nn.DataParallel(model)
    # load the weight checkpoint
    checkpoint = torch.load(os.path.join(args.load_path, 'weight.pt'),
                            map_location="cpu")
    model.load_state_dict(checkpoint['state_dict'], strict=False)

    imagenet = imagenet_data.ImageNet12(
        trainFolder=os.path.join(args.data_path, 'train'),
        testFolder=os.path.join(args.data_path, 'val'),
        num_workers=config.data.num_workers,
        data_config=config.data)
    valid_queue = imagenet.getTestLoader(config.data.batch_size)
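# ---------------------------------------------------------------------------
# The valid_queue built above feeds a standard ImageNet evaluation loop. A
# minimal sketch of such a loop with the usual top-k accuracy helper
# (hypothetical names; the repo's own Trainer/infer code is the authority).
import torch

def accuracy(output, target, topk=(1,)):
    maxk = max(topk)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [correct[:k].reshape(-1).float().sum(0) * 100.0 / target.size(0)
            for k in topk]

def evaluate(model, valid_queue):
    model.eval()
    top1, top5, total = 0.0, 0.0, 0
    with torch.no_grad():
        for images, targets in valid_queue:
            images = images.cuda()
            targets = targets.cuda(non_blocking=True)
            logits = model(images)
            acc1, acc5 = accuracy(logits, targets, topk=(1, 5))
            n = images.size(0)
            top1 += acc1.item() * n
            top5 += acc5.item() * n
            total += n
    return top1 / total, top5 / total  # batch-size-weighted averages
# ---------------------------------------------------------------------------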
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True
    logging.info("args = %s", args)

    with open(args.lookup_path, 'rb') as f:
        lat_lookup = pickle.load(f)
    mc_maxnum_dddict = get_mc_num_dddict(mc_mask_dddict, is_max=True)
    model = Network(args.num_classes, mc_maxnum_dddict, lat_lookup)
    model = torch.nn.DataParallel(model).cuda()
    logging.info("param size = %fMB", count_parameters_in_MB(model))

    # save initial model
    model_path = os.path.join(args.save, 'searched_model_00.pth.tar')
    torch.save({
        'state_dict': model.state_dict(),
        'mc_mask_dddict': mc_mask_dddict,
    }, model_path)

    # get lr list
    lr_list = []
    optimizer_w = torch.optim.SGD(model.module.weight_parameters(),
                                  lr=args.w_lr,
                                  momentum=args.w_mom,
                                  weight_decay=args.w_wd)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer_w, float(args.epochs))
    for _ in range(args.epochs):
        lr = scheduler.get_lr()[0]
        lr_list.append(lr)
        scheduler.step()
    del model
    del optimizer_w
    del scheduler

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2),
        transforms.ToTensor(),
        normalize,
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])
    train_queue = torch.utils.data.DataLoader(
        ImageList(root=args.img_root,
                  list_path=args.train_list,
                  transform=train_transform),
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=args.workers)
    val_queue = torch.utils.data.DataLoader(
        ImageList(root=args.img_root,
                  list_path=args.val_list,
                  transform=val_transform),
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=args.workers)

    for epoch in range(args.epochs):
        mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
        model = Network(args.num_classes, mc_num_dddict, lat_lookup)
        model = torch.nn.DataParallel(model).cuda()
        model.module.set_temperature(args.T)

        # load model; exec is used to address submodules by their string paths
        model_path = os.path.join(args.save,
                                  'searched_model_{:02}.pth.tar'.format(epoch))
        state_dict = torch.load(model_path)['state_dict']
        for key in state_dict:
            if 'm_ops' not in key:
                exec('model.{}.data = state_dict[key].data'.format(key))
        for stage in mc_mask_dddict:
            for block in mc_mask_dddict[stage]:
                for op_idx in mc_mask_dddict[stage][block]:
                    # indices of the channels kept by the current mask
                    index = torch.nonzero(
                        mc_mask_dddict[stage][block][op_idx]).view(-1)
                    index = index.cuda()
                    iw = 'model.module.{}.{}.m_ops[{}].inverted_bottleneck.conv.weight.data'.format(stage, block, op_idx)
                    iw_key = 'module.{}.{}.m_ops.{}.inverted_bottleneck.conv.weight'.format(stage, block, op_idx)
                    exec(iw + ' = torch.index_select(state_dict[iw_key], 0, index).data')
                    dw = 'model.module.{}.{}.m_ops[{}].depth_conv.conv.weight.data'.format(stage, block, op_idx)
                    dw_key = 'module.{}.{}.m_ops.{}.depth_conv.conv.weight'.format(stage, block, op_idx)
                    exec(dw + ' = torch.index_select(state_dict[dw_key], 0, index).data')
                    pw = 'model.module.{}.{}.m_ops[{}].point_linear.conv.weight.data'.format(stage, block, op_idx)
                    pw_key = 'module.{}.{}.m_ops.{}.point_linear.conv.weight'.format(stage, block, op_idx)
                    exec(pw + ' = torch.index_select(state_dict[pw_key], 1, index).data')
                    if op_idx >= 4:
                        # ops with index >= 4 carry a squeeze-and-excitation module
                        se_cr_w = 'model.module.{}.{}.m_ops[{}].squeeze_excite.conv_reduce.weight.data'.format(stage, block, op_idx)
                        se_cr_w_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_reduce.weight'.format(stage, block, op_idx)
                        exec(se_cr_w + ' = torch.index_select(state_dict[se_cr_w_key], 1, index).data')
                        se_cr_b = 'model.module.{}.{}.m_ops[{}].squeeze_excite.conv_reduce.bias.data'.format(stage, block, op_idx)
                        se_cr_b_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_reduce.bias'.format(stage, block, op_idx)
                        exec(se_cr_b + ' = state_dict[se_cr_b_key].data')
                        se_ce_w = 'model.module.{}.{}.m_ops[{}].squeeze_excite.conv_expand.weight.data'.format(stage, block, op_idx)
                        se_ce_w_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_expand.weight'.format(stage, block, op_idx)
                        exec(se_ce_w + ' = torch.index_select(state_dict[se_ce_w_key], 0, index).data')
                        se_ce_b = 'model.module.{}.{}.m_ops[{}].squeeze_excite.conv_expand.bias.data'.format(stage, block, op_idx)
                        se_ce_b_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_expand.bias'.format(stage, block, op_idx)
                        exec(se_ce_b + ' = torch.index_select(state_dict[se_ce_b_key], 0, index).data')
                    del index

        lr = lr_list[epoch]
        optimizer_w = torch.optim.SGD(model.module.weight_parameters(),
                                      lr=lr,
                                      momentum=args.w_mom,
                                      weight_decay=args.w_wd)
        optimizer_a = torch.optim.Adam(model.module.arch_parameters(),
                                       lr=args.a_lr,
                                       betas=(args.a_beta1, args.a_beta2),
                                       weight_decay=args.a_wd)

        logging.info('Epoch: %d lr: %e T: %e', epoch, lr, args.T)

        # training
        epoch_start = time.time()
        if epoch < 10:
            train_acc = train_wo_arch(train_queue, model, criterion, optimizer_w)
        else:
            train_acc = train_w_arch(train_queue, val_queue, model, criterion,
                                     optimizer_w, optimizer_a)
            args.T *= args.T_decay

        # logging arch parameters
        logging.info('The current arch parameters are:')
        for param in model.module.log_alphas_parameters():
            param = np.exp(param.detach().cpu().numpy())
            logging.info(' '.join(['{:.6f}'.format(p) for p in param]))
        for param in model.module.betas_parameters():
            param = F.softmax(param.detach().cpu(), dim=-1)
            param = param.numpy()
            logging.info(' '.join(['{:.6f}'.format(p) for p in param]))
        logging.info('Train_acc %f', train_acc)
        epoch_duration = time.time() - epoch_start
        logging.info('Epoch time: %ds', epoch_duration)

        # validation for the last 5 epochs
        if args.epochs - epoch < 5:
            val_acc = validate(val_queue, model, criterion)
            logging.info('Val_acc %f', val_acc)

        # update state_dict
        state_dict_from_model = model.state_dict()
        for key in state_dict:
            if 'm_ops' not in key:
                state_dict[key].data = state_dict_from_model[key].data
        for stage in mc_mask_dddict:
            for block in mc_mask_dddict[stage]:
                for op_idx in mc_mask_dddict[stage][block]:
                    index = torch.nonzero(
                        mc_mask_dddict[stage][block][op_idx]).view(-1)
                    index = index.cuda()
                    iw_key = 'module.{}.{}.m_ops.{}.inverted_bottleneck.conv.weight'.format(stage, block, op_idx)
                    state_dict[iw_key].data[index, :, :, :] = state_dict_from_model[iw_key]
                    dw_key = 'module.{}.{}.m_ops.{}.depth_conv.conv.weight'.format(stage, block, op_idx)
                    state_dict[dw_key].data[index, :, :, :] = state_dict_from_model[dw_key]
                    pw_key = 'module.{}.{}.m_ops.{}.point_linear.conv.weight'.format(stage, block, op_idx)
                    state_dict[pw_key].data[:, index, :, :] = state_dict_from_model[pw_key]
                    if op_idx >= 4:
                        se_cr_w_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_reduce.weight'.format(stage, block, op_idx)
                        state_dict[se_cr_w_key].data[:, index, :, :] = state_dict_from_model[se_cr_w_key]
                        se_cr_b_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_reduce.bias'.format(stage, block, op_idx)
                        state_dict[se_cr_b_key].data[:] = state_dict_from_model[se_cr_b_key]
                        se_ce_w_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_expand.weight'.format(stage, block, op_idx)
                        state_dict[se_ce_w_key].data[index, :, :, :] = state_dict_from_model[se_ce_w_key]
                        se_ce_b_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_expand.bias'.format(stage, block, op_idx)
                        state_dict[se_ce_b_key].data[index] = state_dict_from_model[se_ce_b_key]
        del state_dict_from_model, index

        # shrink and expand
        if epoch >= 10:
            logging.info('Now shrinking or expanding the arch')
            op_weights, depth_weights = get_op_and_depth_weights(model)
            parsed_arch = parse_architecture(op_weights, depth_weights)
            mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
            before_lat = get_lookup_latency(parsed_arch, mc_num_dddict,
                                            lat_lookup_key_dddict, lat_lookup)
            logging.info('Before, the current lat: {:.4f}, the target lat: {:.4f}'.format(
                before_lat, args.target_lat))

            if before_lat > args.target_lat:
                logging.info('Shrinking......')
                stages = ['stage{}'.format(x) for x in range(1, 7)]
                mc_num_dddict, after_lat = fit_mc_num_by_latency(
                    parsed_arch, mc_num_dddict, mc_maxnum_dddict,
                    lat_lookup_key_dddict, lat_lookup, args.target_lat,
                    stages, sign=-1)
                for start in range(2, 7):
                    stages = ['stage{}'.format(x) for x in range(start, 7)]
                    mc_num_dddict, after_lat = fit_mc_num_by_latency(
                        parsed_arch, mc_num_dddict, mc_maxnum_dddict,
                        lat_lookup_key_dddict, lat_lookup, args.target_lat,
                        stages, sign=1)
            elif before_lat < args.target_lat:
                logging.info('Expanding......')
                stages = ['stage{}'.format(x) for x in range(1, 7)]
                mc_num_dddict, after_lat = fit_mc_num_by_latency(
                    parsed_arch, mc_num_dddict, mc_maxnum_dddict,
                    lat_lookup_key_dddict, lat_lookup, args.target_lat,
                    stages, sign=1)
                for start in range(2, 7):
                    stages = ['stage{}'.format(x) for x in range(start, 7)]
                    mc_num_dddict, after_lat = fit_mc_num_by_latency(
                        parsed_arch, mc_num_dddict, mc_maxnum_dddict,
                        lat_lookup_key_dddict, lat_lookup, args.target_lat,
                        stages, sign=1)
            else:
                logging.info('No operation')
                after_lat = before_lat

            # change mc_mask_dddict based on mc_num_dddict
            for stage in parsed_arch:
                for block in parsed_arch[stage]:
                    op_idx = parsed_arch[stage][block]
                    if mc_num_dddict[stage][block][op_idx] != int(
                            sum(mc_mask_dddict[stage][block][op_idx]).item()):
                        mc_num = mc_num_dddict[stage][block][op_idx]
                        max_mc_num = mc_mask_dddict[stage][block][op_idx].size(0)
                        # reset the mask, then re-enable the mc_num channels
                        # with the largest depthwise-weight L1 norms
                        mc_mask_dddict[stage][block][op_idx].data[[True] * max_mc_num] = 0.0
                        key = 'module.{}.{}.m_ops.{}.depth_conv.conv.weight'.format(stage, block, op_idx)
                        weight_copy = state_dict[key].clone().abs().cpu().numpy()
                        weight_l1_norm = np.sum(weight_copy, axis=(1, 2, 3))
                        weight_l1_order = np.argsort(weight_l1_norm)
                        weight_l1_order_rev = weight_l1_order[::-1][:mc_num]
                        mc_mask_dddict[stage][block][op_idx].data[weight_l1_order_rev.tolist()] = 1.0

            logging.info('After, the current lat: {:.4f}, the target lat: {:.4f}'.format(
                after_lat, args.target_lat))

        # save model
        model_path = os.path.join(args.save,
                                  'searched_model_{:02}.pth.tar'.format(epoch + 1))
        torch.save({
            'state_dict': state_dict,
            'mc_mask_dddict': mc_mask_dddict,
        }, model_path)
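# ---------------------------------------------------------------------------
# get_mc_num_dddict converts the triple-nested channel-mask dict into a dict
# of active-channel counts; with is_max=True it returns the full (maximum)
# channel number instead. A sketch inferred from how it is called above; the
# repo's own implementation governs.
def get_mc_num_dddict(mc_mask_dddict, is_max=False):
    mc_num_dddict = {}
    for stage in mc_mask_dddict:
        mc_num_dddict[stage] = {}
        for block in mc_mask_dddict[stage]:
            mc_num_dddict[stage][block] = {}
            for op_idx, mask in mc_mask_dddict[stage][block].items():
                mc_num_dddict[stage][block][op_idx] = (
                    mask.numel() if is_max else int(mask.sum().item()))
    return mc_num_dddict
# ---------------------------------------------------------------------------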
logging.info("Super Network flops (M) list: \n") logging.info(str(flops_list)) logging.info("Total flops: " + str(total_flops)) elif config.optim.sub_obj.type == 'latency': with open( os.path.join('latency_list', config.optim.sub_obj.latency_list_path), 'r') as f: latency_list = eval(f.readline()) super_model.module.sub_obj_list = latency_list logging.info("Super Network latency (ms) list: \n") logging.info(str(latency_list)) else: raise NotImplementedError logging.info("Num Params = %.2fMB", utils.count_parameters_in_MB(super_model)) if config.data.dataset == 'imagenet': imagenet = imagenet_data.ImageNet12( trainFolder=os.path.join(args.data_path, 'train'), testFolder=os.path.join(args.data_path, 'val'), num_workers=config.data.num_workers, type_of_data_augmentation=config.data.type_of_data_aug, data_config=config.data) train_queue, valid_queue = imagenet.getTrainTestLoader( config.data.batch_size, train_shuffle=True, val_shuffle=True) else: raise NotImplementedError search_optim = Optimizer(super_model, criterion, config)
    train_loader = ClusterLoader(cluster_data,
                                 batch_size=150,
                                 shuffle=True,
                                 num_workers=12)
    subgraph_loader = NeighborSampler(data.edge_index,
                                      sizes=[-1],
                                      batch_size=1024,
                                      shuffle=False,
                                      num_workers=12)

    # select the architecture here; eval() evaluates the string expression
    # and returns its value (here, the genotype object).
    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels,
                    args.classes,
                    args.num_cells,
                    genotype,
                    in_channels=args.in_channels)
    model = model.to(DEVICE)
    print("param size = {:.6f}MB".format(utils.count_parameters_in_MB(model)))

    criterion = nn.BCEWithLogitsLoss().to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
    #                             momentum=args.momentum)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))

    main()
def main():
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)

    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels, NUM_CLASSES, args.layers,
                    config.optim.auxiliary, genotype)
    start_epoch = 0

    model.eval()
    model.drop_path_prob = args.drop_path_prob * 0
    # compute the params as well as the mult-adds
    params = count_parameters_in_MB(model)
    logging.info("Params = %.2fMB" % params)
    mult_adds = comp_multadds(model, input_size=config.data.input_size)
    logging.info("Mult-Adds = %.2fM" % mult_adds)
    model.train()

    if len(args.gpus) > 1:
        model = nn.DataParallel(model)
    model = model.cuda()

    if config.optim.label_smooth:
        criterion = CrossEntropyLabelSmooth(NUM_CLASSES, config.optim.smooth_alpha)
    else:
        criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                config.optim.init_lr,
                                momentum=config.optim.momentum,
                                weight_decay=config.optim.weight_decay)

    imagenet = imagenet_data.ImageNet12(
        trainFolder=os.path.join(args.data_path, 'train'),
        testFolder=os.path.join(args.data_path, 'val'),
        num_workers=config.data.num_workers,
        type_of_data_augmentation=config.data.type_of_data_aug,
        data_config=config.data,
        size_images=config.data.input_size[1],
        scaled_size=config.data.scaled_size[1])
    train_queue, valid_queue = imagenet.getTrainTestLoader(config.data.batch_size)

    if config.optim.lr_schedule == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(config.train_params.epochs))

    trainer = Trainer(train_queue, valid_queue, criterion, config, args.report_freq)

    best_epoch = [0, 0, 0]  # [epoch, acc_top1, acc_top5]
    lr = config.optim.init_lr
    for epoch in range(start_epoch, config.train_params.epochs):
        if config.optim.lr_schedule == 'cosine':
            scheduler.step()
            current_lr = scheduler.get_lr()[0]
        elif config.optim.lr_schedule == 'linear':  # linear schedule with warmup
            optimizer, current_lr = adjust_lr(optimizer,
                                              config.train_params.epochs, lr, epoch)
        else:
            print('Wrong lr type, exit')
            sys.exit(1)

        if epoch < 5:  # warm up for the first 5 epochs
            current_lr = lr * (epoch + 1) / 5.0
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr
            logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                         lr * (epoch + 1) / 5.0)
        logging.info('Epoch: %d lr %e', epoch, current_lr)

        if len(args.gpus) > 1:
            model.module.drop_path_prob = args.drop_path_prob * epoch / config.train_params.epochs
        else:
            model.drop_path_prob = args.drop_path_prob * epoch / config.train_params.epochs

        train_acc_top1, train_acc_top5, train_obj, batch_time, data_time = trainer.train(
            model, optimizer, epoch)
        with torch.no_grad():
            val_acc_top1, val_acc_top5, batch_time, data_time = trainer.infer(model, epoch)

        if val_acc_top1 > best_epoch[1]:
            best_epoch = [epoch, val_acc_top1, val_acc_top5]
            if epoch >= 0:  # 120
                utils.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.module.state_dict(),
                        'best_acc_top1': val_acc_top1,
                        'optimizer': optimizer.state_dict(),
                    },
                    save_path=args.save,
                    epoch=epoch,
                    is_best=True)
            if len(args.gpus) > 1:
                utils.save(model.module.state_dict(),
                           os.path.join(args.save,
                                        'weights_{}_{}.pt'.format(epoch, val_acc_top1)))
            else:
                utils.save(model.state_dict(),
                           os.path.join(args.save,
                                        'weights_{}_{}.pt'.format(epoch, val_acc_top1)))
        logging.info('BEST EPOCH %d val_top1 %.2f val_top5 %.2f',
                     best_epoch[0], best_epoch[1], best_epoch[2])
        logging.info('epoch: {} \t train_acc_top1: {:.4f} \t train_loss: {:.4f} \t val_acc_top1: {:.4f}'
                     .format(epoch, train_acc_top1, train_obj, val_acc_top1))

    logging.info("Params = %.2fMB" % params)
    logging.info("Mult-Adds = %.2fM" % mult_adds)
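# ---------------------------------------------------------------------------
# adjust_lr implements the 'linear' schedule branch above; it is defined
# elsewhere, so this is a sketch inferred from the call site (optimizer, total
# epochs, initial lr, current epoch) returning the optimizer and the new lr.
# A plain linear decay is assumed; the repo's version may differ.
def adjust_lr(optimizer, total_epochs, init_lr, epoch):
    lr = init_lr * (1.0 - epoch / float(total_epochs))  # linear decay to 0
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer, lr
# ---------------------------------------------------------------------------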
def main():
    if not torch.cuda.is_available():
        print('no gpu device available')
        sys.exit(1)

    if args.random_seed:
        args.seed = np.random.randint(0, 1000)
    # Fixing the seed makes initialization reproducible across runs; for full
    # reproducibility, every other library that draws random numbers must also
    # use a fixed seed.
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    criterion = torch.nn.BCEWithLogitsLoss().cuda()
    # in_channels is the input feature dimension
    model = Network(args.init_channels,
                    args.classes,
                    args.num_cells,
                    criterion,
                    args.n_steps,
                    in_channels=args.in_channels).cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    num_edges = model._steps * 2
    post_train = 5
    args.epochs = args.warmup_dec_epoch + args.decision_freq * (num_edges - 1) + post_train + 1
    logging.info("total epochs: %d", args.epochs)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    architect = Architect(model, args)

    normal_selected_idxs = torch.tensor(len(model.alphas_normal) * [-1],
                                        requires_grad=False,
                                        dtype=torch.int).cuda()
    normal_candidate_flags = torch.tensor(len(model.alphas_normal) * [True],
                                          requires_grad=False,
                                          dtype=torch.bool).cuda()
    logging.info('normal_selected_idxs: {}'.format(normal_selected_idxs))
    logging.info('normal_candidate_flags: {}'.format(normal_candidate_flags))
    model.normal_selected_idxs = normal_selected_idxs
    model.normal_candidate_flags = normal_candidate_flags

    print(F.softmax(torch.stack(model.alphas_normal, dim=0), dim=-1).detach())

    normal_probs_history = []
    train_losses, valid_losses = utils.AverageMeter(), utils.AverageMeter()
    for epoch in range(args.epochs):
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        # training
        train_acc, train_loss = train(model, architect, criterion, optimizer, lr)
        logging.info('train_loss: %f', train_loss)

        valid_acc, valid_losses = infer(model, criterion, valid_losses)
        logging.info('train_acc %f\tvalid_acc %f', train_acc, valid_acc)

        # make edge decisions
        saved_memory_normal, model.normal_selected_idxs, model.normal_candidate_flags = \
            edge_decision('normal', model.alphas_normal,
                          model.normal_selected_idxs,
                          model.normal_candidate_flags,
                          normal_probs_history, epoch, model, args)

        writer.add_scalar('stats/train_acc', train_acc, epoch)
        writer.add_scalar('stats/valid_acc', valid_acc, epoch)
        utils.save(model, os.path.join(args.save, 'search_weights.pt'))
        scheduler.step()

    logging.info("#" * 30 + " Done " + "#" * 30)
    logging.info('genotype = %s', model.get_genotype())
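# ---------------------------------------------------------------------------
# utils.AverageMeter backs the loss meters above; the DARTS-style utility
# looks like this (sketch; the repo's own version governs). update() adds a
# value with a batch-size weight, and avg tracks the running mean.
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0.0
        self.sum = 0.0
        self.cnt = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt
# ---------------------------------------------------------------------------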