def eval(args):
    """Evaluate a saved static-graph classifier and log top-1/top-5 accuracy.

    Builds the network selected by ``args.model``, loads its weights with the
    file's ``load_model`` helper, runs the validation set once and logs the
    per-batch and final accuracies through ``_logger``.

    NOTE: the function name shadows the ``eval`` builtin; it is kept because
    callers reference it by this name.

    Args:
        args: Parsed options. Reads ``data`` ("mnist" or "imagenet"),
            ``model``, ``use_gpu``, ``batch_size``, ``model_path`` and
            ``log_period``.

    Raises:
        ValueError: If ``args.data`` is not a supported dataset name.
        AssertionError: If ``args.model`` is not in ``model_list``.
    """
    if args.data == "mnist":
        val_dataset = paddle.vision.datasets.MNIST(mode='test')
        class_dim = 10
        image_shape = "1,28,28"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))

    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    image = paddle.static.data(name='image',
                               shape=[None] + image_shape,
                               dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)
    val_program = paddle.static.default_main_program().clone(for_test=True)

    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    valid_loader = paddle.io.DataLoader(val_dataset,
                                        places=place,
                                        feed_list=[image, label],
                                        drop_last=False,
                                        return_list=False,
                                        batch_size=args.batch_size,
                                        shuffle=False)

    # Weights are loaded after the startup program has run, so the saved
    # values are what the evaluation below sees.
    load_model(exe, val_program, args.model_path)

    acc_top1_ns = []
    acc_top5_ns = []
    for batch_id, data in enumerate(valid_loader):
        start_time = time.time()
        acc_top1_n, acc_top5_n = exe.run(
            val_program,
            feed=data,
            fetch_list=[acc_top1.name, acc_top5.name])
        end_time = time.time()
        if batch_id % args.log_period == 0:
            _logger.info(
                "Eval batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".format(
                    batch_id, np.mean(acc_top1_n), np.mean(acc_top5_n),
                    end_time - start_time))
        acc_top1_ns.append(np.mean(acc_top1_n))
        acc_top5_ns.append(np.mean(acc_top5_n))

    # The final figure is the unweighted mean of the per-batch means.
    _logger.info("Final eval - acc_top1: {}; acc_top5: {}".format(
        np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))
def quantize(args):
    """Run post-training static quantization on a saved inference model.

    Builds an ImageNet calibration loader and hands it, together with the
    quantization options from ``args``, to ``quant_post_static``.

    Args:
        args: Parsed options. Reads ``ce_test``, ``use_gpu``, ``input_name``,
            ``batch_size``, ``model_path``, ``save_path``, ``model_filename``,
            ``params_filename``, ``batch_num``, ``algo``, ``round_type``,
            ``hist_percent``, ``is_full_quantize``, ``bias_correction`` and
            ``onnx_format``.

    Raises:
        AssertionError: If ``args.model_path`` does not exist or is not a
            directory.
    """
    if args.ce_test:
        # set seed
        seed = 111
        np.random.seed(seed)
        paddle.seed(seed)
        random.seed(seed)

    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    val_dataset = reader.ImageNetDataset(mode='test')
    image_shape = [3, 224, 224]
    image = paddle.static.data(name=args.input_name,
                               shape=[None] + image_shape,
                               dtype='float32')

    # NOTE(review): an earlier revision computed a `shuffle` flag (True unless
    # ce_test) but never passed it on, so calibration data has always been
    # read unshuffled. That effective behaviour is preserved here; confirm
    # whether shuffling was actually intended before wiring it in.
    data_loader = paddle.io.DataLoader(val_dataset,
                                       places=place,
                                       feed_list=[image],
                                       drop_last=False,
                                       return_list=False,
                                       batch_size=args.batch_size,
                                       shuffle=False)

    assert os.path.exists(args.model_path), "args.model_path doesn't exist"
    assert os.path.isdir(args.model_path), "args.model_path must be a dir"

    exe = paddle.static.Executor(place)
    quant_post_static(executor=exe,
                      model_dir=args.model_path,
                      quantize_model_path=args.save_path,
                      data_loader=data_loader,
                      model_filename=args.model_filename,
                      params_filename=args.params_filename,
                      batch_size=args.batch_size,
                      batch_nums=args.batch_num,
                      algo=args.algo,
                      round_type=args.round_type,
                      hist_percent=args.hist_percent,
                      is_full_quantize=args.is_full_quantize,
                      bias_correction=args.bias_correction,
                      onnx_format=args.onnx_format)
def compress(args):
    # Train a MobileNetV1 classifier in dynamic-graph mode while an
    # UnstructuredPruner masks its weights, periodically evaluating and
    # saving the pruned model and optimizer state.
    #
    # `args` fields read here: use_gpu, data ("imagenet" or "cifar10"),
    # batch_size, batch_size_for_validation, num_workers, pretrained_model,
    # log_period, pruning_mode, ratio, threshold, resume_epoch, num_epochs,
    # test_period, model_period, model_path.
    #
    # Raises ValueError when args.data names an unsupported dataset.
    if args.use_gpu:
        place = paddle.set_device('gpu')
    else:
        place = paddle.set_device('cpu')

    # Only initialise the parallel environment when more than one trainer
    # process is reported.
    trainer_num = paddle.distributed.get_world_size()
    use_data_parallel = trainer_num != 1
    if use_data_parallel:
        dist.init_parallel_env()

    # NOTE(review): these two names are never read again in this function.
    train_reader = None
    test_reader = None
    if args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
    elif args.data == "cifar10":
        normalize = T.Normalize(mean=[0.5, 0.5, 0.5],
                                std=[0.5, 0.5, 0.5],
                                data_format='CHW')
        transform = T.Compose([T.Transpose(), normalize])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       backend='cv2',
                                                       transform=transform)
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     backend='cv2',
                                                     transform=transform)
        class_dim = 10
    else:
        raise ValueError("{} is not supported.".format(args.data))

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)

    train_loader = paddle.io.DataLoader(train_dataset,
                                        places=place,
                                        batch_sampler=batch_sampler,
                                        return_list=True,
                                        num_workers=args.num_workers,
                                        use_shared_memory=True)

    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        drop_last=False,
        return_list=True,
        batch_size=args.batch_size_for_validation,
        shuffle=False,
        use_shared_memory=True)
    # Steps per epoch for one rank: dataset size / per-rank batch / rank count.
    step_per_epoch = int(
        np.ceil(len(train_dataset) / args.batch_size / ParallelEnv().nranks))

    # model definition
    model = mobilenet_v1(num_classes=class_dim, pretrained=True)
    if ParallelEnv().nranks > 1:
        model = paddle.DataParallel(model)

    # An explicitly supplied checkpoint overrides the weights set above.
    if args.pretrained_model is not None:
        model.set_state_dict(paddle.load(args.pretrained_model))

    opt, learning_rate = create_optimizer(args, step_per_epoch, model)

    def test(epoch):
        # Run one pass over valid_loader and log top-1/top-5 accuracy.
        model.eval()
        acc_top1_ns = []
        acc_top5_ns = []
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            x_data = data[0]
            y_data = paddle.to_tensor(data[1])
            # For cifar10 the label tensor gets an extra axis at position 1
            # before it is passed to the loss/metric calls.
            if args.data == 'cifar10':
                y_data = paddle.unsqueeze(y_data, 1)

            logits = model(x_data)
            # NOTE(review): `loss` is computed but not used in this function.
            loss = F.cross_entropy(logits, y_data)
            acc_top1 = paddle.metric.accuracy(logits, y_data, k=1)
            acc_top5 = paddle.metric.accuracy(logits, y_data, k=5)
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}"
                    .format(epoch, batch_id, np.mean(acc_top1.numpy()),
                            np.mean(acc_top5.numpy()), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1.numpy()))
            acc_top5_ns.append(np.mean(acc_top5.numpy()))

        # Final numbers are the unweighted mean of the per-batch means.
        _logger.info(
            "Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
                epoch, np.mean(np.array(acc_top1_ns, dtype="object")),
                np.mean(np.array(acc_top5_ns, dtype="object"))))

    def train(epoch):
        # Run one training epoch. `pruner` is bound in the enclosing scope
        # after this definition but before the first call.
        model.train()

        # Accumulators are reset every time a log line is emitted.
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()

        for batch_id, data in enumerate(train_loader):
            # Time spent waiting for the loader to yield this batch.
            train_reader_cost += time.time() - reader_start
            x_data = data[0]
            y_data = paddle.to_tensor(data[1])
            if args.data == 'cifar10':
                y_data = paddle.unsqueeze(y_data, 1)

            train_start = time.time()
            logits = model(x_data)
            loss = F.cross_entropy(logits, y_data)
            acc_top1 = paddle.metric.accuracy(logits, y_data, k=1)
            acc_top5 = paddle.metric.accuracy(logits, y_data, k=5)

            loss.backward()
            opt.step()
            learning_rate.step()
            opt.clear_grad()
            # The pruner is stepped once per batch, after the optimizer
            # update and gradient reset.
            pruner.step()

            train_run_cost += time.time() - train_start
            total_samples += args.batch_size * ParallelEnv().nranks

            if batch_id % args.log_period == 0:
                # The averages divide by log_period even for batch 0, where
                # only a single batch has been accumulated.
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec"
                    .format(
                        epoch, batch_id, opt.get_lr(), np.mean(loss.numpy()),
                        np.mean(acc_top1.numpy()), np.mean(acc_top5.numpy()),
                        train_reader_cost / args.log_period,
                        (train_reader_cost + train_run_cost) / args.log_period,
                        total_samples / args.log_period,
                        total_samples / (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            # Restart the reader timer just before the next batch is fetched.
            reader_start = time.time()

    pruner = UnstructuredPruner(model,
                                mode=args.pruning_mode,
                                ratio=args.ratio,
                                threshold=args.threshold)

    # Epoch numbering starts one past args.resume_epoch.
    for i in range(args.resume_epoch + 1, args.num_epochs):
        train(i)
        if (i + 1) % args.test_period == 0:
            # update_params() is called before measuring and evaluating.
            pruner.update_params()
            _logger.info(
                "The current density of the pruned model is: {}%".format(
                    round(100 * UnstructuredPruner.total_sparse(model), 2)))
            test(i)
        if (i + 1) % args.model_period == 0:
            pruner.update_params()
            # Both files use fixed names, so each save overwrites the last.
            paddle.save(model.state_dict(),
                        os.path.join(args.model_path, "model-pruned.pdparams"))
            paddle.save(opt.state_dict(),
                        os.path.join(args.model_path, "opt-pruned.pdopt"))
def compress(args):
    """Run quantization-aware training (optionally with PACT) and export.

    Workflow:
      1. Build the static-graph classifier and data loaders.
      2. If ``args.analysis`` is set, collect activation abs-max statistics
         with ``VarCollector``, save them to ``pact_thres.npy`` and return.
      3. Otherwise insert fake-quant ops with ``quant_aware``, train for
         ``args.num_epochs`` epochs, track the best top-1 checkpoint,
         ``convert`` the best program and save an inference model.

    Args:
        args: Parsed options. Reads ``ce_test``, ``data``, ``model``,
            ``use_pact``, ``analysis``, ``use_gpu``, ``batch_size``, ``lr``,
            ``pretrained_model``, ``log_period``, ``checkpoint_dir``,
            ``checkpoint_epoch``, ``num_epochs`` and ``output_dir``.

    Returns:
        None.

    Raises:
        ValueError: If ``args.data`` is not "mnist" or "imagenet".
        AssertionError: If ``args.model`` is unknown, the pretrained model
            path does not exist, or ``checkpoint_dir`` is given without
            ``checkpoint_epoch``.
    """
    num_workers = 4
    shuffle = True
    if args.ce_test:
        # set seed
        seed = 111
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        num_workers = 0
        shuffle = False

    if args.data == "mnist":
        train_dataset = paddle.vision.datasets.MNIST(mode='train')
        val_dataset = paddle.vision.datasets.MNIST(mode='test')
        class_dim = 10
        image_shape = "1,28,28"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))

    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    image = paddle.static.data(name='image',
                               shape=[None] + image_shape,
                               dtype='float32')
    if args.use_pact:
        image.stop_gradient = False
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    train_prog = paddle.static.default_main_program()
    # The test program is cloned before the optimizer ops are appended.
    val_program = paddle.static.default_main_program().clone(for_test=True)
    if not args.analysis:
        learning_rate, opt = create_optimizer(args)
        opt.minimize(avg_cost)

    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    places = paddle.static.cuda_places(
    ) if args.use_gpu else paddle.static.cpu_places()
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    train_loader = paddle.io.DataLoader(train_dataset,
                                        places=places,
                                        feed_list=[image, label],
                                        drop_last=True,
                                        return_list=False,
                                        batch_size=args.batch_size,
                                        use_shared_memory=True,
                                        shuffle=shuffle,
                                        num_workers=num_workers)
    valid_loader = paddle.io.DataLoader(val_dataset,
                                        places=place,
                                        feed_list=[image, label],
                                        drop_last=False,
                                        return_list=False,
                                        batch_size=args.batch_size,
                                        use_shared_memory=True,
                                        shuffle=False)

    if args.analysis:
        # get all activations names
        activates = [
            'pool2d_1.tmp_0', 'tmp_35', 'batch_norm_21.tmp_2', 'tmp_26',
            'elementwise_mul_5.tmp_0', 'pool2d_5.tmp_0',
            'elementwise_add_5.tmp_0', 'relu_2.tmp_0', 'pool2d_3.tmp_0',
            'conv2d_40.tmp_2', 'elementwise_mul_0.tmp_0', 'tmp_62',
            'elementwise_add_8.tmp_0', 'batch_norm_39.tmp_2',
            'conv2d_32.tmp_2', 'tmp_17', 'tmp_5', 'elementwise_add_9.tmp_0',
            'pool2d_4.tmp_0', 'relu_0.tmp_0', 'tmp_53', 'relu_3.tmp_0',
            'elementwise_add_4.tmp_0', 'elementwise_add_6.tmp_0', 'tmp_11',
            'conv2d_36.tmp_2', 'relu_8.tmp_0', 'relu_5.tmp_0',
            'pool2d_7.tmp_0', 'elementwise_add_2.tmp_0',
            'elementwise_add_7.tmp_0', 'pool2d_2.tmp_0', 'tmp_47',
            'batch_norm_12.tmp_2', 'elementwise_mul_6.tmp_0',
            'elementwise_mul_7.tmp_0', 'pool2d_6.tmp_0', 'relu_6.tmp_0',
            'elementwise_add_0.tmp_0', 'elementwise_mul_3.tmp_0',
            'conv2d_12.tmp_2', 'elementwise_mul_2.tmp_0', 'tmp_8', 'tmp_2',
            'conv2d_8.tmp_2', 'elementwise_add_3.tmp_0',
            'elementwise_mul_1.tmp_0', 'pool2d_8.tmp_0', 'conv2d_28.tmp_2',
            'image', 'conv2d_16.tmp_2', 'batch_norm_33.tmp_2', 'relu_1.tmp_0',
            'pool2d_0.tmp_0', 'tmp_20', 'conv2d_44.tmp_2', 'relu_10.tmp_0',
            'tmp_41', 'relu_4.tmp_0', 'elementwise_add_1.tmp_0', 'tmp_23',
            'batch_norm_6.tmp_2', 'tmp_29', 'elementwise_mul_4.tmp_0',
            'tmp_14'
        ]
        var_collector = VarCollector(train_prog, activates, use_ema=True)
        values = var_collector.abs_max_run(train_loader,
                                           exe,
                                           step=None,
                                           loss_name=avg_cost.name)
        np.save('pact_thres.npy', values)
        _logger.info(values)
        _logger.info("PACT threshold have been saved as pact_thres.npy")
        # Draw Histogram in 'dist_pdf/result.pdf'
        # var_collector.pdf(values)
        return

    # Initial PACT thresholds: 20 for any activation that has no saved value.
    values = defaultdict(lambda: 20)
    try:
        # The loaded dict is merged INTO the defaultdict so that missing keys
        # still fall back to 20. The previous code rebound `values` to the
        # loaded dict and then referenced an undefined name, which always
        # raised and was silently swallowed.
        # SECURITY: allow_pickle=True unpickles the file; only load
        # pact_thres.npy produced by a trusted analysis run.
        loaded = np.load("pact_thres.npy", allow_pickle=True).item()
        values.update(loaded)
        _logger.info("pact_thres.npy info loaded.")
    except Exception:
        # Best effort: a missing or unreadable file keeps the default of 20.
        # (Exception, not a bare except, so Ctrl-C is not swallowed.)
        _logger.info(
            "cannot find pact_thres.npy. Set init PACT threshold as 20.")
    _logger.info(values)

    # 1. quantization configs
    quant_config = {
        # weight quantize type, default is 'channel_wise_abs_max'
        'weight_quantize_type': 'channel_wise_abs_max',
        # activation quantize type, default is 'moving_average_abs_max'
        'activation_quantize_type': 'moving_average_abs_max',
        # weight quantize bit num, default is 8
        'weight_bits': 8,
        # activation quantize bit num, default is 8
        'activation_bits': 8,
        # ops of name_scope in not_quant_pattern list, will not be quantized
        'not_quant_pattern': ['skip_quant'],
        # ops of type in quantize_op_types, will be quantized
        'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
        # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'
        'dtype': 'int8',
        # window size for 'range_abs_max' quantization. default is 10000
        'window_size': 10000,
        # The decay coefficient of moving average, default is 0.9
        'moving_rate': 0.9,
    }

    # 2. quantization transform programs (training aware)
    #    Make some quantization transforms in the graph before training and testing.
    #    According to the weight and activation quantization type, the graph will be added
    #    some fake quantize operators and fake dequantize operators.

    def pact(x):
        """Clip ``x`` to [-u, u] with a learnable threshold ``u``."""
        # locals() must be evaluated first so it contains only `x`.
        helper = LayerHelper("pact", **locals())
        dtype = 'float32'
        init_thres = values[x.name.split('_tmp_input')[0]]
        u_param_attr = paddle.ParamAttr(
            name=x.name + '_pact',
            initializer=paddle.nn.initializer.Constant(value=init_thres),
            regularizer=paddle.regularizer.L2Decay(0.0001),
            learning_rate=1)
        u_param = helper.create_parameter(attr=u_param_attr,
                                          shape=[1],
                                          dtype=dtype)

        # part_a is the excess above +u, part_b the excess below -u.
        part_a = paddle.nn.functional.relu(x - u_param)
        part_b = paddle.nn.functional.relu(-u_param - x)
        x = x - part_a + part_b
        return x

    def get_optimizer():
        """Return the optimizer handed to quant_aware for the PACT params."""
        return paddle.optimizer.Momentum(args.lr, 0.9)

    if args.use_pact:
        act_preprocess_func = pact
        optimizer_func = get_optimizer
        executor = exe
    else:
        act_preprocess_func = None
        optimizer_func = None
        executor = None

    val_program = quant_aware(val_program,
                              place,
                              quant_config,
                              scope=None,
                              act_preprocess_func=act_preprocess_func,
                              optimizer_func=optimizer_func,
                              executor=executor,
                              for_test=True)
    compiled_train_prog = quant_aware(train_prog,
                                      place,
                                      quant_config,
                                      scope=None,
                                      act_preprocess_func=act_preprocess_func,
                                      optimizer_func=optimizer_func,
                                      executor=executor,
                                      for_test=False)

    # NOTE(review): the assert makes a pretrained model mandatory, so the
    # truthiness check below is effectively always true. Kept as-is.
    assert os.path.exists(
        args.pretrained_model), "pretrained_model doesn't exist"
    if args.pretrained_model:
        paddle.static.load(train_prog, args.pretrained_model, exe)

    def test(epoch, program):
        """Evaluate ``program`` on the validation set; return mean top-1."""
        batch_id = 0
        acc_top1_ns = []
        acc_top5_ns = []
        for data in valid_loader():
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {:.6f}; acc_top5: {:.6f}; time: {:.3f}"
                    .format(epoch, batch_id, np.mean(acc_top1_n),
                            np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))
            batch_id += 1

        _logger.info(
            "Final eval epoch[{}] - acc_top1: {:.6f}; acc_top5: {:.6f}".format(
                epoch, np.mean(np.array(acc_top1_ns)),
                np.mean(np.array(acc_top5_ns))))
        return np.mean(np.array(acc_top1_ns))

    def train(epoch, compiled_train_prog, lr):
        """Train one epoch, stepping the LR scheduler ``lr`` every batch."""
        batch_id = 0
        for data in train_loader():
            start_time = time.time()
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                compiled_train_prog,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            end_time = time.time()
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {:.6f}; acc_top1: {:.6f}; acc_top5: {:.6f}; time: {:.3f}"
                    .format(epoch, batch_id, learning_rate.get_lr(), loss_n,
                            acc_top1_n, acc_top5_n, end_time - start_time))

            if args.use_pact and batch_id % 1000 == 0:
                # Dump the current value of every PACT threshold parameter.
                threshold = {}
                for var in val_program.list_vars():
                    if 'pact' in var.name:
                        array = np.array(paddle.static.global_scope().find_var(
                            var.name).get_tensor())
                        threshold[var.name] = array[0]
                _logger.info(threshold)
            batch_id += 1
            lr.step()

    build_strategy = paddle.static.BuildStrategy()
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    exec_strategy = paddle.static.ExecutionStrategy()
    compiled_train_prog = compiled_train_prog.with_data_parallel(
        loss_name=avg_cost.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    # train loop
    best_acc1 = 0.0
    best_epoch = 0

    start_epoch = 0
    if args.checkpoint_dir is not None:
        assert args.checkpoint_epoch is not None, "checkpoint_epoch must be set"
        start_epoch = args.checkpoint_epoch
        paddle.static.load(executor=exe,
                           model_path=args.checkpoint_dir,
                           program=val_program)

    for i in range(start_epoch, args.num_epochs):
        train(i, compiled_train_prog, learning_rate)
        acc1 = test(i, val_program)

        # A single tracker replaces the two identical best-accuracy trackers
        # the loop used to maintain.
        improved = acc1 > best_acc1
        if improved:
            best_acc1 = acc1
            best_epoch = i
        _logger.info("Best Validation Acc1: {:.6f}, at epoch {}".format(
            best_acc1, best_epoch))

        # Every epoch gets its own checkpoint; the best one is also mirrored
        # to 'best_model'.
        paddle.static.save(model_path=os.path.join(args.output_dir, str(i)),
                           program=val_program)
        if improved:
            paddle.static.save(model_path=os.path.join(args.output_dir,
                                                       'best_model'),
                               program=val_program)

    # Restore the best weights (if any were saved) before conversion.
    if os.path.exists(os.path.join(args.output_dir, 'best_model.pdparams')):
        paddle.static.load(executor=exe,
                           model_path=os.path.join(args.output_dir,
                                                   'best_model'),
                           program=val_program)

    # 3. Freeze the graph after training by adjusting the quantize
    #    operators' order for the inference.
    #    The dtype of float_program's weights is float32, but in int8 range.
    float_program, int8_program = convert(val_program,
                                          place,
                                          quant_config,
                                          scope=None,
                                          save_int8=True)
    _logger.info("eval best_model after convert")
    final_acc1 = test(best_epoch, float_program)
    _logger.info("final acc:{}".format(final_acc1))

    # 4. Save inference model
    model_path = os.path.join(
        quantization_model_save_dir, args.model,
        'act_' + quant_config['activation_quantize_type'] + '_w_' +
        quant_config['weight_quantize_type'])
    float_path = os.path.join(model_path, 'float')
    if not os.path.isdir(model_path):
        os.makedirs(model_path)

    paddle.fluid.io.save_inference_model(dirname=float_path,
                                         feeded_var_names=[image.name],
                                         target_vars=[out],
                                         executor=exe,
                                         main_program=float_program,
                                         model_filename=float_path + '/model',
                                         params_filename=float_path +
                                         '/params')
def compress(args):
    """Prune a static-graph classifier's filters and fine-tune the result.

    Builds the network named by ``args.model``, evaluates it once, prunes the
    parameters returned by ``get_pruned_params`` with ``Pruner`` at
    ``args.pruned_ratio``, then trains the pruned program for
    ``args.num_epochs`` epochs with periodic evaluation and saving.

    Args:
        args: Parsed options. Reads ``data``, ``model``, ``use_gpu``,
            ``batch_size``, ``pretrained_model``, ``log_period``,
            ``criterion``, ``pruned_ratio``, ``num_epochs``, ``test_period``,
            ``model_path`` and ``save_inference``.

    Raises:
        ValueError: If ``args.data`` is not "mnist" or "imagenet".
        AssertionError: If ``args.model`` is not in ``model_list``.
    """
    if args.data == "mnist":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.MNIST(mode='train',
                                                     backend="cv2",
                                                     transform=transform)
        val_dataset = paddle.vision.datasets.MNIST(mode='test',
                                                   backend="cv2",
                                                   transform=transform)
        class_dim = 10
        image_shape = "1,28,28"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))

    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    places = paddle.static.cuda_places(
    ) if args.use_gpu else paddle.static.cpu_places()
    place = places[0]
    exe = paddle.static.Executor(place)

    image = paddle.static.data(name='image',
                               shape=[None] + image_shape,
                               dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    # args.batch_size is the global batch; each place gets an equal share.
    batch_size_per_card = int(args.batch_size / len(places))
    train_loader = paddle.io.DataLoader(train_dataset,
                                        places=places,
                                        feed_list=[image, label],
                                        drop_last=True,
                                        batch_size=batch_size_per_card,
                                        shuffle=True,
                                        return_list=False,
                                        use_shared_memory=True,
                                        num_workers=16)
    valid_loader = paddle.io.DataLoader(val_dataset,
                                        places=place,
                                        feed_list=[image, label],
                                        drop_last=False,
                                        return_list=False,
                                        use_shared_memory=True,
                                        batch_size=batch_size_per_card,
                                        shuffle=False)
    step_per_epoch = int(np.ceil(len(train_dataset) * 1. / args.batch_size))

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    # The test program is cloned before the optimizer ops are appended.
    val_program = paddle.static.default_main_program().clone(for_test=True)
    opt, learning_rate = create_optimizer(args, step_per_epoch)
    opt.minimize(avg_cost)

    exe.run(paddle.static.default_startup_program())

    if args.pretrained_model:
        _logger.info("Load pretrained model from {}".format(
            args.pretrained_model))
        paddle.static.load(paddle.static.default_main_program(),
                           args.pretrained_model, exe)

    def test(epoch, program):
        """Evaluate ``program`` on the validation set and log accuracy."""
        acc_top1_ns = []
        acc_top5_ns = []
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}"
                    .format(epoch, batch_id, np.mean(acc_top1_n),
                            np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info(
            "Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
                epoch, np.mean(np.array(acc_top1_ns)),
                np.mean(np.array(acc_top5_ns))))

    def train(epoch, program):
        """Train ``program`` for one epoch, stepping the LR every batch."""
        for batch_id, data in enumerate(train_loader):
            start_time = time.time()
            # Run the program that was passed in. The earlier version ignored
            # this parameter and read the enclosing `train_program` instead.
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            end_time = time.time()
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; time: {}"
                    .format(epoch, batch_id, learning_rate.get_lr(), loss_n,
                            acc_top1_n, acc_top5_n, end_time - start_time))
            learning_rate.step()

    # Baseline accuracy of the unpruned network.
    test(0, val_program)

    params = get_pruned_params(args, paddle.static.default_main_program())
    _logger.info("FLOPs before pruning: {}".format(
        flops(paddle.static.default_main_program())))
    pruner = Pruner(args.criterion)
    # The validation program is pruned with only_graph=True; the training
    # program is pruned without that flag.
    pruned_val_program, _, _ = pruner.prune(val_program,
                                            paddle.static.global_scope(),
                                            params=params,
                                            ratios=[args.pruned_ratio] *
                                            len(params),
                                            place=place,
                                            only_graph=True)

    pruned_program, _, _ = pruner.prune(paddle.static.default_main_program(),
                                        paddle.static.global_scope(),
                                        params=params,
                                        ratios=[args.pruned_ratio] *
                                        len(params),
                                        place=place)
    _logger.info("FLOPs after pruning: {}".format(flops(pruned_program)))

    build_strategy = paddle.static.BuildStrategy()
    exec_strategy = paddle.static.ExecutionStrategy()
    train_program = paddle.static.CompiledProgram(
        pruned_program).with_data_parallel(loss_name=avg_cost.name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy)

    for i in range(args.num_epochs):
        train(i, train_program)
        if (i + 1) % args.test_period == 0:
            test(i, pruned_val_program)
            save_model(exe, pruned_val_program,
                       os.path.join(args.model_path, str(i)))
        if args.save_inference:
            infer_model_path = os.path.join(args.model_path, "infer_models",
                                            str(i))
            paddle.static.save_inference_model(infer_model_path, [image],
                                               [out],
                                               exe,
                                               program=pruned_val_program)
            _logger.info(
                "Saved inference model into [{}]".format(infer_model_path))
def compress(args):
    """Evaluate a saved unstructured-pruned MobileNetV1 checkpoint.

    Loads the state dict at ``args.pruned_model`` into a ``mobilenet_v1``
    model, logs the value reported by ``UnstructuredPruner.total_sparse`` and
    runs one evaluation pass over the validation set.

    Args:
        args: Parsed options. Reads ``data`` ("imagenet" or "cifar10"),
            ``use_gpu``, ``batch_size``, ``log_period`` and ``pruned_model``.

    Raises:
        ValueError: If ``args.data`` is not a supported dataset name.
    """
    if args.data == "imagenet":
        import imagenet_reader as reader
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
    elif args.data == "cifar10":
        normalize = T.Normalize(mean=[0.5, 0.5, 0.5],
                                std=[0.5, 0.5, 0.5],
                                data_format='CHW')
        transform = T.Compose([T.Transpose(), normalize])
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     backend='cv2',
                                                     transform=transform)
        class_dim = 10
    else:
        raise ValueError("{} is not supported.".format(args.data))

    places = paddle.static.cuda_places(
    ) if args.use_gpu else paddle.static.cpu_places()

    valid_loader = paddle.io.DataLoader(val_dataset,
                                        places=places,
                                        drop_last=False,
                                        return_list=True,
                                        batch_size=args.batch_size,
                                        shuffle=False,
                                        use_shared_memory=True)

    # model definition
    model = mobilenet_v1(num_classes=class_dim, pretrained=True)

    def test(epoch):
        """Run one pass over the validation set and log top-1/top-5."""
        model.eval()
        acc_top1_ns = []
        acc_top5_ns = []
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            x_data = data[0]
            y_data = paddle.to_tensor(data[1])
            # For cifar10 the label tensor gets an extra axis at position 1
            # before it is passed to the metric calls.
            if args.data == 'cifar10':
                y_data = paddle.unsqueeze(y_data, 1)

            logits = model(x_data)
            # Only accuracy is reported, so no loss is computed here.
            acc_top1 = paddle.metric.accuracy(logits, y_data, k=1)
            acc_top5 = paddle.metric.accuracy(logits, y_data, k=5)
            end_time = time.time()

            # Convert once per batch and reuse for logging and accumulation.
            top1 = np.mean(acc_top1.numpy())
            top5 = np.mean(acc_top5.numpy())
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}"
                    .format(epoch, batch_id, top1, top5,
                            end_time - start_time))
            acc_top1_ns.append(top1)
            acc_top5_ns.append(top5)

        # Final numbers are the unweighted mean of the per-batch means.
        _logger.info(
            "Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
                epoch, np.mean(np.array(acc_top1_ns, dtype="object")),
                np.mean(np.array(acc_top5_ns, dtype="object"))))

    model.set_state_dict(paddle.load(args.pruned_model))
    _logger.info("The current sparsity of the pruned model is: {}%".format(
        round(100 * UnstructuredPruner.total_sparse(model), 2)))
    test(0)
def search_mobilenetv2(config, args, image_size, is_server=True):
    """Search MobileNetV2 architectures with the DDPG-based ``RLNAS``.

    For every search step: ask the controller for an architecture, build
    train/test programs for it with ``build_program``, train for
    ``args.retain_epoch`` epochs, evaluate, and feed the mean top-1 accuracy
    back to the controller as the reward.

    Args:
        config: Search-space configuration passed to ``RLNAS`` as ``configs``.
        args: Parsed options. Reads ``use_gpu``, ``server_address``, ``port``,
            ``data`` ("cifar10" or "imagenet"), ``search_steps`` and
            ``retain_epoch``.
        image_size: Height and width of the (square) input image.
        is_server: When True the controller is created without an explicit
            ``is_server`` argument; when False it is created with
            ``is_server=False`` (client only).

    Raises:
        ValueError: If ``args.data`` is not a supported dataset name.
        SystemExit: After the third search step (see note at the end).
    """
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    if is_server:
        ### start a server and a client
        rl_nas = RLNAS(
            key='ddpg',
            configs=config,
            is_sync=False,
            obs_dim=26,  ### step + length_of_token
            server_addr=(args.server_address, args.port))
    else:
        ### start a client
        rl_nas = RLNAS(key='ddpg',
                       configs=config,
                       is_sync=False,
                       obs_dim=26,
                       server_addr=(args.server_address, args.port),
                       is_server=False)

    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')
    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')
    else:
        # Fail early instead of hitting an unbound-name error further down.
        raise ValueError("{} is not supported.".format(args.data))

    for step in range(args.search_steps):
        # Observation = [step] + previous action; the first step has no
        # previous tokens, so it uses all ones.
        if step == 0:
            action_prev = [1. for _ in rl_nas.range_tables]
        else:
            action_prev = rl_nas.tokens[0]
        obs = [step]
        obs.extend(action_prev)
        archs = rl_nas.next_archs(obs=obs)[0][0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        train_loader, avg_cost, acc_top1, acc_top5 = build_program(
            train_program, startup_program, image_shape, train_dataset, archs,
            args, places)

        test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
            test_program,
            startup_program,
            image_shape,
            val_dataset,
            archs,
            args,
            place,
            is_test=True)
        test_program = test_program.clone(for_test=True)

        exe = static.Executor(place)
        exe.run(startup_program)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(loss_name=avg_cost.name,
                                              build_strategy=build_strategy)

        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                # The log line labels this value "ms", so convert from the
                # seconds that time.time() differences yield.
                batch_time = (time.time() - s_time) * 1000
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
                        .format(step, epoch_id, batch_id, outs[0], batch_time))

        reward = []
        for batch_id, data in enumerate(test_loader()):
            test_fetches = [
                test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
            ]
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            # One mean per fetched quantity: [avg_cost, acc_top1, acc_top5].
            reward_avg = np.mean(np.array(batch_reward), axis=1)
            reward.append(reward_avg)

            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))

        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        obs = np.expand_dims(obs, axis=0).astype('float32')
        actions = rl_nas.tokens
        obs_next = [step + 1]
        obs_next.extend(actions[0])
        obs_next = np.expand_dims(obs_next, axis=0).astype('float32')

        # `np.bool` was removed in NumPy 1.24; the builtin `bool` yields the
        # same dtype on every NumPy version.
        is_last_step = step == args.search_steps - 1
        terminal = np.expand_dims([is_last_step], axis=0).astype(bool)

        # Index 1 of finally_reward is the mean top-1 accuracy.
        rl_nas.reward(np.expand_dims(np.float32(finally_reward[1]), axis=0),
                      obs=obs,
                      actions=actions.astype('float32'),
                      obs_next=obs_next,
                      terminal=terminal)

        # NOTE(review): this hard stop ends the process after three steps
        # regardless of args.search_steps. It looks like a demo/CI limit;
        # confirm before removing.
        if step == 2:
            sys.exit(0)
def compress(args):
    """Run quantization-aware training on a classifier and export the result.

    The function works on the default static-graph programs and proceeds in
    four stages:

    1. build the quantization config, the datasets and the network
       (cross-entropy loss plus top-1 / top-5 accuracy);
    2. rewrite the test and train programs with ``quant_aware``;
    3. train for ``args.num_epochs`` epochs, saving a checkpoint after every
       epoch and an extra ``best_model`` checkpoint whenever top-1 improves;
    4. reload the best checkpoint (if one was written), ``convert`` the test
       program and save the float inference model.

    Args:
        args: parsed options. Attributes read directly in this function:
            ``data``, ``model``, ``use_gpu``, ``pretrained_model``,
            ``batch_size``, ``log_period``, ``num_epochs`` and
            ``checkpoint_dir``. ``args`` is also forwarded to
            ``create_optimizer``.

    Raises:
        ValueError: if ``args.data`` is neither ``"mnist"`` nor ``"imagenet"``.
        AssertionError: if ``args.model`` is not in ``model_list`` or a
            configured ``args.pretrained_model`` path does not exist.
    """
    ############################################################################################################
    # 1. quantization configs
    ############################################################################################################
    quant_config = {
        # weight quantize type, default is 'channel_wise_abs_max'
        'weight_quantize_type': 'channel_wise_abs_max',
        # activation quantize type, default is 'moving_average_abs_max'
        'activation_quantize_type': 'moving_average_abs_max',
        # weight quantize bit num, default is 8
        'weight_bits': 8,
        # activation quantize bit num, default is 8
        'activation_bits': 8,
        # ops of name_scope in not_quant_pattern list, will not be quantized
        'not_quant_pattern': ['skip_quant'],
        # ops of type in quantize_op_types, will be quantized
        'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
        # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'
        'dtype': 'int8',
        # window size for 'range_abs_max' quantization. default is 10000
        'window_size': 10000,
        # The decay coefficient of moving average, default is 0.9
        'moving_rate': 0.9,
    }

    if args.data == "mnist":
        train_dataset = paddle.vision.datasets.MNIST(mode='train')
        val_dataset = paddle.vision.datasets.MNIST(mode='test')
        class_dim = 10
        image_shape = "1,28,28"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))

    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    image = paddle.static.data(
        name='image', shape=[None] + image_shape, dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    train_prog = paddle.static.default_main_program()
    val_program = paddle.static.default_main_program().clone(for_test=True)

    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()

    ############################################################################################################
    # 2. quantization transform programs (training aware)
    #    Make some quantization transforms in the graph before training and testing.
    #    According to the weight and activation quantization type, the graph will be added
    #    some fake quantize operators and fake dequantize operators.
    ############################################################################################################
    val_program = quant_aware(
        val_program, place, quant_config, scope=None, for_test=True)
    compiled_train_prog = quant_aware(
        train_prog, place, quant_config, scope=None, for_test=False)

    opt = create_optimizer(args)
    opt.minimize(avg_cost)

    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    # The existence check only makes sense when a path was actually given;
    # checking first would make the `if` below unreachable for empty values
    # and raise TypeError for None.
    if args.pretrained_model:
        assert os.path.exists(
            args.pretrained_model), "pretrained_model doesn't exist"
        paddle.static.load(train_prog, args.pretrained_model, exe)

    places = paddle.static.cuda_places(
    ) if args.use_gpu else paddle.static.cpu_places()

    train_loader = paddle.io.DataLoader(
        train_dataset,
        places=places,
        feed_list=[image, label],
        drop_last=True,
        batch_size=args.batch_size,
        return_list=False,
        use_shared_memory=True,
        shuffle=True,
        num_workers=4)
    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        feed_list=[image, label],
        drop_last=False,
        return_list=False,
        batch_size=args.batch_size,
        use_shared_memory=True,
        shuffle=False)

    def test(epoch, program):
        """Run ``program`` over the validation loader; return mean top-1."""
        acc_top1_ns = []
        acc_top5_ns = []
        for batch_id, data in enumerate(valid_loader()):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}"
                    .format(epoch, batch_id, np.mean(acc_top1_n),
                            np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        mean_top1 = np.mean(np.array(acc_top1_ns))
        _logger.info(
            "Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
                epoch, mean_top1, np.mean(np.array(acc_top5_ns))))
        return mean_top1

    def train(epoch, program):
        """Run one pass of ``program`` over the training loader."""
        for batch_id, data in enumerate(train_loader()):
            start_time = time.time()
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            end_time = time.time()
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] - loss: {}; acc_top1: {}; acc_top5: {}; time: {}"
                    .format(epoch, batch_id, loss_n, acc_top1_n, acc_top5_n,
                            end_time - start_time))

    build_strategy = paddle.static.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    build_strategy.sync_batch_norm = False
    exec_strategy = paddle.static.ExecutionStrategy()
    compiled_train_prog = compiled_train_prog.with_data_parallel(
        loss_name=avg_cost.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    ############################################################################################################
    # train loop
    ############################################################################################################
    best_prefix = os.path.join(args.checkpoint_dir, 'best_model')
    best_acc1 = 0.0
    best_epoch = 0
    for i in range(args.num_epochs):
        train(i, compiled_train_prog)
        acc1 = test(i, val_program)
        paddle.static.save(
            program=val_program,
            model_path=os.path.join(args.checkpoint_dir, str(i)))
        if acc1 > best_acc1:
            best_acc1 = acc1
            best_epoch = i
            paddle.static.save(program=val_program, model_path=best_prefix)

    # `paddle.static.save` treats model_path as a file prefix and writes
    # `<prefix>.pdparams` (among others), so testing the bare prefix alone
    # would never find the checkpoint saved above. The bare path is still
    # accepted to keep working with a pre-existing `best_model` entry.
    if os.path.exists(best_prefix) or os.path.exists(
            best_prefix + '.pdparams'):
        paddle.static.load(
            executor=exe, model_path=best_prefix, program=val_program)

    ############################################################################################################
    # 3. Freeze the graph after training by adjusting the quantize
    #    operators' order for the inference.
    #    The dtype of float_program's weights is float32, but in int8 range.
    ############################################################################################################
    float_program, int8_program = convert(
        val_program, place, quant_config, scope=None, save_int8=True)
    print("eval best_model after convert")
    test(best_epoch, float_program)

    ############################################################################################################
    # 4. Save inference model
    ############################################################################################################
    model_path = os.path.join(
        quantization_model_save_dir, args.model,
        'act_' + quant_config['activation_quantize_type'] + '_w_' +
        quant_config['weight_quantize_type'])
    float_path = os.path.join(model_path, 'float')
    os.makedirs(model_path, exist_ok=True)

    paddle.fluid.io.save_inference_model(
        dirname=float_path,
        feeded_var_names=[image.name],
        target_vars=[out],
        executor=exe,
        main_program=float_program,
        model_filename=float_path + '/model',
        params_filename=float_path + '/params')
def compress(args):
    """Train a classifier with unstructured pruning ('gmp' or 'base' strategy).

    Builds the network in the default static-graph program, creates a pruner
    through ``create_unstructured_pruner``, optionally wraps the optimizer
    with Fleet when ``PADDLE_TRAINERS_NUM`` is greater than one, then trains
    from ``args.last_epoch + 1`` up to ``args.num_epochs``. After every epoch
    the pruner's ``update_params`` is called; evaluation and persistable
    saving happen every ``args.test_period`` / ``args.model_period`` epochs.

    Args:
        args: parsed options. Attributes read directly in this function:
            ``ce_test``, ``num_workers``, ``data``, ``model``, ``use_gpu``,
            ``batch_size``, ``batch_size_for_validation``,
            ``pruning_strategy``, ``stable_epochs``, ``pruning_epochs``,
            ``tunning_epochs``, ``last_epoch``, ``pruning_steps``,
            ``initial_ratio``, ``checkpoint``, ``pretrained_model``,
            ``log_period``, ``num_epochs``, ``test_period``,
            ``model_period`` and ``model_path``. ``args`` is also forwarded
            to ``create_optimizer`` and ``create_unstructured_pruner``.
            NOTE: ``args.num_workers`` (when ``ce_test`` is set) and
            ``args.pretrained_model`` (for mnist / cifar10) are overwritten
            in place.

    Raises:
        ValueError: if ``args.data`` or ``args.pruning_strategy`` is not one
            of the supported values.
        AssertionError: if the model name, checkpoint path or pretrained
            model path is invalid.
    """
    shuffle = True
    if args.ce_test:
        # set seed
        seed = 111
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        args.num_workers = 0
        shuffle = False

    env = os.environ
    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1))
    use_data_parallel = num_trainers > 1

    if use_data_parallel:
        # Fleet step 1: initialize the distributed environment
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

    if args.data == "mnist":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.MNIST(
            mode='train', backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.MNIST(
            mode='test', backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "1,28,28"
        args.pretrained_model = False
    elif args.data == "cifar10":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(
            mode="train", backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.Cifar10(
            mode="test", backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "3, 32, 32"
        args.pretrained_model = False
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))

    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    if args.use_gpu:
        places = paddle.static.cuda_places()
    else:
        places = paddle.static.cpu_places()
    place = places[0]
    exe = paddle.static.Executor(place)

    image = paddle.static.data(
        name='image', shape=[None] + image_shape, dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    batch_size_per_card = args.batch_size
    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=batch_size_per_card,
        shuffle=shuffle,
        drop_last=True)

    train_loader = paddle.io.DataLoader(
        train_dataset,
        places=place,
        batch_sampler=batch_sampler,
        feed_list=[image, label],
        return_list=False,
        use_shared_memory=True,
        num_workers=args.num_workers)
    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        feed_list=[image, label],
        drop_last=False,
        return_list=False,
        use_shared_memory=True,
        batch_size=args.batch_size_for_validation,
        shuffle=False)

    step_per_epoch = int(
        np.ceil(len(train_dataset) * 1. / args.batch_size / num_trainers))

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    if args.data == 'cifar10':
        label = paddle.reshape(label, [-1, 1])
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    # Cloned before the optimizer is attached, so it only holds forward ops.
    val_program = paddle.static.default_main_program().clone(for_test=True)

    opt, learning_rate = create_optimizer(args, step_per_epoch)

    # Fleet step 2: distributed strategy
    if use_data_parallel:
        dist_strategy = DistributedStrategy()
        dist_strategy.sync_batch_norm = False
        dist_strategy.exec_strategy = paddle.static.ExecutionStrategy()
        dist_strategy.fuse_all_reduce_ops = False

    train_program = paddle.static.default_main_program()

    if args.pruning_strategy == 'gmp':
        # GMP pruner step 0: define configs for GMP, no need to define configs for the base training.
        configs = {
            'stable_iterations': args.stable_epochs * step_per_epoch,
            'pruning_iterations': args.pruning_epochs * step_per_epoch,
            'tunning_iterations': args.tunning_epochs * step_per_epoch,
            'resume_iteration': (args.last_epoch + 1) * step_per_epoch,
            'pruning_steps': args.pruning_steps,
            'initial_ratio': args.initial_ratio,
        }
    elif args.pruning_strategy == 'base':
        configs = None
    else:
        # Without this branch `configs` stays unbound and the failure shows
        # up later as an unrelated-looking NameError.
        raise ValueError(
            "pruning_strategy '{}' is not supported, expected 'gmp' or 'base'."
            .format(args.pruning_strategy))

    # GMP pruner step 1: initialize a pruner object by calling entry function.
    pruner = create_unstructured_pruner(
        train_program, args, place, configs=configs)

    if use_data_parallel:
        # Fleet step 3: decorate the original optimizer and minimize it
        opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
    opt.minimize(avg_cost, no_grad_set=pruner.no_grad_set)

    exe.run(paddle.static.default_startup_program())

    if args.last_epoch > -1:
        assert args.checkpoint is not None and os.path.exists(
            args.checkpoint), "Please specify a valid checkpoint path."
        paddle.fluid.io.load_persistables(
            executor=exe, dirname=args.checkpoint, main_program=train_program)
    elif args.pretrained_model:
        assert os.path.exists(
            args.pretrained_model
        ), "Pretrained model path {} doesn't exist".format(
            args.pretrained_model)

        def if_exist(var):
            """Select only variables that have a file in the pretrained dir."""
            return os.path.exists(
                os.path.join(args.pretrained_model, var.name))

        _logger.info("Load pretrained model from {}".format(
            args.pretrained_model))
        # NOTE: We are using fluid.io.load_vars() because the pretrained model is from an older version which requires this API.
        # Please consider using paddle.static.load(program, model_path) when possible
        paddle.fluid.io.load_vars(
            exe, args.pretrained_model, predicate=if_exist)

    def test(epoch, program):
        """Log the model sparsity, then evaluate ``program`` on the val set."""
        acc_top1_ns = []
        acc_top5_ns = []

        _logger.info(
            "The current sparsity of the inference model is {}%".format(
                round(100 * UnstructuredPruner.total_sparse(
                    paddle.static.default_main_program()), 2)))
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
                    format(epoch, batch_id,
                           np.mean(acc_top1_n),
                           np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info("Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
            epoch,
            np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))

    def train(epoch, program):
        """Run one training epoch of ``program``, stepping pruner and LR."""
        # Timing / sample counters are reset every time a log line is written.
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            # GMP pruner step 2: step() to update ratios and other internal states of the pruner.
            pruner.step()
            train_run_cost += time.time() - train_start
            total_samples += args.batch_size
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch, batch_id,
                           learning_rate.get_lr(), loss_n, acc_top1_n,
                           acc_top5_n, train_reader_cost / args.log_period,
                           (train_reader_cost + train_run_cost
                            ) / args.log_period, total_samples /
                           args.log_period, total_samples /
                           (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            learning_rate.step()
            reader_start = time.time()

    if use_data_parallel:
        # Fleet step 4: get the compiled program from fleet
        compiled_train_program = fleet.main_program
    else:
        compiled_train_program = paddle.static.CompiledProgram(
            paddle.static.default_main_program())

    for i in range(args.last_epoch + 1, args.num_epochs):
        train(i, compiled_train_program)
        # GMP pruner step 3: update params before summarizing sparsity, saving model or evaluation.
        pruner.update_params()

        _logger.info("The current sparsity of the pruned model is: {}%".format(
            round(100 * UnstructuredPruner.total_sparse(
                paddle.static.default_main_program()), 2)))

        if (i + 1) % args.test_period == 0:
            test(i, val_program)
        if (i + 1) % args.model_period == 0:
            if use_data_parallel:
                fleet.save_persistables(executor=exe, dirname=args.model_path)
            else:
                paddle.fluid.io.save_persistables(
                    executor=exe, dirname=args.model_path)
def compress(args):
    """Evaluate a (pruned) classification model on the validation set.

    Builds the network in the default static-graph program, optionally loads
    the variables found under ``args.pruned_model`` and runs one evaluation
    pass that logs top-1 / top-5 accuracy. No training is performed, so only
    the validation dataset is instantiated.

    Args:
        args: parsed options. Attributes read here: ``data``, ``model``,
            ``use_gpu``, ``batch_size``, ``pruned_model`` and ``log_period``.

    Raises:
        ValueError: if ``args.data`` is neither ``"mnist"`` nor ``"imagenet"``.
        AssertionError: if ``args.model`` is not in ``model_list``.
    """
    if args.data == "mnist":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        val_dataset = paddle.vision.datasets.MNIST(
            mode='test', backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "1,28,28"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))

    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    places = paddle.static.cuda_places(
    ) if args.use_gpu else paddle.static.cpu_places()
    place = places[0]
    exe = paddle.static.Executor(place)

    image = paddle.static.data(
        name='image', shape=[None] + image_shape, dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    # The global batch size is split evenly over the available places.
    batch_size_per_card = int(args.batch_size / len(places))
    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        feed_list=[image, label],
        drop_last=False,
        return_list=False,
        use_shared_memory=True,
        batch_size=batch_size_per_card,
        shuffle=False)

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    # The loss ops are kept so the program matches the original graph layout,
    # even though only the accuracy variables are fetched below.
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    val_program = paddle.static.default_main_program().clone(for_test=True)

    exe.run(paddle.static.default_startup_program())

    if args.pruned_model:

        def if_exist(var):
            """Select only variables that have a file in the pruned-model dir."""
            return os.path.exists(os.path.join(args.pruned_model, var.name))

        _logger.info("Load pruned model from {}".format(args.pruned_model))
        paddle.fluid.io.load_vars(exe, args.pruned_model, predicate=if_exist)

    def test(epoch, program):
        """Log the pruner statistic, then evaluate ``program`` on the val set."""
        acc_top1_ns = []
        acc_top5_ns = []

        _logger.info(
            "The current density of the inference model is {}%".format(
                round(
                    100 * UnstructuredPruner.total_sparse(
                        paddle.static.default_main_program()), 2)))
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}"
                    .format(epoch, batch_id, np.mean(acc_top1_n),
                            np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info(
            "Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
                epoch, np.mean(np.array(acc_top1_ns)),
                np.mean(np.array(acc_top5_ns))))

    test(0, val_program)
def eval(args):
    """Evaluate an exported inference model on the ImageNet validation set.

    The model is loaded with ``paddle.fluid.io.load_inference_model``. Two
    kinds of exported model are handled, told apart by how many feed targets
    the model declares:

    * one feed target: the model takes an image and returns class scores;
      top-1 / top-5 accuracy is computed here with numpy;
    * otherwise: the model takes image and label and returns the accuracy
      values itself, which are averaged per batch.

    The mean over all batches is printed at the end.

    Args:
        args: parsed options. Attributes read here: ``use_gpu``,
            ``model_path``, ``model_name``, ``params_name`` and
            ``batch_size``.
    """
    if args.use_gpu:
        place = paddle.CUDAPlace(0)
    else:
        place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)

    loaded = paddle.fluid.io.load_inference_model(
        args.model_path,
        exe,
        model_filename=args.model_name,
        params_filename=args.params_name)
    val_program, feed_target_names, fetch_targets = loaded

    val_dataset = reader.ImageNetDataset(mode='val')
    image = paddle.static.data(
        name='image', shape=[None, 3, 224, 224], dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
    val_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        feed_list=[image, label],
        drop_last=False,
        return_list=True,
        batch_size=args.batch_size,
        use_shared_memory=True,
        shuffle=False)

    results = []
    for batch_id, data in enumerate(val_loader()):
        batch_image, batch_label = data[0], data[1]

        # top1_acc, top5_acc
        if len(feed_target_names) == 1:
            # eval "infer model", which input is image, output is classification probability
            fetched = exe.run(val_program,
                              feed={feed_target_names[0]: batch_image},
                              fetch_list=fetch_targets)
            scores = np.array(fetched[0])
            labels = np.array(batch_label)

            # argsort is ascending, so the best classes sit at the end of
            # each row; the slices are reversed to list the best one first.
            ranked = scores.argsort(axis=1)
            top_1_pred = ranked[:, -1:][:, ::-1]
            top_1 = np.mean(labels == top_1_pred)

            top_5_pred = ranked[:, -5:][:, ::-1]
            hits = sum(1 for row_label, row_top5 in zip(labels, top_5_pred)
                       if row_label[0] in row_top5)
            top_5 = float(hits) / len(labels)
            results.append([top_1, top_5])
        else:
            # eval "eval model", which inputs are image and label, output is top1 and top5 accuracy
            fetched = exe.run(val_program,
                              feed={
                                  feed_target_names[0]: batch_image,
                                  feed_target_names[1]: batch_label
                              },
                              fetch_list=fetch_targets)
            results.append([np.mean(r) for r in fetched])

        if batch_id % 100 == 0:
            print('Eval iter: ', batch_id)

    result = np.mean(np.array(results), axis=0)
    print("top1_acc/top5_acc= {}".format(result))
    sys.stdout.flush()
def compress(args):
    """Train a classifier while pruning it with ``UnstructuredPruner``.

    Builds the network in the default static-graph program, attaches the
    optimizer and an ``UnstructuredPruner`` configured from
    ``args.pruning_mode`` / ``args.ratio`` / ``args.threshold``, optionally
    loads pretrained variables, then trains from ``args.resume_epoch + 1`` up
    to ``args.num_epochs``. Evaluation runs every ``args.test_period`` epochs
    and parameters are saved every ``args.model_period`` epochs; in both
    cases ``pruner.update_params()`` is called first.

    Args:
        args: parsed options. Attributes read directly in this function:
            ``data``, ``model``, ``use_gpu``, ``batch_size``,
            ``batch_size_for_validation``, ``pruning_mode``, ``ratio``,
            ``threshold``, ``pretrained_model``, ``log_period``,
            ``resume_epoch``, ``num_epochs``, ``test_period``,
            ``model_period`` and ``model_path``. ``args`` is also forwarded
            to ``create_optimizer``.
            NOTE: ``args.pretrained_model`` is overwritten with ``False``
            when the dataset is mnist.

    Raises:
        ValueError: if ``args.data`` is neither ``"mnist"`` nor ``"imagenet"``.
        AssertionError: if ``args.model`` is not in ``model_list`` or the
            pretrained model path does not exist.
    """
    if args.data == "mnist":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.MNIST(
            mode='train', backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.MNIST(
            mode='test', backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "1,28,28"
        args.pretrained_model = False
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))

    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    if args.use_gpu:
        places = paddle.static.cuda_places()
    else:
        places = paddle.static.cpu_places()
    place = places[0]
    exe = paddle.static.Executor(place)

    image = paddle.static.data(
        name='image', shape=[None] + image_shape, dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    # The global batch size is split evenly over the available places.
    batch_size_per_card = int(args.batch_size / len(places))
    train_loader = paddle.io.DataLoader(
        train_dataset,
        places=places,
        feed_list=[image, label],
        drop_last=True,
        batch_size=batch_size_per_card,
        shuffle=True,
        return_list=False,
        use_shared_memory=True,
        num_workers=32)
    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        feed_list=[image, label],
        drop_last=False,
        return_list=False,
        use_shared_memory=True,
        batch_size=args.batch_size_for_validation,
        shuffle=False)

    step_per_epoch = int(np.ceil(len(train_dataset) * 1. / args.batch_size))

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    # Cloned before the optimizer is attached, so it only holds forward ops.
    val_program = paddle.static.default_main_program().clone(for_test=True)

    opt, learning_rate = create_optimizer(args, step_per_epoch)
    opt.minimize(avg_cost)

    pruner = UnstructuredPruner(
        paddle.static.default_main_program(),
        mode=args.pruning_mode,
        ratio=args.ratio,
        threshold=args.threshold,
        place=place)

    exe.run(paddle.static.default_startup_program())

    if args.pretrained_model:
        assert os.path.exists(
            args.pretrained_model
        ), "Pretrained model path {} doesn't exist".format(
            args.pretrained_model)

        def if_exist(var):
            """Select only variables that have a file in the pretrained dir."""
            return os.path.exists(
                os.path.join(args.pretrained_model, var.name))

        _logger.info("Load pretrained model from {}".format(
            args.pretrained_model))
        # NOTE: We are using fluid.io.load_vars() because the pretrained model is from an older version which requires this API.
        # Please consider using paddle.static.load(program, model_path) when possible
        paddle.fluid.io.load_vars(
            exe, args.pretrained_model, predicate=if_exist)

    def test(epoch, program):
        """Log the pruner statistic, then evaluate ``program`` on the val set."""
        acc_top1_ns = []
        acc_top5_ns = []

        _logger.info(
            "The current density of the inference model is {}%".format(
                round(
                    100 * UnstructuredPruner.total_sparse(
                        paddle.static.default_main_program()), 2)))
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}"
                    .format(epoch, batch_id, np.mean(acc_top1_n),
                            np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info(
            "Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
                epoch, np.mean(np.array(acc_top1_ns)),
                np.mean(np.array(acc_top5_ns))))

    def train(epoch, program):
        """Run one training epoch of ``program``, stepping pruner and LR."""
        # Timing / sample counters are reset every time a log line is written.
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            # Run the program handed in by the caller rather than reaching
            # for the enclosing scope's `train_program`.
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            pruner.step()
            train_run_cost += time.time() - train_start
            total_samples += args.batch_size
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec"
                    .format(
                        epoch, batch_id, learning_rate.get_lr(), loss_n,
                        acc_top1_n, acc_top5_n,
                        train_reader_cost / args.log_period,
                        (train_reader_cost + train_run_cost) /
                        args.log_period, total_samples / args.log_period,
                        total_samples /
                        (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            learning_rate.step()
            reader_start = time.time()

    build_strategy = paddle.static.BuildStrategy()
    exec_strategy = paddle.static.ExecutionStrategy()

    train_program = paddle.static.CompiledProgram(
        paddle.static.default_main_program()).with_data_parallel(
            loss_name=avg_cost.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

    for i in range(args.resume_epoch + 1, args.num_epochs):
        train(i, train_program)
        _logger.info("The current density of the pruned model is: {}%".format(
            round(
                100 * UnstructuredPruner.total_sparse(
                    paddle.static.default_main_program()), 2)))

        if (i + 1) % args.test_period == 0:
            pruner.update_params()
            test(i, val_program)
        if (i + 1) % args.model_period == 0:
            pruner.update_params()
            # NOTE: We are using fluid.io.save_params() because the pretrained model is from an older version which requires this API.
            # Please consider using paddle.static.save(program, model_path) as long as it becomes possible.
            paddle.fluid.io.save_params(executor=exe, dirname=args.model_path)
def compress(args):
    """Train a student classifier with soft-label distillation from a teacher.

    A student program and a teacher program are built separately, the teacher
    weights are loaded from ``args.teacher_pretrained_model`` (an archive is
    downloaded first when that path is missing), and the teacher is merged
    into the student program. The training loss is the student's
    cross-entropy plus ``soft_label_loss`` between the two named output
    variables. Each epoch runs a training pass, a validation pass and,
    when ``args.save_inference`` is set, saves an inference model.

    Args:
        args: parsed options. Attributes read directly in this function:
            ``data``, ``model``, ``teacher_model``, ``use_gpu``,
            ``batch_size``, ``teacher_pretrained_model``, ``num_epochs``,
            ``log_period`` and ``save_inference``. ``args`` is also forwarded
            to ``create_optimizer``.

    Raises:
        ValueError: if ``args.data`` is neither ``"cifar10"`` nor
            ``"imagenet"``.
        AssertionError: if ``args.model`` is not in ``model_list`` or the
            teacher weights cannot be found.
    """
    dataset_name = args.data
    if dataset_name == "cifar10":
        train_dataset = paddle.vision.datasets.Cifar10(mode='train')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test')
        class_dim = 10
        image_shape = "3,32,32"
    elif dataset_name == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))

    image_shape = list(map(int, image_shape.split(",")))
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    student_program = paddle.static.Program()
    s_startup = paddle.static.Program()

    if args.use_gpu:
        places = paddle.static.cuda_places()
    else:
        places = paddle.static.cpu_places()
    place = places[0]

    # ---- student graph ------------------------------------------------------
    with paddle.static.program_guard(student_program, s_startup):
        with paddle.fluid.unique_name.guard():
            image = paddle.static.data(
                name='image', shape=[None] + image_shape, dtype='float32')
            label = paddle.static.data(
                name='label', shape=[None, 1], dtype='int64')
            feed_vars = [image, label]
            train_loader = paddle.io.DataLoader(
                train_dataset,
                places=places,
                feed_list=feed_vars,
                drop_last=True,
                batch_size=args.batch_size,
                return_list=False,
                shuffle=True,
                use_shared_memory=True,
                num_workers=4)
            valid_loader = paddle.io.DataLoader(
                val_dataset,
                places=place,
                feed_list=feed_vars,
                drop_last=False,
                return_list=False,
                use_shared_memory=True,
                batch_size=args.batch_size,
                shuffle=False)
            # model definition
            model = models.__dict__[args.model]()
            out = model.net(input=image, class_dim=class_dim)
            cost = paddle.nn.functional.loss.cross_entropy(
                input=out, label=label)
            avg_cost = paddle.mean(x=cost)
            acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
            acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    # Cloned before the teacher is merged in and before the optimizer exists.
    val_program = student_program.clone(for_test=True)
    exe = paddle.static.Executor(place)

    # ---- teacher graph ------------------------------------------------------
    teacher_model = models.__dict__[args.teacher_model]()
    # define teacher program
    teacher_program = paddle.static.Program()
    t_startup = paddle.static.Program()
    with paddle.static.program_guard(teacher_program, t_startup):
        with paddle.fluid.unique_name.guard():
            image = paddle.static.data(
                name='image', shape=[None] + image_shape, dtype='float32')
            predict = teacher_model.net(image, class_dim=class_dim)

    exe.run(t_startup)

    teacher_weights = args.teacher_pretrained_model
    if not os.path.exists(teacher_weights):
        _download(
            'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar',
            '.')
        _decompress('./ResNet50_vd_pretrained.tar')
    assert args.teacher_pretrained_model and os.path.exists(
        args.teacher_pretrained_model
    ), "teacher_pretrained_model should be set when teacher_model is not None."

    # NOTE(review): this predicate is defined but not referenced anywhere
    # else in this function; the load below does not take it.
    def if_exist(var):
        is_fc_param = var.name == 'fc_0.w_0' or var.name == 'fc_0.b_0'
        exist = os.path.exists(
            os.path.join(args.teacher_pretrained_model, var.name))
        if args.data == "cifar10" and is_fc_param:
            exist = False
        return exist

    paddle.static.load(teacher_program, args.teacher_pretrained_model, exe)

    # ---- merge teacher into student and attach the distillation loss ---------
    data_name_map = {'image': 'image'}
    merge(teacher_program, student_program, data_name_map, place)

    with paddle.static.program_guard(student_program, s_startup):
        distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
                                       student_program)
        loss = avg_cost + distill_loss
        lr, opt = create_optimizer(args)
        opt.minimize(loss)
    exe.run(s_startup)

    build_strategy = paddle.static.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    compiled_student = paddle.static.CompiledProgram(student_program)
    parallel_main = compiled_student.with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

    train_fetches = [loss.name, avg_cost.name, distill_loss.name]
    valid_fetches = [avg_cost.name, acc_top1.name, acc_top5.name]

    for epoch_id in range(args.num_epochs):
        # training pass
        for step_id, data in enumerate(train_loader):
            total_loss_v, class_loss_v, distill_loss_v = exe.run(
                parallel_main, feed=data, fetch_list=train_fetches)
            if step_id % args.log_period == 0:
                _logger.info(
                    "train_epoch {} step {} lr {:.6f}, loss {:.6f}, class loss {:.6f}, distill loss {:.6f}".
                    format(epoch_id, step_id,
                           lr.get_lr(), total_loss_v[0], class_loss_v[0],
                           distill_loss_v[0]))
            lr.step()

        # validation pass
        val_acc1s = []
        val_acc5s = []
        for step_id, data in enumerate(valid_loader):
            val_loss, val_acc1, val_acc5 = exe.run(
                val_program, data, fetch_list=valid_fetches)
            val_acc1s.append(val_acc1)
            val_acc5s.append(val_acc5)
            if step_id % args.log_period == 0:
                _logger.info(
                    "valid_epoch {} step {} loss {:.6f}, top1 {:.6f}, top5 {:.6f}".
                    format(epoch_id, step_id, val_loss[0], val_acc1[0],
                           val_acc5[0]))

        if args.save_inference:
            export_dir = os.path.join("./saved_models", str(epoch_id))
            paddle.fluid.io.save_inference_model(export_dir, ["image"], [out],
                                                 exe, student_program)

        _logger.info("epoch {} top1 {:.6f}, top5 {:.6f}".format(
            epoch_id, np.mean(val_acc1s), np.mean(val_acc5s)))
def search_mobilenetv2_block(config, args, image_size):
    """Search MobileNetV2-style blocks with the SANAS controller.

    For every search step the next candidate architecture is requested from
    the controller and wrapped between a stem convolution and a classification
    head.  Candidates whose FLOPs exceed a hard-coded ceiling are skipped
    (no training, no reward reported).  The remaining ones are trained for
    ``args.retain_epoch`` epochs, evaluated on the validation set, and their
    mean top-1 accuracy is reported back to the controller as the reward.

    Args:
        config: Search-space configuration forwarded to ``SANAS``.
        args: Parsed options. Reads ``data``, ``use_gpu``, ``is_server``,
            ``server_address``, ``port``, ``search_steps``, ``batch_size``,
            ``class_dim`` and ``retain_epoch``.
        image_size: Height and width of the square, 3-channel input images.

    Raises:
        ValueError: If ``args.data`` is neither ``'cifar10'`` nor
            ``'imagenet'``.
    """
    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')
    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')
    else:
        # Fail early instead of hitting a NameError on the datasets below.
        raise ValueError("{} is not supported.".format(args.data))

    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]

    sa_nas = SANAS(config,
                   server_addr=(args.server_address, args.port),
                   search_steps=args.search_steps,
                   is_server=bool(args.is_server))

    # Hard-coded FLOPs ceiling for a candidate network.
    max_flops = 321208544

    for step in range(args.search_steps):
        archs = sa_nas.next_archs()[0]

        train_program = static.Program()
        startup_program = static.Program()
        with static.program_guard(train_program, startup_program):
            data = static.data(name='data',
                               shape=[None] + image_shape,
                               dtype='float32')
            label = static.data(name='label', shape=[None, 1], dtype='int64')
            if args.data == 'cifar10':
                paddle.assign(paddle.reshape(label, [-1, 1]), label)

            train_loader = paddle.io.DataLoader(train_dataset,
                                                places=places,
                                                feed_list=[data, label],
                                                drop_last=True,
                                                batch_size=args.batch_size,
                                                return_list=False,
                                                shuffle=True,
                                                use_shared_memory=True,
                                                num_workers=4)
            val_loader = paddle.io.DataLoader(val_dataset,
                                              places=place,
                                              feed_list=[data, label],
                                              drop_last=False,
                                              batch_size=args.batch_size,
                                              return_list=False,
                                              shuffle=False)

            # Stem conv -> searched blocks -> last conv -> pool -> fc.
            data = conv_bn_layer(input=data,
                                 num_filters=32,
                                 filter_size=3,
                                 stride=2,
                                 padding='SAME',
                                 act='relu6',
                                 name='mobilenetv2_conv1')
            data = archs(data)[0]
            data = conv_bn_layer(input=data,
                                 num_filters=1280,
                                 filter_size=1,
                                 stride=1,
                                 padding='SAME',
                                 act='relu6',
                                 name='mobilenetv2_last_conv')
            data = F.adaptive_avg_pool2d(data,
                                         output_size=[1, 1],
                                         name='mobilenetv2_last_pool')
            output = static.nn.fc(
                x=data,
                size=args.class_dim,
                weight_attr=ParamAttr(name='mobilenetv2_fc_weights'),
                bias_attr=ParamAttr(name='mobilenetv2_fc_offset'))

            softmax_out = F.softmax(output)
            # Feed raw logits to cross_entropy, as the other callers in this
            # file do: it applies softmax itself by default, so passing
            # `softmax_out` would apply softmax twice.
            cost = F.cross_entropy(output, label=label)
            avg_cost = paddle.mean(cost)
            acc_top1 = paddle.metric.accuracy(input=softmax_out,
                                              label=label,
                                              k=1)
            acc_top5 = paddle.metric.accuracy(input=softmax_out,
                                              label=label,
                                              k=5)
            # Clone before minimize() so the test program carries no
            # optimizer ops.
            test_program = train_program.clone(for_test=True)

            optimizer = paddle.optimizer.Momentum(
                learning_rate=0.1,
                momentum=0.9,
                weight_decay=paddle.regularizer.L2Decay(1e-4))
            optimizer.minimize(avg_cost)

        current_flops = flops(train_program)
        print('step: {}, current_flops: {}'.format(step, current_flops))
        if current_flops > max_flops:
            continue

        exe = static.Executor(place)
        exe.run(startup_program)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(loss_name=avg_cost.name,
                                              build_strategy=build_strategy)

        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                # time.time() is in seconds; the log line is labelled "ms".
                batch_time = (time.time() - s_time) * 1000
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
                        .format(step, epoch_id, batch_id, outs[0],
                                batch_time))

        reward = []
        test_fetches = [avg_cost.name, acc_top1.name, acc_top5.name]
        for batch_id, data in enumerate(val_loader()):
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            reward.append(np.mean(np.array(batch_reward), axis=1))

            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))

        # Rows are batches, columns are (avg_cost, acc_top1, acc_top5).
        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        sa_nas.reward(float(finally_reward[1]))
def compress(args):
    """Run a pruning-sensitivity analysis on a static-graph classifier.

    Builds the network named by ``args.model`` in the default main program,
    optionally loads pretrained variables, and calls ``sensitivity`` twice
    with two disjoint sets of pruning ratios (passing
    ``sensitivities_0.data`` and ``sensitivities_1.data`` as the
    ``sensitivities_file``).  Both files are then merged with
    ``merge_sensitive``; the merged sensitivities and the ratios returned by
    ``get_ratios_by_loss(sens, 0.01)`` are printed.

    Args:
        args: Parsed options. Reads ``data``, ``model``, ``use_gpu``,
            ``pretrained_model``, ``batch_size`` and ``log_period``.

    Raises:
        ValueError: If ``args.data`` is neither ``"mnist"`` nor
            ``"imagenet"``.
    """
    if args.data == "mnist":
        val_dataset = paddle.vision.datasets.MNIST(mode='test')
        class_dim = 10
        image_shape = [1, 28, 28]
    elif args.data == "imagenet":
        import imagenet_reader as reader
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = [3, 224, 224]
    else:
        raise ValueError("{} is not supported.".format(args.data))

    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    image = paddle.static.data(name='image',
                               shape=[None] + image_shape,
                               dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)
    val_program = paddle.static.default_main_program().clone(for_test=True)

    places = (paddle.static.cuda_places()
              if args.use_gpu else paddle.static.cpu_places())
    place = places[0]
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    if args.pretrained_model:

        def if_exist(var):
            # Only load variables that have a file in the pretrained dir.
            return os.path.exists(
                os.path.join(args.pretrained_model, var.name))

        paddle.fluid.io.load_vars(exe,
                                  args.pretrained_model,
                                  predicate=if_exist)

    valid_loader = paddle.io.DataLoader(val_dataset,
                                        places=place,
                                        feed_list=[image, label],
                                        drop_last=False,
                                        batch_size=args.batch_size,
                                        use_shared_memory=True,
                                        shuffle=False)

    def test(program):
        """Evaluate ``program`` on the validation set; return mean top-1."""
        acc_top1_ns = []
        acc_top5_ns = []
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
                    format(batch_id, np.mean(acc_top1_n),
                           np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info("Final eval - acc_top1: {}; acc_top5: {}".format(
            np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))
        return np.mean(np.array(acc_top1_ns))

    # Analyse every parameter whose name contains "weights".
    params = []
    for param in paddle.static.default_main_program().global_block(
    ).all_parameters():
        if "weights" in param.name:
            print(param.name)
            params.append(param.name)

    # The ratio range is split across two calls, each with its own file.
    sensitivity(val_program,
                place,
                params,
                test,
                sensitivities_file="sensitivities_0.data",
                pruned_ratios=[0.1, 0.2, 0.3, 0.4])

    sensitivity(val_program,
                place,
                params,
                test,
                sensitivities_file="sensitivities_1.data",
                pruned_ratios=[0.5, 0.6, 0.7])

    sens = merge_sensitive(
        ["./sensitivities_0.data", "./sensitivities_1.data"])

    ratios = get_ratios_by_loss(sens, 0.01)

    print(sens)
    # Previously computed and silently dropped; surface the result as well.
    print(ratios)
def search_mobilenetv2(config, args, image_size, is_server=True):
    """Search a MobileNetV2 architecture with the RLNAS (LSTM) controller.

    For every search step one candidate architecture is requested from the
    controller, train and test programs are built for it with
    ``build_program``, the candidate is trained for ``args.retain_epoch``
    epochs and evaluated, and its mean top-1 accuracy is reported back to the
    controller as the reward.

    Args:
        config: Search-space configuration forwarded to ``RLNAS``.
        args: Parsed options. Reads ``data``, ``use_gpu``,
            ``server_address``, ``port``, ``search_steps`` and
            ``retain_epoch`` (and is forwarded to ``build_program``).
        image_size: Height and width of the square, 3-channel input images.
        is_server: When true the ``RLNAS`` object is created without an
            explicit ``is_server`` argument; otherwise ``is_server=False``
            is passed so that only a client is started.

    Raises:
        ValueError: If ``args.data`` is neither ``'cifar10'`` nor
            ``'imagenet'``.
    """
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]

    # Both roles share every controller setting; they differ only in whether
    # `is_server=False` is passed.
    rl_nas_kwargs = dict(key='lstm',
                         configs=config,
                         is_sync=False,
                         server_addr=(args.server_address, args.port),
                         controller_batch_size=1,
                         controller_decay_steps=1000,
                         controller_decay_rate=0.8,
                         lstm_num_layers=1,
                         hidden_size=10,
                         temperature=1.0)
    if not is_server:
        ### start a client only
        rl_nas_kwargs['is_server'] = False
    rl_nas = RLNAS(**rl_nas_kwargs)

    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')
    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')
    else:
        # Fail early instead of hitting a NameError on the datasets below.
        raise ValueError("{} is not supported.".format(args.data))

    for step in range(args.search_steps):
        archs = rl_nas.next_archs(1)[0][0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        train_loader, avg_cost, acc_top1, acc_top5 = build_program(
            train_program, startup_program, image_shape, train_dataset,
            archs, args, places)

        test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
            test_program,
            startup_program,
            image_shape,
            val_dataset,
            archs,
            args,
            place,
            is_test=True)
        test_program = test_program.clone(for_test=True)

        exe = static.Executor(place)
        exe.run(startup_program)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(loss_name=avg_cost.name,
                                              build_strategy=build_strategy)

        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                # time.time() is in seconds; the log line is labelled "ms".
                batch_time = (time.time() - s_time) * 1000
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
                        .format(step, epoch_id, batch_id, outs[0],
                                batch_time))

        reward = []
        test_fetches = [
            test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
        ]
        for batch_id, data in enumerate(test_loader()):
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            reward.append(np.mean(np.array(batch_reward), axis=1))

            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))

        # Rows are batches, columns are (avg_cost, acc_top1, acc_top5).
        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        rl_nas.reward(np.float32(finally_reward[1]))
def test_search_result(tokens, image_size, args, config):
    """Train and evaluate the architecture described by ``tokens``.

    The tokens are converted to an architecture with ``SANAS.tokens2arch``,
    train and test programs are built for it with ``build_program``, the
    network is trained for ``args.retain_epoch`` epochs and then evaluated on
    the validation set.  The FLOPs of the train program and the averaged
    evaluation metrics are printed / logged; nothing is returned.

    Args:
        tokens: Architecture tokens passed to ``SANAS.tokens2arch``.
        image_size: Height and width of the square, 3-channel input images.
        args: Parsed options. Reads ``data``, ``use_gpu``,
            ``server_address``, ``port``, ``search_steps`` and
            ``retain_epoch`` (and is forwarded to ``build_program``).
        config: Search-space configuration forwarded to ``SANAS``.

    Raises:
        ValueError: If ``args.data`` is neither ``'cifar10'`` nor
            ``'imagenet'``.
    """
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]

    sa_nas = SANAS(config,
                   server_addr=(args.server_address, args.port),
                   search_steps=args.search_steps,
                   is_server=True)

    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')
    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')
    else:
        # Fail early instead of hitting a NameError on the datasets below.
        raise ValueError("{} is not supported.".format(args.data))

    archs = sa_nas.tokens2arch(tokens)[0]

    train_program = static.Program()
    test_program = static.Program()
    startup_program = static.Program()
    train_loader, avg_cost, acc_top1, acc_top5 = build_program(
        train_program, startup_program, image_shape, train_dataset, archs,
        args, places)

    current_flops = flops(train_program)
    print('current_flops: {}'.format(current_flops))

    test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
        test_program,
        startup_program,
        image_shape,
        val_dataset,
        archs,
        args,
        place,
        is_test=True)
    test_program = test_program.clone(for_test=True)

    exe = static.Executor(place)
    exe.run(startup_program)

    build_strategy = static.BuildStrategy()
    train_compiled_program = static.CompiledProgram(
        train_program).with_data_parallel(loss_name=avg_cost.name,
                                          build_strategy=build_strategy)

    for epoch_id in range(args.retain_epoch):
        for batch_id, data in enumerate(train_loader()):
            fetches = [avg_cost.name]
            s_time = time.time()
            outs = exe.run(train_compiled_program,
                           feed=data,
                           fetch_list=fetches)[0]
            # time.time() is in seconds; the log line is labelled "ms".
            batch_time = (time.time() - s_time) * 1000
            if batch_id % 10 == 0:
                _logger.info(
                    'TRAIN: epoch: {}, batch: {}, cost: {}, batch_time: {}ms'.
                    format(epoch_id, batch_id, outs[0], batch_time))

    reward = []
    test_fetches = [
        test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
    ]
    for batch_id, data in enumerate(test_loader()):
        batch_reward = exe.run(test_program,
                               feed=data,
                               fetch_list=test_fetches)
        reward.append(np.mean(np.array(batch_reward), axis=1))

        _logger.info(
            'TEST: batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'.
            format(batch_id, batch_reward[0], batch_reward[1],
                   batch_reward[2]))

    # Rows are batches, columns are (avg_cost, acc_top1, acc_top5).
    finally_reward = np.mean(np.array(reward), axis=0)
    _logger.info(
        'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
            finally_reward[0], finally_reward[1], finally_reward[2]))
def compress(args):
    """Train MobileNetV1 with unstructured pruning in dygraph mode.

    Builds the data loaders, model, optimizer and an unstructured pruner,
    optionally resumes from a checkpoint or loads pretrained weights, then
    trains from epoch ``args.last_epoch + 1`` up to ``args.num_epochs``.
    The pruner is stepped after every optimizer step; the model is evaluated
    every ``args.test_period`` epochs and saved (``model.pdparams`` /
    ``model.pdopt`` under ``args.model_path``) every ``args.model_period``
    epochs.

    Args:
        args: Parsed options. Reads ``ce_test``, ``use_gpu``, ``data``,
            ``batch_size``, ``batch_size_for_validation``, ``num_workers``,
            ``checkpoint``, ``last_epoch``, ``pretrained_model``,
            ``pruning_strategy``, ``stable_epochs``, ``pruning_epochs``,
            ``tunning_epochs``, ``pruning_steps``, ``initial_ratio``,
            ``log_period``, ``num_epochs``, ``test_period``,
            ``model_period`` and ``model_path``.  ``num_workers``,
            ``checkpoint`` and ``pretrained_model`` may be rewritten in
            place.

    Raises:
        ValueError: If ``args.data`` is neither ``"imagenet"`` nor
            ``"cifar10"``.
    """

    def _strip_state_suffix(path):
        # Accept "prefix", "prefix.pdparams" or "prefix.pdopt" and return
        # the bare prefix.
        root, ext = os.path.splitext(path)
        return root if ext in ('.pdparams', '.pdopt') else path

    shuffle = True
    if args.ce_test:
        # set seed
        seed = 111
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        args.num_workers = 0
        shuffle = False

    place = paddle.set_device('gpu' if args.use_gpu else 'cpu')

    if paddle.distributed.get_world_size() != 1:
        dist.init_parallel_env()

    if args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
    elif args.data == "cifar10":
        normalize = T.Normalize(mean=[0.5, 0.5, 0.5],
                                std=[0.5, 0.5, 0.5],
                                data_format='CHW')
        transform = T.Compose([T.Transpose(), normalize])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       backend='cv2',
                                                       transform=transform)
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     backend='cv2',
                                                     transform=transform)
        class_dim = 10
    else:
        raise ValueError("{} is not supported.".format(args.data))

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=shuffle,
        drop_last=True)

    train_loader = paddle.io.DataLoader(train_dataset,
                                        places=place,
                                        batch_sampler=batch_sampler,
                                        return_list=True,
                                        num_workers=args.num_workers,
                                        use_shared_memory=True)

    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        drop_last=False,
        return_list=True,
        batch_size=args.batch_size_for_validation,
        shuffle=False,
        use_shared_memory=True)

    step_per_epoch = int(
        np.ceil(len(train_dataset) / args.batch_size / ParallelEnv().nranks))

    # model definition
    model = mobilenet_v1(num_classes=class_dim, pretrained=True)
    if ParallelEnv().nranks > 1:
        model = paddle.DataParallel(model)

    opt, learning_rate = create_optimizer(args, step_per_epoch, model)

    if args.checkpoint is not None and args.last_epoch > -1:
        # Resume both the weights and the optimizer state.
        args.checkpoint = _strip_state_suffix(args.checkpoint)
        model.set_state_dict(paddle.load(args.checkpoint + ".pdparams"))
        opt.set_state_dict(paddle.load(args.checkpoint + ".pdopt"))
    elif args.pretrained_model is not None:
        # Only the weights are loaded; the optimizer starts fresh.
        args.pretrained_model = _strip_state_suffix(args.pretrained_model)
        model.set_state_dict(paddle.load(args.pretrained_model + ".pdparams"))

    if args.pruning_strategy == 'gmp':
        # GMP pruner step 0: define configs. No need to do this if you are
        # not using 'gmp'
        configs = {
            'stable_iterations': args.stable_epochs * step_per_epoch,
            'pruning_iterations': args.pruning_epochs * step_per_epoch,
            'tunning_iterations': args.tunning_epochs * step_per_epoch,
            'resume_iteration': (args.last_epoch + 1) * step_per_epoch,
            'pruning_steps': args.pruning_steps,
            'initial_ratio': args.initial_ratio,
        }
    else:
        configs = None

    # GMP pruner step 1: initialize a pruner object
    pruner = create_unstructured_pruner(model, args, configs=configs)

    @paddle.no_grad()
    def test(epoch):
        """Log top-1 / top-5 accuracy of ``model`` on the validation set."""
        model.eval()
        acc_top1_ns = []
        acc_top5_ns = []
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            x_data = data[0]
            y_data = paddle.to_tensor(data[1])
            if args.data == 'cifar10':
                y_data = paddle.unsqueeze(y_data, 1)

            logits = model(x_data)
            acc_top1 = paddle.metric.accuracy(logits, y_data, k=1)
            acc_top5 = paddle.metric.accuracy(logits, y_data, k=5)
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}"
                    .format(epoch, batch_id, np.mean(acc_top1.numpy()),
                            np.mean(acc_top5.numpy()),
                            end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1.numpy()))
            acc_top5_ns.append(np.mean(acc_top5.numpy()))

        _logger.info(
            "Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
                epoch, np.mean(np.array(acc_top1_ns, dtype="object")),
                np.mean(np.array(acc_top5_ns, dtype="object"))))

    def train(epoch):
        """Run one training epoch, stepping the pruner after every batch."""
        model.train()
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            x_data = data[0]
            y_data = paddle.to_tensor(data[1])
            if args.data == 'cifar10':
                y_data = paddle.unsqueeze(y_data, 1)

            train_start = time.time()
            logits = model(x_data)
            loss = F.cross_entropy(logits, y_data)
            acc_top1 = paddle.metric.accuracy(logits, y_data, k=1)
            acc_top5 = paddle.metric.accuracy(logits, y_data, k=5)

            loss.backward()
            opt.step()
            learning_rate.step()
            opt.clear_grad()
            # GMP pruner step 2: step() to update ratios and other internal
            # states of the pruner.
            pruner.step()

            train_run_cost += time.time() - train_start
            total_samples += args.batch_size

            if batch_id % args.log_period == 0:
                # Only one batch has been accumulated at the first log line,
                # so average over 1 instead of a full log period.
                log_period = 1 if batch_id == 0 else args.log_period
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec"
                    .format(
                        epoch, batch_id, opt.get_lr(),
                        np.mean(loss.numpy()), np.mean(acc_top1.numpy()),
                        np.mean(acc_top5.numpy()),
                        train_reader_cost / log_period,
                        (train_reader_cost + train_run_cost) / log_period,
                        total_samples / log_period,
                        total_samples /
                        (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()

    for i in range(args.last_epoch + 1, args.num_epochs):
        train(i)
        # GMP pruner step 3: update params before summarizing sparsity,
        # saving model or evaluation.
        pruner.update_params()

        if (i + 1) % args.test_period == 0:
            _logger.info(
                "The current sparsity of the pruned model is: {}%".format(
                    round(100 * UnstructuredPruner.total_sparse(model), 2)))
            test(i)

        if (i + 1) % args.model_period == 0:
            pruner.update_params()
            paddle.save(model.state_dict(),
                        os.path.join(args.model_path, "model.pdparams"))
            paddle.save(opt.state_dict(),
                        os.path.join(args.model_path, "model.pdopt"))
def compress(args):
    """Run quantization-aware training (QAT) in dygraph mode and export.

    Builds the dataset and the network selected by ``args.data`` /
    ``args.model``, wraps the network with ``QAT`` fake-quant layers, trains
    for ``args.num_epochs`` epochs while evaluating after every epoch, saves
    per-epoch and best checkpoints under ``args.model_save_dir`` (rank 0
    only), and finally exports the quantized inference model to
    ``<model_save_dir>/inference_model/qat_model``.

    Args:
        args: Parsed options. Reads ``data``, ``use_gpu``, ``model``,
            ``pretrained_model``, ``use_pact``, ``batch_size``,
            ``log_period``, ``ls_epsilon``, ``num_epochs`` and
            ``model_save_dir``.  ``total_images`` is set to 50000 for
            cifar10.

    Raises:
        ValueError: If ``args.data`` or ``args.model`` is not supported.
    """
    if args.data == "cifar10":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode="train",
                                                       backend="cv2",
                                                       transform=transform)
        val_dataset = paddle.vision.datasets.Cifar10(mode="test",
                                                     backend="cv2",
                                                     transform=transform)
        class_dim = 10
        image_shape = [3, 32, 32]
        # NOTE(review): presumably consumed by create_optimizer -- confirm.
        args.total_images = 50000
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = [3, 224, 224]
    else:
        raise ValueError("{} is not supported.".format(args.data))

    trainer_num = paddle.distributed.get_world_size()
    use_data_parallel = trainer_num != 1

    place = paddle.set_device('gpu' if args.use_gpu else 'cpu')
    # model definition
    if use_data_parallel:
        paddle.distributed.init_parallel_env()

    # Pretrained weights are only used for imagenet.
    pretrain = args.data == "imagenet"
    if args.model == "mobilenet_v1":
        net = mobilenet_v1(pretrained=pretrain, num_classes=class_dim)
    elif args.model == "mobilenet_v3":
        net = MobileNetV3_large_x1_0(class_dim=class_dim)
        if pretrain:
            load_dygraph_pretrain(net, args.pretrained_model, True)
    else:
        raise ValueError("{} is not supported.".format(args.model))

    # Summaries and the export spec use the dataset's real input size
    # rather than a hard-coded 224x224.
    summary_shape = (1, *image_shape)

    _logger.info("Origin model summary:")
    paddle.summary(net, summary_shape)

    ##########################################################################
    # 1. quantization configs
    ##########################################################################
    quant_config = {
        # weight preprocess type, default is None and no preprocessing is
        # performed.
        'weight_preprocess_type': None,
        # activation preprocess type, default is None and no preprocessing
        # is performed.
        'activation_preprocess_type': None,
        # weight quantize type, default is 'channel_wise_abs_max'
        'weight_quantize_type': 'channel_wise_abs_max',
        # activation quantize type, default is 'moving_average_abs_max'
        'activation_quantize_type': 'moving_average_abs_max',
        # weight quantize bit num, default is 8
        'weight_bits': 8,
        # activation quantize bit num, default is 8
        'activation_bits': 8,
        # data type after quantization, such as 'uint8', 'int8', etc.
        # default is 'int8'
        'dtype': 'int8',
        # window size for 'range_abs_max' quantization. default is 10000
        'window_size': 10000,
        # The decay coefficient of moving average, default is 0.9
        'moving_rate': 0.9,
        # for dygraph quantization, layers of type in quantizable_layer_type
        # will be quantized
        'quantizable_layer_type': ['Conv2D', 'Linear'],
    }

    if args.use_pact:
        quant_config['activation_preprocess_type'] = 'PACT'

    ##########################################################################
    # 2. Quantize the model with QAT (quant aware training)
    ##########################################################################
    quanter = QAT(config=quant_config)
    quanter.quantize(net)

    _logger.info("QAT model summary:")
    paddle.summary(net, summary_shape)

    # The optimizer is created on the unwrapped network, before DataParallel.
    opt, lr = create_optimizer(net, trainer_num, args)

    if use_data_parallel:
        net = paddle.DataParallel(net)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(train_dataset,
                                        batch_sampler=train_batch_sampler,
                                        places=place,
                                        return_list=True,
                                        num_workers=4)

    valid_loader = paddle.io.DataLoader(val_dataset,
                                        places=place,
                                        batch_size=args.batch_size,
                                        shuffle=False,
                                        drop_last=False,
                                        return_list=True,
                                        num_workers=4)

    @paddle.no_grad()
    def test(epoch, net):
        """Evaluate ``net`` on the validation set; return mean top-1."""
        net.eval()
        acc_top1_ns = []
        acc_top5_ns = []

        eval_reader_cost = 0.0
        eval_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(valid_loader()):
            eval_reader_cost += time.time() - reader_start
            image = data[0]
            label = data[1]
            if args.data == "cifar10":
                label = paddle.reshape(label, [-1, 1])

            eval_start = time.time()

            out = net(image)
            acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
            acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

            eval_run_cost += time.time() - eval_start
            total_samples += image.shape[0]

            if batch_id % args.log_period == 0:
                # Only one batch has been accumulated at the first log line.
                log_period = 1 if batch_id == 0 else args.log_period
                _logger.info(
                    "Eval epoch[{}] batch[{}] - top1: {:.6f}; top5: {:.6f}; avg_reader_cost: {:.6f} s, avg_batch_cost: {:.6f} s, avg_samples: {}, avg_ips: {:.3f} images/s"
                    .format(epoch, batch_id, np.mean(acc_top1.numpy()),
                            np.mean(acc_top5.numpy()),
                            eval_reader_cost / log_period,
                            (eval_reader_cost + eval_run_cost) / log_period,
                            total_samples / log_period,
                            total_samples /
                            (eval_reader_cost + eval_run_cost)))
                eval_reader_cost = 0.0
                eval_run_cost = 0.0
                total_samples = 0
            acc_top1_ns.append(np.mean(acc_top1.numpy()))
            acc_top5_ns.append(np.mean(acc_top5.numpy()))
            reader_start = time.time()

        _logger.info(
            "Final eval epoch[{}] - acc_top1: {:.6f}; acc_top5: {:.6f}".format(
                epoch, np.mean(np.array(acc_top1_ns)),
                np.mean(np.array(acc_top5_ns))))
        return np.mean(np.array(acc_top1_ns))

    def cross_entropy(input, target, ls_epsilon):
        """Mean cross-entropy, with label smoothing when ``ls_epsilon > 0``."""
        if ls_epsilon > 0:
            if target.shape[-1] != class_dim:
                target = paddle.nn.functional.one_hot(target, class_dim)
            target = paddle.nn.functional.label_smooth(target,
                                                       epsilon=ls_epsilon)
            target = paddle.reshape(target, shape=[-1, class_dim])
            input = -paddle.nn.functional.log_softmax(input, axis=-1)
            cost = paddle.sum(target * input, axis=-1)
        else:
            cost = paddle.nn.functional.cross_entropy(input=input,
                                                      label=target)
        return paddle.mean(cost)

    def train(epoch, net):
        """Run one QAT training epoch over ``train_loader``."""
        net.train()

        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(train_loader()):
            train_reader_cost += time.time() - reader_start

            image = data[0]
            label = data[1]
            if args.data == "cifar10":
                label = paddle.reshape(label, [-1, 1])

            train_start = time.time()
            out = net(image)
            avg_cost = cross_entropy(out, label, args.ls_epsilon)

            acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
            acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)
            avg_cost.backward()
            opt.step()
            opt.clear_grad()
            lr.step()

            loss_n = np.mean(avg_cost.numpy())
            acc_top1_n = np.mean(acc_top1.numpy())
            acc_top5_n = np.mean(acc_top5.numpy())

            train_run_cost += time.time() - train_start
            total_samples += image.shape[0]

            if batch_id % args.log_period == 0:
                # Only one batch has been accumulated at the first log line.
                log_period = 1 if batch_id == 0 else args.log_period
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {:.6f}; top1: {:.6f}; top5: {:.6f}; avg_reader_cost: {:.6f} s, avg_batch_cost: {:.6f} s, avg_samples: {}, avg_ips: {:.3f} images/s"
                    .format(
                        epoch, batch_id, lr.get_lr(), loss_n, acc_top1_n,
                        acc_top5_n, train_reader_cost / log_period,
                        (train_reader_cost + train_run_cost) / log_period,
                        total_samples / log_period,
                        total_samples /
                        (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()

    ##########################################################################
    # train loop
    ##########################################################################
    is_rank0 = paddle.distributed.get_rank() == 0
    best_acc1 = 0.0
    best_epoch = -1  # -1: no epoch has beaten the initial accuracy yet
    for i in range(args.num_epochs):
        train(i, net)
        acc1 = test(i, net)
        if is_rank0:
            model_prefix = os.path.join(args.model_save_dir,
                                        "epoch_" + str(i))
            paddle.save(net.state_dict(), model_prefix + ".pdparams")
            paddle.save(opt.state_dict(), model_prefix + ".pdopt")

        if acc1 > best_acc1:
            best_acc1 = acc1
            best_epoch = i
            if is_rank0:
                model_prefix = os.path.join(args.model_save_dir,
                                            "best_model")
                paddle.save(net.state_dict(), model_prefix + ".pdparams")
                paddle.save(opt.state_dict(), model_prefix + ".pdopt")

    ##########################################################################
    # 3. Save quant aware model
    ##########################################################################
    if is_rank0:
        if best_epoch >= 0:
            _logger.info("Best top1 {:.6f} at epoch {}".format(
                best_acc1, best_epoch))
            # load best model
            load_dygraph_pretrain(
                net, os.path.join(args.model_save_dir, "best_model"))
        else:
            # No "best_model" checkpoint was written during this run, so
            # export the current weights instead of failing on the load.
            _logger.info(
                "No best checkpoint saved; exporting current weights.")

        path = os.path.join(args.model_save_dir, "inference_model",
                            'qat_model')
        quanter.save_quantized_model(net,
                                     path,
                                     input_spec=[
                                         paddle.static.InputSpec(
                                             shape=[None] + image_shape,
                                             dtype='float32')
                                     ])