def restore(self, model, opt=None):
    """Restore pruned parameters (and their optimizer accumulators) from
    the backup buffers registered on each sub-layer, then drop the backups.

    Args:
        model: paddle Layer whose parameters are restored in place.
        opt: optimizer whose accumulators are restored; skipped when None.
    """

    def _resolve_place(tensor):
        # Pick the paddle place matching where `tensor` currently lives.
        raw = tensor._place()
        if raw.is_cpu_place():
            return paddle.CPUPlace()
        if raw.is_cuda_pinned_place():
            return paddle.CUDAPinnedPlace()
        generic = core.Place()
        generic.set_place(tensor._place())
        return paddle.CUDAPlace(generic.gpu_device_id())

    for _, sub_layer in model.named_sublayers(include_self=True):
        for param in sub_layer.parameters(include_sublayers=False):
            # restore optimizer accumulators from layer buffer
            self._restore_opt(param.name, sub_layer, opt)
            backup_name = "_".join([param.name.replace(".", "_"), "backup"])
            if backup_name not in sub_layer._buffers:
                continue
            _logger.debug("Restore values of variable: {}".format(param.name))
            live_tensor = param.value().get_tensor()
            saved_tensor = sub_layer._buffers[backup_name].value().get_tensor()
            live_tensor.set(
                np.array(saved_tensor).astype("float32"),
                _resolve_place(live_tensor))
            # Undo any conv-groups change recorded by the pruner.
            if "_origin_groups" in sub_layer.__dict__:
                sub_layer._groups = sub_layer._origin_groups
            del sub_layer._buffers[backup_name]
def _prune_opt(self, param_name, dims, bool_mask, opt):
    """Prune the optimizer accumulators (e.g. momentum) that belong to the
    parameter named `param_name`, applying `bool_mask` along axis `dims`.

    Args:
        param_name (str): name of the parameter whose accumulators are pruned.
        dims (int): axis on which the mask is applied.
        bool_mask: boolean mask selecting the entries to keep.
        opt: optimizer holding the accumulators; no-op when None.
    """
    if opt is None:
        return
    for k, v in opt._accumulators.items():
        var_tmp = v.get(param_name)
        # NOTE: var_tmp.shape == [1] is used to skip variables like
        # beta1_pow_acc in Adam optimizer. Its shape is [1] and there's no
        # need to prune this one-value variable.
        if var_tmp is None or var_tmp.shape == [1]:
            if var_tmp is not None:
                # Fixed: this was a stray debug `print` to stdout; route the
                # message through the module logger like the rest of the file.
                _logger.debug("Skip pruning accumulator {} with shape {}".
                              format(var_tmp.name, var_tmp.shape))
            continue
        t_value = var_tmp.value().get_tensor()
        value = np.array(t_value).astype("float32")
        pruned_value = np.apply_along_axis(lambda data: data[bool_mask],
                                           dims, value)
        p = t_value._place()
        if p.is_cpu_place():
            place = paddle.CPUPlace()
        elif p.is_cuda_pinned_place():
            place = paddle.CUDAPinnedPlace()
        else:
            p = core.Place()
            p.set_place(t_value._place())
            place = paddle.CUDAPlace(p.gpu_device_id())
        t_value.set(pruned_value, place)
def lazy_apply(self, model):
    """Multiply each masked parameter by its expanded mask, first saving an
    untouched copy of the parameter into the owning layer's buffers so it
    can be restored later."""
    for _, sub_layer in model.named_sublayers():
        for param in sub_layer.parameters(include_sublayers=False):
            if param.name not in self._masks:
                continue
            for _mask in self._masks[param.name]:
                axis = _mask.dims
                mask = _mask.mask
                tensor = param.value().get_tensor()
                data = np.array(tensor).astype("float32")
                # The name of buffer can not contains "."
                backup_name = param.name.replace(".", "_") + "_backup"
                if backup_name not in sub_layer._buffers:
                    sub_layer.register_buffer(backup_name,
                                              paddle.to_tensor(data))
                    _logger.debug("Backup values of {} into buffers.".format(
                        param.name))
                # Reshape the 1-D mask so it broadcasts along `axis` only.
                broadcast_shape = [1] * len(data.shape)
                broadcast_shape[axis] = data.shape[axis]
                _logger.debug("Expanded mask shape: {}".format(
                    broadcast_shape))
                expanded = mask.reshape(broadcast_shape).astype("float32")
                raw_place = tensor._place()
                if raw_place.is_cpu_place():
                    device = paddle.CPUPlace()
                elif raw_place.is_cuda_pinned_place():
                    device = paddle.CUDAPinnedPlace()
                else:
                    holder = core.Place()
                    holder.set_place(tensor._place())
                    device = paddle.CUDAPlace(holder.gpu_device_id())
                tensor.set(data * expanded, device)
def restore(self, model):
    """Write the backed-up values back into every pruned parameter, delete
    the backup buffers, and reset a grouped Conv2D's `groups` from the
    restored output-channel count."""
    for _, sub_layer in model.named_sublayers():
        for param in sub_layer.parameters(include_sublayers=False):
            backup_name = "_".join([param.name.replace(".", "_"), "backup"])
            if backup_name not in sub_layer._buffers:
                continue
            _logger.debug("Restore values of variable: {}".format(param.name))
            target = param.value().get_tensor()
            saved = sub_layer._buffers[backup_name].value().get_tensor()
            raw = target._place()
            if raw.is_cpu_place():
                device = paddle.CPUPlace()
            elif raw.is_cuda_pinned_place():
                device = paddle.CUDAPinnedPlace()
            else:
                holder = core.Place()
                holder.set_place(target._place())
                device = paddle.CUDAPlace(holder.gpu_device_id())
            target.set(np.array(saved).astype("float32"), device)
            if isinstance(sub_layer,
                          paddle.nn.layer.conv.Conv2D) and sub_layer._groups > 1:
                _logger.debug("Update groups of conv form {} to {}".format(
                    sub_layer._groups, target.shape()[0]))
                sub_layer._groups = target.shape()[0]
            del sub_layer._buffers[backup_name]
def _restore_opt(self, param_name, sub_layer, opt):
    """Copy the optimizer accumulators belonging to `param_name` back from
    the backup buffers stored on `sub_layer`, then delete those buffers.
    No-op when `opt` is None."""
    if opt is None:
        return
    for accumulator in opt._accumulators.values():
        state = accumulator.get(param_name)
        if state is None:
            continue
        backup_name = state.name.replace(".", "_") + "_backup"
        if backup_name not in sub_layer._buffers:
            continue
        _logger.debug("Restore values of variable: {}".format(state.name))
        live = state.value().get_tensor()
        saved = sub_layer._buffers[backup_name].value().get_tensor()
        raw = live._place()
        if raw.is_cpu_place():
            device = paddle.CPUPlace()
        elif raw.is_cuda_pinned_place():
            device = paddle.CUDAPinnedPlace()
        else:
            holder = core.Place()
            holder.set_place(live._place())
            device = paddle.CUDAPlace(holder.gpu_device_id())
        live.set(np.array(saved).astype("float32"), device)
        del sub_layer._buffers[backup_name]
def imperative_apply(self, model, opt=None):
    """
    Pruning values of variable imperatively. It is valid when pruning
    on one dimension.

    Walks every sub-layer (including the model itself) and, for each
    parameter that has registered masks, either rewrites the layer's conv
    `groups` (grouped-conv case) or prunes the parameter tensor in place,
    backing up original values and optimizer accumulators first.

    Args:
        model: paddle Layer whose parameters are pruned in place.
        opt: optimizer whose accumulators are buffered and pruned alongside
            the parameters; skipped when None.
    """
    for name, sub_layer in model.named_sublayers(include_self=True):
        for param in sub_layer.parameters(include_sublayers=False):
            if param.name in self._masks:
                for _mask in self._masks[param.name]:
                    dims = _mask.dims
                    assert (isinstance(dims, int))
                    mask = _mask.mask
                    bool_mask = np.array(mask).astype(bool)
                    t_value = param.value().get_tensor()
                    value = np.array(t_value).astype("float32")
                    groups = _mask._op.attr('groups')
                    # Grouped 4-D conv weight pruned on its input axis:
                    # instead of slicing the tensor, shrink `groups` so the
                    # layer matches the pruned input channels, remembering
                    # the original for `restore`.
                    if dims == 1 and groups is not None and groups > 1 and len(
                            value.shape) == 4:
                        filter_size = value.shape[1]
                        # Number of kept entries must split evenly into
                        # whole groups.
                        except_num = np.sum(bool_mask)
                        assert (except_num % filter_size == 0)
                        new_groups = int(except_num / filter_size)
                        sub_layer._origin_groups = sub_layer._groups
                        sub_layer._groups = new_groups
                        _logger.info(
                            "change groups from {} to {} for {}.".format(
                                groups, new_groups, param.name))
                        # Weight tensor itself is left untouched.
                        continue
                    # The name of buffer can not contains "."
                    backup_name = param.name.replace(".", "_") + "_backup"
                    if backup_name not in sub_layer._buffers:
                        sub_layer.register_buffer(backup_name,
                                                  paddle.to_tensor(value))
                        _logger.debug(
                            "Backup values of {} into buffers.".format(
                                param.name))
                    # save optimizer accumulators into layer buffer
                    self._buffer_opt(param.name, sub_layer, opt)
                    # Keep only the masked entries along axis `dims`.
                    pruned_value = np.apply_along_axis(
                        lambda data: data[bool_mask], dims, value)
                    self._prune_opt(param.name, dims, bool_mask, opt)
                    p = t_value._place()
                    if p.is_cpu_place():
                        place = paddle.CPUPlace()
                    elif p.is_cuda_pinned_place():
                        place = paddle.CUDAPinnedPlace()
                    else:
                        p = core.Place()
                        p.set_place(t_value._place())
                        place = paddle.CUDAPlace(p.gpu_device_id())
                    t_value.set(pruned_value, place)
                    # for training: drop stale gradients of the old shape
                    if param.trainable:
                        param.clear_gradient()
def test_cuda_pinned_place(self):
    """A tensor created in pinned memory reports a pinned place, while a
    strided slice of it (and the source afterwards) does not."""
    with paddle.fluid.dygraph.guard():
        pinned = paddle.to_tensor(
            np.random.randn(2, 10), place=paddle.CUDAPinnedPlace())
        self.assertTrue(pinned.place.is_cuda_pinned_place())
        strided = pinned[:, ::2]
        # After slicing, neither tensor reports a pinned place.
        self.assertFalse(pinned.place.is_cuda_pinned_place())
        self.assertFalse(strided.place.is_cuda_pinned_place())
def test_skip_data_transform(self):
    """full_like on a pinned-memory input still yields all-ones output."""
    paddle.disable_static()
    with _test_eager_guard():
        pinned_input = paddle.to_tensor(
            [1., 2., 3., 4.], place=paddle.CUDAPinnedPlace())
        filled = paddle.full_like(pinned_input, 1.)
        self.assertTrue(
            (filled.numpy() == np.ones([4]).astype(np.float32)).all(), True)
    paddle.enable_static()
def imperative_apply(self, model):
    """
    Pruning values of variable imperatively. It is valid when pruning
    on one dimension.

    For every parameter with registered masks: back up the original
    values into the layer's buffers, slice the tensor along the single
    masked dimension, and — for depthwise Conv2D — shrink `groups` to
    match the pruned channel count.

    Args:
        model: paddle Layer whose parameters are pruned in place.
    """
    for name, sub_layer in model.named_sublayers():
        for param in sub_layer.parameters(include_sublayers=False):
            if param.name in self._masks:
                for _mask in self._masks[param.name]:
                    dims = _mask.dims
                    mask = _mask.mask
                    assert len(
                        dims
                    ) == 1, "Imperative mode only support for pruning on one dimension, but get dims {} when pruning parameter {}".format(
                        dims, param.name)
                    t_value = param.value().get_tensor()
                    value = np.array(t_value).astype("float32")
                    # The name of buffer can not contains "."
                    backup_name = param.name.replace(".", "_") + "_backup"
                    if backup_name not in sub_layer._buffers:
                        sub_layer.register_buffer(backup_name,
                                                  paddle.to_tensor(value))
                        _logger.debug(
                            "Backup values of {} into buffers.".format(
                                param.name))
                    # Keep only the masked entries along the single axis.
                    bool_mask = np.array(mask).astype(bool)
                    pruned_value = np.apply_along_axis(
                        lambda data: data[bool_mask], dims[0], value)
                    p = t_value._place()
                    if p.is_cpu_place():
                        place = paddle.CPUPlace()
                    elif p.is_cuda_pinned_place():
                        place = paddle.CUDAPinnedPlace()
                    else:
                        p = core.Place()
                        p.set_place(t_value._place())
                        place = paddle.CUDAPlace(p.gpu_device_id())
                    t_value.set(pruned_value, place)
                    # Depthwise conv (groups > 1, 4-D weight): scale groups
                    # by the kept fraction of the mask, remembering the
                    # original for a later restore.
                    if isinstance(sub_layer, paddle.nn.layer.conv.Conv2D
                                  ) and sub_layer._groups > 1 and len(
                                      param.shape) == 4:
                        assert param.shape[
                            1] == 1, "It just supports depthwise conv2d when groups > 1."
                        new_groups = int(
                            bool_mask.sum() * sub_layer._groups /
                            len(bool_mask))
                        _logger.debug(
                            "Update groups of depthwise conv2d form {} to {}"
                            .format(sub_layer._groups, new_groups))
                        sub_layer._origin_groups = sub_layer._groups
                        sub_layer._groups = new_groups
                    # for training: drop stale gradients of the old shape
                    if param.trainable:
                        param.clear_gradient()
def test_strided_slice_tensor_array_cuda_pinned_place(self):
    """Reversed-slice of a tensor array inside a to_static net runs with
    pinned-memory inputs; the result is not in pinned memory."""
    if not paddle.device.is_compiled_with_cuda():
        return
    with paddle.fluid.dygraph.guard():

        class ArrayNet(paddle.nn.Layer):
            def __init__(self):
                super(ArrayNet, self).__init__()

            def forward(self, inps):
                arr = None
                for pos, item in enumerate(inps):
                    idx = paddle.full(
                        shape=[1], dtype='int64', fill_value=pos)
                    if arr is None:
                        arr = paddle.tensor.array_write(item, i=idx)
                    else:
                        paddle.tensor.array_write(item, i=idx, array=arr)
                forward_cat = paddle.concat(arr)
                reversed_cat = paddle.concat(arr[::-1])
                return forward_cat + reversed_cat * reversed_cat

        net = ArrayNet()
        static_fn = paddle.jit.to_static(net.forward)
        pinned_a = paddle.to_tensor(
            np.random.randn(2, 10),
            place=paddle.CUDAPinnedPlace(),
            stop_gradient=False)
        pinned_b = paddle.to_tensor(
            np.random.randn(2, 10),
            place=paddle.CUDAPinnedPlace(),
            stop_gradient=False)
        self.assertTrue(pinned_a.place.is_cuda_pinned_place())
        self.assertTrue(pinned_b.place.is_cuda_pinned_place())
        result = static_fn([pinned_a, pinned_b])
        self.assertFalse(result.place.is_cuda_pinned_place())
def test_place_2(self):
    """Comparing a tensor on one device against 0 works when data lives on
    a different (pinned) place than the execution place."""
    run_place = paddle.CPUPlace()
    tensor_place = run_place
    if core.is_compiled_with_cuda():
        run_place = paddle.CUDAPlace(0)
        tensor_place = paddle.CUDAPinnedPlace()
    paddle.disable_static(run_place)
    source = np.array([9], dtype="int64")
    tensor = paddle.to_tensor(source, place=tensor_place)
    comparison = tensor == 0
    # 9 == 0 is False.
    self.assertEqual((comparison.numpy() == np.array([False])).all(), True)
def func_setUp(self):
    """Prepare the tensors and CUDA stream shared by the tests.

    NOTE(review): src/buffer use pinned host memory and a cuda.Stream —
    presumably exercised by async-copy tests; confirm against the callers.
    """
    # Zero-length int64 index tensor kept on the CPU.
    self.empty = paddle.to_tensor(
        np.array([], dtype="int64"), place=paddle.CPUPlace())
    host_data = np.random.randn(100, 50, 50).astype("float32")
    # Source tensor placed in CUDA pinned (page-locked) host memory.
    self.src = paddle.to_tensor(host_data, place=paddle.CUDAPinnedPlace())
    self.dst = paddle.empty(shape=[100, 50, 50], dtype="float32")
    # Gather indices explicitly moved to the CPU.
    self.index = paddle.to_tensor(
        np.array([1, 3, 5, 7, 9], dtype="int64")).cpu()
    self.buffer = paddle.empty(
        shape=[50, 50, 50], dtype="float32").pin_memory()
    self.stream = cuda.Stream()
def prune_params(model, param_config, super_model_sd=None):
    """Slice each parameter of `model` down to the channel counts given in
    `param_config`, optionally copying the kept slice from a super-network
    state dict instead of from the parameter's own values.

    Args:
        model: paddle Layer whose parameters are overwritten in place.
        param_config (dict): maps a parameter name to one or two expansion
            ratios; a ratio of None keeps the full extent of that axis.
        super_model_sd: optional state dict of a larger model; when given,
            kept values are taken from it rather than from `model`.

    Fixed: comparisons against None used `==`/`!=` (PEP 8 E711; also wrong
    for objects overriding __eq__) — replaced with `is`/`is not`.
    """
    use_super = super_model_sd is not None
    for name, param in model.named_parameters():
        t_value = param.value().get_tensor()
        value = np.array(t_value).astype("float32")
        if use_super:
            super_t_value = super_model_sd[name].value().get_tensor()
            super_value = np.array(super_t_value).astype("float32")
        if param.name in param_config.keys():
            if len(param_config[param.name]) > 1:
                # Two ratios: prune both input (axis 0) and output (axis 1).
                in_exp = param_config[param.name][0]
                out_exp = param_config[param.name][1]
                in_chn = int(value.shape[0]) if in_exp is None else int(
                    value.shape[0] * in_exp)
                out_chn = int(value.shape[1]) if out_exp is None else int(
                    value.shape[1] * out_exp)
                prune_value = super_value[:in_chn, :out_chn, ...] \
                    if use_super else value[:in_chn, :out_chn, ...]
            else:
                # Single ratio: prune axis 0 only.
                out_chn = int(value.shape[0]) if param_config[
                    param.name][0] is None else int(
                        value.shape[0] * param_config[param.name][0])
                prune_value = super_value[:out_chn, ...] \
                    if use_super else value[:out_chn, ...]
        else:
            # Unlisted parameter: copied whole (from the super net if given).
            prune_value = super_value if use_super else value
        p = t_value._place()
        if p.is_cpu_place():
            place = paddle.CPUPlace()
        elif p.is_cuda_pinned_place():
            place = paddle.CUDAPinnedPlace()
        else:
            place = paddle.CUDAPlace(p.gpu_device_id())
        t_value.set(prune_value, place)
        # Drop stale gradients of the old shape.
        if param.trainable:
            param.clear_gradient()
def train(args):
    """Train the TSN video model per `args`/config: build data loaders,
    run the epoch loop with logging, checkpoint per epoch, and optionally
    validate and keep the best weights.

    Args:
        args: parsed CLI namespace (config, use_data_parallel, use_gpu,
            pretrain, resume, checkpoint, weights, validate, log_interval).
    """
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')
    use_data_parallel = args.use_data_parallel
    paddle.disable_static(paddle.CUDAPlace(0))
    place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) \
        if use_data_parallel else paddle.CUDAPlace(0)
    if use_data_parallel:
        paddle.distributed.init_parallel_env()
    video_model = TSN_ResNet(train_config)
    if use_data_parallel:
        video_model = paddle.DataParallel(video_model)
    # Load pretrained weights into the model on every rank.
    pre_state_dict = paddle.load(args.pretrain)
    #if paddle.distributed.parallel.Env().local_rank == 0:
    video_model = init_model(video_model, pre_state_dict)
    optimizer = create_optimizer(train_config.TRAIN, video_model.parameters())
    # Per-card batch size = global batch size / number of visible GPUs.
    bs_denominator = 1
    if args.use_gpu:
        # check number of GPUs
        gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
        if gpus == "":
            pass
        else:
            gpus = gpus.split(",")
            num_gpus = len(gpus)
            bs_denominator = num_gpus
    bs_train_single = int(train_config.TRAIN.batch_size / bs_denominator)
    bs_val_single = int(valid_config.VALID.batch_size / bs_denominator)
    train_dataset = TSN_UCF101_Dataset(train_config, 'train')
    val_dataset = TSN_UCF101_Dataset(valid_config, 'valid')
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=bs_train_single,
        shuffle=train_config.TRAIN.use_shuffle,
        drop_last=True)
    train_loader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        places=place,
        num_workers=train_config.TRAIN.num_workers,
        return_list=True)
    val_sampler = DistributedBatchSampler(val_dataset, batch_size=bs_val_single)
    val_loader = DataLoader(
        val_dataset,
        batch_sampler=val_sampler,
        places=place,
        num_workers=valid_config.VALID.num_workers,
        return_list=True)
    # resume training the model
    if args.resume is not None:
        model_state, opt_state = paddle.load(args.resume)
        video_model.set_dict(model_state)
        optimizer.set_dict(opt_state)
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    for epoch in range(1, train_config.TRAIN.epoch + 1):
        epoch_start = time.time()
        video_model.train()
        total_loss = 0.0
        total_acc1 = 0.0
        total_acc5 = 0.0
        total_sample = 0
        batch_start = time.time()
        for batch_id, data in enumerate(train_loader):
            reader_cost_averager.record(time.time() - batch_start)
            # Stage batches through pinned host memory.
            imgs = paddle.to_tensor(data[0], place=paddle.CUDAPinnedPlace())
            labels = paddle.to_tensor(data[1], place=paddle.CUDAPinnedPlace())
            labels.stop_gradient = True
            outputs = video_model(imgs)
            loss = F.cross_entropy(
                input=outputs, label=labels, ignore_index=-1)
            avg_loss = paddle.mean(loss)
            acc_top1 = paddle.metric.accuracy(
                input=outputs, label=labels, k=1)
            acc_top5 = paddle.metric.accuracy(
                input=outputs, label=labels, k=5)
            dy_out = avg_loss.numpy()[0]
            if use_data_parallel:
                # (data_parallel step5/6)
                avg_loss = video_model.scale_loss(avg_loss)
                avg_loss.backward()
                video_model.apply_collective_grads()
            else:
                avg_loss.backward()
            # NOTE(review): `minimize` already applies the gradient update in
            # dygraph, so the following `step()` looks like it applies the
            # update a second time — confirm whether both calls are intended.
            optimizer.minimize(avg_loss)
            optimizer.step()
            optimizer.clear_grad()
            total_loss += dy_out
            total_acc1 += acc_top1.numpy()[0]
            total_acc5 += acc_top5.numpy()[0]
            total_sample += 1
            batch_cost_averager.record(
                time.time() - batch_start, num_samples=bs_train_single)
            if batch_id % args.log_interval == 0:
                print(
                    'TRAIN Epoch: {}, iter: {}, loss={:.6f}, acc1 {:.6f}, acc5 {:.6f}, batch_cost: {:.5f} sec, reader_cost: {:.5f} sec, ips: {:.5f} samples/sec'
                    .format(epoch, batch_id, total_loss / total_sample,
                            total_acc1 / total_sample,
                            total_acc5 / total_sample,
                            batch_cost_averager.get_average(),
                            reader_cost_averager.get_average(),
                            batch_cost_averager.get_ips_average()))
                batch_cost_averager.reset()
                reader_cost_averager.reset()
            batch_start = time.time()
        train_epoch_cost = time.time() - epoch_start
        print(
            'TRAIN End, Epoch {}, avg_loss= {:.6f}, avg_acc1= {:.6f}, avg_acc5= {:.6f}, epoch_cost: {:.5f} sec'
            .format(epoch, total_loss / total_sample,
                    total_acc1 / total_sample, total_acc5 / total_sample,
                    train_epoch_cost))
        # save model's and optimizer's parameters which used for resuming
        # the training stage
        save_parameters = (not use_data_parallel) or (
            use_data_parallel and
            paddle.distributed.ParallelEnv().local_rank == 0)
        if save_parameters:
            model_path_pre = "_tsn"
            if not os.path.isdir(args.checkpoint):
                os.makedirs(args.checkpoint)
            model_path = os.path.join(
                args.checkpoint,
                "_" + model_path_pre + "_epoch{}".format(epoch))
            # NOTE(review): both state dicts are saved to the same path, so
            # the optimizer save overwrites the model save — confirm the
            # intended checkpoint format (resume above expects a single file
            # yielding (model_state, opt_state)).
            paddle.save(video_model.state_dict(), model_path)
            paddle.save(optimizer.state_dict(), model_path)
        if args.validate:
            video_model.eval()
            val_acc = val(epoch, video_model, val_loader, valid_config, args)
            # save the best parameters in trainging stage
            if epoch == 1:
                best_acc = val_acc
            else:
                if val_acc > best_acc:
                    best_acc = val_acc
                    if paddle.distributed.ParallelEnv().local_rank == 0:
                        if not os.path.isdir(args.weights):
                            os.makedirs(args.weights)
                        paddle.save(video_model.state_dict(),
                                    args.weights + "/final")
        else:
            # NOTE(review): uses paddle.distributed.parallel.Env() while the
            # branch above uses ParallelEnv() — presumably the same rank
            # lookup via different APIs; confirm and unify.
            if paddle.distributed.parallel.Env().local_rank == 0:
                if not os.path.isdir(args.weights):
                    os.makedirs(args.weights)
                paddle.save(video_model.state_dict(),
                            args.weights + "/final")
    logger.info('[TRAIN] training finished')
def test_api(self):
    """_memcpy to CUDAPinnedPlace preserves values and reports the
    gpu_pinned place."""
    source = paddle.ones([1024, 1024])
    pinned = paddle.tensor.creation._memcpy(source, paddle.CUDAPinnedPlace())
    self.assertEqual(pinned.place.__repr__(), "Place(gpu_pinned)")
    self.assertTrue(np.array_equal(source.numpy(), pinned.numpy()))