Esempio n. 1
0
    def restore(self, model, opt=None):
        """Restore parameter values (and optimizer accumulators) from the
        backup buffers saved on each sub-layer, then delete those buffers."""
        for _, layer in model.named_sublayers(include_self=True):
            for param in layer.parameters(include_sublayers=False):
                # restore optimizer accumulators from layer buffer
                self._restore_opt(param.name, layer, opt)
                key = param.name.replace(".", "_") + "_backup"
                if key not in layer._buffers:
                    continue
                _logger.debug("Restore values of variable: {}".format(
                    param.name))
                dst = param.value().get_tensor()
                src = layer._buffers[key].value().get_tensor()

                raw_place = dst._place()
                if raw_place.is_cpu_place():
                    place = paddle.CPUPlace()
                elif raw_place.is_cuda_pinned_place():
                    place = paddle.CUDAPinnedPlace()
                else:
                    gpu = core.Place()
                    gpu.set_place(dst._place())
                    place = paddle.CUDAPlace(gpu.gpu_device_id())

                dst.set(np.array(src).astype("float32"), place)
                # undo the groups rewrite done by imperative_apply, if any
                if "_origin_groups" in layer.__dict__:
                    layer._groups = layer._origin_groups
                del layer._buffers[key]
Esempio n. 2
0
    def _prune_opt(self, param_name, dims, bool_mask, opt):
        """Prune the optimizer accumulators tied to `param_name` along axis
        `dims`, keeping only the entries where `bool_mask` is True.

        Args:
            param_name (str): name of the parameter whose accumulators to prune.
            dims (int): axis along which the mask is applied.
            bool_mask (np.ndarray): boolean keep-mask for that axis.
            opt: optimizer holding the accumulators; no-op when None.
        """
        if opt is None:
            return
        for k, v in opt._accumulators.items():
            var_tmp = v.get(param_name)
            # NOTE: var_tmp.shape == [1] is used to skip variables like
            # beta1_pow_acc in Adam optimizer. Its shape is [1] and there's no
            # need to prune this one-value variable.
            # BUGFIX: removed a leftover debug `print` that dumped the name
            # and shape of every skipped accumulator to stdout.
            if var_tmp is None or var_tmp.shape == [1]:
                continue
            t_value = var_tmp.value().get_tensor()
            value = np.array(t_value).astype("float32")

            pruned_value = np.apply_along_axis(lambda data: data[bool_mask],
                                               dims, value)

            # Write the pruned array back to the tensor on its original device.
            p = t_value._place()
            if p.is_cpu_place():
                place = paddle.CPUPlace()
            elif p.is_cuda_pinned_place():
                place = paddle.CUDAPinnedPlace()
            else:
                p = core.Place()
                p.set_place(t_value._place())
                place = paddle.CUDAPlace(p.gpu_device_id())

            t_value.set(pruned_value, place)
Esempio n. 3
0
    def lazy_apply(self, model):
        """Apply masks by multiplication (zeroing pruned entries in place)
        instead of shrinking tensors; original values are backed up into
        layer buffers so they can be restored later."""
        for _, layer in model.named_sublayers():
            for param in layer.parameters(include_sublayers=False):
                if param.name not in self._masks:
                    continue
                for item in self._masks[param.name]:
                    axis = item.dims
                    keep_mask = item.mask
                    tensor = param.value().get_tensor()
                    values = np.array(tensor).astype("float32")
                    # The name of buffer can not contains "."
                    key = param.name.replace(".", "_") + "_backup"
                    if key not in layer._buffers:
                        layer.register_buffer(key, paddle.to_tensor(values))
                        _logger.debug(
                            "Backup values of {} into buffers.".format(
                                param.name))
                    # Shape the mask for broadcasting along the pruned axis.
                    broadcast_shape = [1] * len(values.shape)
                    broadcast_shape[axis] = values.shape[axis]
                    _logger.debug("Expanded mask shape: {}".format(
                        broadcast_shape))
                    broadcast_mask = keep_mask.reshape(
                        broadcast_shape).astype("float32")

                    raw_place = tensor._place()
                    if raw_place.is_cpu_place():
                        place = paddle.CPUPlace()
                    elif raw_place.is_cuda_pinned_place():
                        place = paddle.CUDAPinnedPlace()
                    else:
                        gpu = core.Place()
                        gpu.set_place(tensor._place())
                        place = paddle.CUDAPlace(gpu.gpu_device_id())

                    tensor.set(values * broadcast_mask, place)
Esempio n. 4
0
    def restore(self, model):
        """Put back the original parameter values previously backed up into
        layer buffers, fix up conv groups where needed, and delete the
        backup buffers."""
        for _, layer in model.named_sublayers():
            for param in layer.parameters(include_sublayers=False):
                key = param.name.replace(".", "_") + "_backup"
                if key not in layer._buffers:
                    continue
                _logger.debug("Restore values of variable: {}".format(
                    param.name))
                dst = param.value().get_tensor()
                src = layer._buffers[key].value().get_tensor()

                raw_place = dst._place()
                if raw_place.is_cpu_place():
                    place = paddle.CPUPlace()
                elif raw_place.is_cuda_pinned_place():
                    place = paddle.CUDAPinnedPlace()
                else:
                    gpu = core.Place()
                    gpu.set_place(dst._place())
                    place = paddle.CUDAPlace(gpu.gpu_device_id())

                dst.set(np.array(src).astype("float32"), place)

                # Restored conv weights regain their original output channel
                # count, so the groups attribute is re-derived from it.
                if isinstance(layer, paddle.nn.layer.conv.Conv2D):
                    if layer._groups > 1:
                        _logger.debug(
                            "Update groups of conv form {} to {}".format(
                                layer._groups,
                                dst.shape()[0]))
                        layer._groups = dst.shape()[0]
                del layer._buffers[key]
Esempio n. 5
0
    def _restore_opt(self, param_name, sub_layer, opt):
        """Restore the optimizer accumulators for `param_name` from the
        backup buffers stored on `sub_layer`; no-op when `opt` is None."""
        if opt is None:
            return
        for _, acc_map in opt._accumulators.items():
            acc_var = acc_map.get(param_name)
            if acc_var is None:
                continue
            key = acc_var.name.replace(".", "_") + "_backup"
            if key not in sub_layer._buffers:
                continue
            _logger.debug("Restore values of variable: {}".format(
                acc_var.name))
            dst = acc_var.value().get_tensor()
            src = sub_layer._buffers[key].value().get_tensor()

            raw_place = dst._place()
            if raw_place.is_cpu_place():
                place = paddle.CPUPlace()
            elif raw_place.is_cuda_pinned_place():
                place = paddle.CUDAPinnedPlace()
            else:
                gpu = core.Place()
                gpu.set_place(dst._place())
                place = paddle.CUDAPlace(gpu.gpu_device_id())

            dst.set(np.array(src).astype("float32"), place)
            del sub_layer._buffers[key]
Esempio n. 6
0
    def imperative_apply(self, model, opt=None):
        """
        Pruning values of variable imperatively. It is valid when pruning
        on one dimension.

        Each pruned parameter (and its optimizer accumulators, when `opt` is
        given) is first backed up into layer buffers so a later restore can
        undo the pruning.
        """

        for name, sub_layer in model.named_sublayers(include_self=True):
            for param in sub_layer.parameters(include_sublayers=False):
                if param.name in self._masks:
                    for _mask in self._masks[param.name]:
                        dims = _mask.dims
                        assert (isinstance(dims, int))
                        mask = _mask.mask
                        bool_mask = np.array(mask).astype(bool)
                        t_value = param.value().get_tensor()
                        value = np.array(t_value).astype("float32")
                        groups = _mask._op.attr('groups')
                        # Grouped 4-D conv weight pruned on the input axis:
                        # shrink the layer's `groups` instead of the tensor
                        # (kept channels must be a multiple of the per-group
                        # filter size), then skip the tensor rewrite below.
                        if dims == 1 and groups is not None and groups > 1 and len(
                                value.shape) == 4:
                            filter_size = value.shape[1]
                            except_num = np.sum(bool_mask)
                            assert (except_num % filter_size == 0)
                            new_groups = int(except_num / filter_size)
                            # remember the original so restore() can undo it
                            sub_layer._origin_groups = sub_layer._groups
                            sub_layer._groups = new_groups
                            _logger.info(
                                "change groups from {} to {} for {}.".format(
                                    groups, new_groups, param.name))
                            continue

                        # The name of buffer can not contains "."
                        backup_name = param.name.replace(".", "_") + "_backup"
                        if backup_name not in sub_layer._buffers:
                            sub_layer.register_buffer(backup_name,
                                                      paddle.to_tensor(value))
                            _logger.debug(
                                "Backup values of {} into buffers.".format(
                                    param.name))
                        # save optimizer accumulators into layer buffer
                        self._buffer_opt(param.name, sub_layer, opt)

                        # Keep only the entries selected by the mask along the
                        # pruned axis; prune the accumulators the same way.
                        pruned_value = np.apply_along_axis(
                            lambda data: data[bool_mask], dims, value)
                        self._prune_opt(param.name, dims, bool_mask, opt)

                        # Write the pruned array back on the original device.
                        p = t_value._place()
                        if p.is_cpu_place():
                            place = paddle.CPUPlace()
                        elif p.is_cuda_pinned_place():
                            place = paddle.CUDAPinnedPlace()
                        else:
                            p = core.Place()
                            p.set_place(t_value._place())
                            place = paddle.CUDAPlace(p.gpu_device_id())
                        t_value.set(pruned_value, place)

                    # for training: drop gradients whose shape no longer
                    # matches the pruned parameter
                    if param.trainable:
                        param.clear_gradient()
Esempio n. 7
0
 def test_cuda_pinned_place(self):
     """Slicing a CUDA-pinned tensor should leave both the source and the
     sliced result off the pinned place."""
     with paddle.fluid.dygraph.guard():
         pinned = paddle.to_tensor(
             np.random.randn(2, 10), place=paddle.CUDAPinnedPlace())
         self.assertTrue(pinned.place.is_cuda_pinned_place())
         sliced = pinned[:, ::2]
         self.assertFalse(pinned.place.is_cuda_pinned_place())
         self.assertFalse(sliced.place.is_cuda_pinned_place())
Esempio n. 8
0
 def test_skip_data_transform(self):
     """full_like on a CUDA-pinned input should run in eager mode and
     produce an all-ones float32 tensor."""
     paddle.disable_static()
     with _test_eager_guard():
         pinned = paddle.to_tensor([1., 2., 3., 4.],
                                   place=paddle.CUDAPinnedPlace())
         result = paddle.full_like(pinned, 1.)
         expected = np.ones([4]).astype(np.float32)
         self.assertTrue((result.numpy() == expected).all(), True)
     paddle.enable_static()
Esempio n. 9
0
    def imperative_apply(self, model):
        """
        Pruning values of variable imperatively. It is valid when pruning
        on one dimension.

        Original values are backed up into layer buffers so they can be
        restored later; depthwise conv layers additionally get their
        `groups` attribute shrunk to match the pruned channel count.
        """

        for name, sub_layer in model.named_sublayers():
            for param in sub_layer.parameters(include_sublayers=False):
                if param.name in self._masks:
                    for _mask in self._masks[param.name]:
                        dims = _mask.dims
                        mask = _mask.mask
                        # Here `dims` is a sequence; exactly one axis may be
                        # pruned in imperative mode.
                        assert len(
                            dims
                        ) == 1, "Imperative mode only support for pruning on one dimension, but get dims {} when pruning parameter {}".format(
                            dims, param.name)
                        t_value = param.value().get_tensor()
                        value = np.array(t_value).astype("float32")
                        # The name of buffer can not contains "."
                        backup_name = param.name.replace(".", "_") + "_backup"
                        if backup_name not in sub_layer._buffers:
                            sub_layer.register_buffer(backup_name,
                                                      paddle.to_tensor(value))
                            _logger.debug(
                                "Backup values of {} into buffers.".format(
                                    param.name))
                        # Keep only the entries selected by the mask along
                        # the single pruned axis.
                        bool_mask = np.array(mask).astype(bool)
                        pruned_value = np.apply_along_axis(
                            lambda data: data[bool_mask], dims[0], value)
                        # Write the pruned array back on the original device.
                        p = t_value._place()
                        if p.is_cpu_place():
                            place = paddle.CPUPlace()
                        elif p.is_cuda_pinned_place():
                            place = paddle.CUDAPinnedPlace()
                        else:
                            p = core.Place()
                            p.set_place(t_value._place())
                            place = paddle.CUDAPlace(p.gpu_device_id())

                        t_value.set(pruned_value, place)
                        # Depthwise conv (groups > 1, 4-D weight): scale the
                        # groups count by the kept-channel ratio and remember
                        # the original so it can be restored.
                        if isinstance(sub_layer, paddle.nn.layer.conv.Conv2D
                                      ) and sub_layer._groups > 1 and len(
                                          param.shape) == 4:
                            assert param.shape[
                                1] == 1, "It just supports depthwise conv2d when groups > 1."
                            new_groups = int(bool_mask.sum() *
                                             sub_layer._groups /
                                             len(bool_mask))
                            _logger.debug(
                                "Update groups of depthwise conv2d form {} to {}"
                                .format(sub_layer._groups, new_groups))
                            sub_layer._origin_groups = sub_layer._groups
                            sub_layer._groups = new_groups

                    # for training: drop gradients whose shape no longer
                    # matches the pruned parameter
                    if param.trainable:
                        param.clear_gradient()
Esempio n. 10
0
    def test_strided_slice_tensor_array_cuda_pinned_place(self):
        """A reversed (strided-slice) tensor array consumed inside a
        to_static program should yield a non-pinned result even when every
        input tensor lives on the CUDA pinned place."""
        if not paddle.device.is_compiled_with_cuda():
            return
        with paddle.fluid.dygraph.guard():

            class Simple(paddle.nn.Layer):
                def __init__(self):
                    super(Simple, self).__init__()

                def forward(self, inps):
                    # Collect the inputs into a tensor array one by one.
                    arr = None
                    for pos, item in enumerate(inps):
                        idx = paddle.full(
                            shape=[1], dtype='int64', fill_value=pos)
                        if arr is None:
                            arr = paddle.tensor.array_write(item, i=idx)
                        else:
                            paddle.tensor.array_write(item, i=idx, array=arr)

                    forward_cat = paddle.concat(arr)
                    reversed_cat = paddle.concat(arr[::-1])
                    return forward_cat + reversed_cat * reversed_cat

            net = Simple()
            func = paddle.jit.to_static(net.forward)

            first = paddle.to_tensor(
                np.random.randn(2, 10),
                place=paddle.CUDAPinnedPlace(),
                stop_gradient=False)
            second = paddle.to_tensor(
                np.random.randn(2, 10),
                place=paddle.CUDAPinnedPlace(),
                stop_gradient=False)

            self.assertTrue(first.place.is_cuda_pinned_place())
            self.assertTrue(second.place.is_cuda_pinned_place())

            result = func([first, second])

            self.assertFalse(result.place.is_cuda_pinned_place())
Esempio n. 11
0
 def test_place_2(self):
     """Equality comparison should work on a tensor whose data place
     (CPU or CUDA-pinned) differs from the default compute place."""
     compute_place = paddle.CPUPlace()
     tensor_place = compute_place
     if core.is_compiled_with_cuda():
         compute_place = paddle.CUDAPlace(0)
         tensor_place = paddle.CUDAPinnedPlace()
     paddle.disable_static(compute_place)
     values = np.array([9], dtype="int64")
     tensor = paddle.to_tensor(values, place=tensor_place)
     result = tensor == 0
     self.assertEqual((result.numpy() == np.array([False])).all(), True)
Esempio n. 12
0
 def func_setUp(self):
     """Build fixtures: an empty CPU tensor, a CUDA-pinned source, a GPU
     destination, a CPU index tensor, a pinned staging buffer, and a CUDA
     stream."""
     self.empty = paddle.to_tensor(
         np.array([], dtype="int64"), place=paddle.CPUPlace())
     src_values = np.random.randn(100, 50, 50).astype("float32")
     self.src = paddle.to_tensor(
         src_values, place=paddle.CUDAPinnedPlace())
     self.dst = paddle.empty(shape=[100, 50, 50], dtype="float32")
     index_values = np.array([1, 3, 5, 7, 9], dtype="int64")
     self.index = paddle.to_tensor(index_values).cpu()
     self.buffer = paddle.empty(
         shape=[50, 50, 50], dtype="float32").pin_memory()
     self.stream = cuda.Stream()
Esempio n. 13
0
def prune_params(model, param_config, super_model_sd=None):
    """Shrink each parameter of `model` to the sub-network shape described by
    `param_config`, optionally copying the kept slice from a super-network
    state dict.

    Args:
        model: model whose parameters are pruned in place.
        param_config (dict): maps parameter name to one or two expand ratios;
            a ratio of None keeps the original size of that axis.
        super_model_sd (dict, optional): super-network state dict; when given,
            values are sliced from it instead of the model's own weights.
    """
    for name, param in model.named_parameters():
        t_value = param.value().get_tensor()
        value = np.array(t_value).astype("float32")

        # BUGFIX(idiom): use `is not None` / `is None` instead of `!=` / `==`
        # comparisons against None throughout this function.
        if super_model_sd is not None:
            super_t_value = super_model_sd[name].value().get_tensor()
            super_value = np.array(super_t_value).astype("float32")

        if param.name in param_config:
            if len(param_config[param.name]) > 1:
                # Two ratios: shrink both the input and the output axis.
                in_exp = param_config[param.name][0]
                out_exp = param_config[param.name][1]
                in_chn = int(value.shape[0]) if in_exp is None else int(
                    value.shape[0] * in_exp)
                out_chn = int(value.shape[1]) if out_exp is None else int(
                    value.shape[1] * out_exp)
                prune_value = super_value[:in_chn, :out_chn, ...] \
                                 if super_model_sd is not None else value[:in_chn, :out_chn, ...]
            else:
                # Single ratio: shrink only the leading axis.
                out_exp = param_config[param.name][0]
                out_chn = int(value.shape[0]) if out_exp is None else int(
                    value.shape[0] * out_exp)
                prune_value = super_value[:out_chn, ...] \
                                 if super_model_sd is not None else value[:out_chn, ...]
        else:
            # Not configured: keep the full tensor (from the super net if given).
            prune_value = super_value if super_model_sd is not None else value

        p = t_value._place()
        if p.is_cpu_place():
            place = paddle.CPUPlace()
        elif p.is_cuda_pinned_place():
            place = paddle.CUDAPinnedPlace()
        else:
            # NOTE(review): unlike the other place-dispatch blocks in this
            # codebase, the raw place is used directly here (no core.Place
            # round-trip) — confirm it exposes gpu_device_id() in this
            # Paddle version.
            place = paddle.CUDAPlace(p.gpu_device_id())
        t_value.set(prune_value, place)
        if param.trainable:
            param.clear_gradient()
Esempio n. 14
0
def train(args):
    """Train a TSN ResNet model on UCF-101.

    Parses the config, builds (optionally distributed) data loaders, runs the
    training loop with periodic logging, saves per-epoch checkpoints, and
    optionally validates and keeps the best weights.

    Args:
        args: parsed CLI namespace; uses config, use_data_parallel, pretrain,
            use_gpu, resume, log_interval, checkpoint, validate and weights.
    """
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')
    use_data_parallel = args.use_data_parallel

    paddle.disable_static(paddle.CUDAPlace(0))

    place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) \
        if use_data_parallel else paddle.CUDAPlace(0)
    if use_data_parallel:
        paddle.distributed.init_parallel_env()

    video_model = TSN_ResNet(train_config)
    if use_data_parallel:
        video_model = paddle.DataParallel(video_model)

    pre_state_dict = paddle.load(args.pretrain)
    #if paddle.distributed.parallel.Env().local_rank == 0:
    video_model = init_model(video_model, pre_state_dict)

    optimizer = create_optimizer(train_config.TRAIN, video_model.parameters())

    bs_denominator = 1
    if args.use_gpu:
        # check number of GPUs
        # BUGFIX: `num_gpus` was unbound (NameError) when CUDA_VISIBLE_DEVICES
        # was empty; default to 1 GPU in that case.
        gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
        num_gpus = 1
        if gpus != "":
            num_gpus = len(gpus.split(","))
        bs_denominator = num_gpus
    bs_train_single = int(train_config.TRAIN.batch_size / bs_denominator)
    bs_val_single = int(valid_config.VALID.batch_size / bs_denominator)

    train_dataset = TSN_UCF101_Dataset(train_config, 'train')
    val_dataset = TSN_UCF101_Dataset(valid_config, 'valid')
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=bs_train_single,
        shuffle=train_config.TRAIN.use_shuffle,
        drop_last=True)
    train_loader = DataLoader(train_dataset,
                              batch_sampler=train_sampler,
                              places=place,
                              num_workers=train_config.TRAIN.num_workers,
                              return_list=True)
    val_sampler = DistributedBatchSampler(val_dataset,
                                          batch_size=bs_val_single)
    val_loader = DataLoader(val_dataset,
                            batch_sampler=val_sampler,
                            places=place,
                            num_workers=valid_config.VALID.num_workers,
                            return_list=True)

    # resume training the model
    # NOTE(review): this expects a single file holding (model, opt) states,
    # which the checkpointing below does not produce — confirm the expected
    # resume format with the checkpoint producer.
    if args.resume is not None:
        model_state, opt_state = paddle.load(args.resume)
        video_model.set_dict(model_state)
        optimizer.set_dict(opt_state)

    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    for epoch in range(1, train_config.TRAIN.epoch + 1):
        epoch_start = time.time()

        video_model.train()
        total_loss = 0.0
        total_acc1 = 0.0
        total_acc5 = 0.0
        total_sample = 0

        batch_start = time.time()
        for batch_id, data in enumerate(train_loader):
            reader_cost_averager.record(time.time() - batch_start)

            imgs = paddle.to_tensor(data[0], place=paddle.CUDAPinnedPlace())
            labels = paddle.to_tensor(data[1], place=paddle.CUDAPinnedPlace())
            labels.stop_gradient = True
            outputs = video_model(imgs)

            loss = F.cross_entropy(input=outputs,
                                   label=labels,
                                   ignore_index=-1)
            avg_loss = paddle.mean(loss)

            acc_top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)
            acc_top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5)
            dy_out = avg_loss.numpy()[0]

            if use_data_parallel:
                # (data_parallel step5/6)
                avg_loss = video_model.scale_loss(avg_loss)
                avg_loss.backward()
                video_model.apply_collective_grads()
            else:
                avg_loss.backward()

            # BUGFIX: `optimizer.minimize(avg_loss)` followed by
            # `optimizer.step()` applied the gradients twice per batch;
            # keep only the 2.x-style step()/clear_grad() pair.
            optimizer.step()
            optimizer.clear_grad()

            total_loss += dy_out
            total_acc1 += acc_top1.numpy()[0]
            total_acc5 += acc_top5.numpy()[0]
            total_sample += 1

            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=bs_train_single)
            if batch_id % args.log_interval == 0:
                print(
                    'TRAIN Epoch: {}, iter: {}, loss={:.6f}, acc1 {:.6f}, acc5 {:.6f}, batch_cost: {:.5f} sec, reader_cost: {:.5f} sec, ips: {:.5f} samples/sec'
                    .format(epoch, batch_id, total_loss / total_sample,
                            total_acc1 / total_sample,
                            total_acc5 / total_sample,
                            batch_cost_averager.get_average(),
                            reader_cost_averager.get_average(),
                            batch_cost_averager.get_ips_average()))
                batch_cost_averager.reset()
                reader_cost_averager.reset()

            batch_start = time.time()

        train_epoch_cost = time.time() - epoch_start
        print(
            'TRAIN End, Epoch {}, avg_loss= {:.6f}, avg_acc1= {:.6f}, avg_acc5= {:.6f}, epoch_cost: {:.5f} sec'
            .format(epoch, total_loss / total_sample,
                    total_acc1 / total_sample, total_acc5 / total_sample,
                    train_epoch_cost))

        # save model's and optimizer's parameters which used for resuming the training stage
        save_parameters = (not use_data_parallel) or (
            use_data_parallel
            and paddle.distributed.ParallelEnv().local_rank == 0)
        if save_parameters:
            model_path_pre = "_tsn"
            if not os.path.isdir(args.checkpoint):
                os.makedirs(args.checkpoint)
            model_path = os.path.join(
                args.checkpoint,
                "_" + model_path_pre + "_epoch{}".format(epoch))
            # BUGFIX: both state dicts were saved to the identical path, so
            # the optimizer checkpoint overwrote the model checkpoint; save
            # them under distinct suffixes.
            paddle.save(video_model.state_dict(), model_path + ".pdparams")
            paddle.save(optimizer.state_dict(), model_path + ".pdopt")

        if args.validate:
            video_model.eval()
            val_acc = val(epoch, video_model, val_loader, valid_config, args)
            # save the best parameters in trainging stage
            if epoch == 1:
                best_acc = val_acc
            else:
                if val_acc > best_acc:
                    best_acc = val_acc
                    if paddle.distributed.ParallelEnv().local_rank == 0:
                        if not os.path.isdir(args.weights):
                            os.makedirs(args.weights)
                        paddle.save(video_model.state_dict(),
                                    args.weights + "/final")
        else:
            if paddle.distributed.parallel.Env().local_rank == 0:
                if not os.path.isdir(args.weights):
                    os.makedirs(args.weights)
                paddle.save(video_model.state_dict(), args.weights + "/final")

    logger.info('[TRAIN] training finished')
Esempio n. 15
0
 def test_api(self):
     """_memcpy should copy a tensor to the CUDA pinned place without
     changing its contents."""
     src = paddle.ones([1024, 1024])
     copied = paddle.tensor.creation._memcpy(src, paddle.CUDAPinnedPlace())
     self.assertEqual(copied.place.__repr__(), "Place(gpu_pinned)")
     self.assertTrue(np.array_equal(src.numpy(), copied.numpy()))