Example #1
    def restore(self, model, opt=None):
        for name, sub_layer in model.named_sublayers(include_self=True):
            for param in sub_layer.parameters(include_sublayers=False):
                # restore optimizer accumulators from layer buffer
                self._restore_opt(param.name, sub_layer, opt)
                backup_name = "_".join(
                    [param.name.replace(".", "_"), "backup"])
                if backup_name in sub_layer._buffers:
                    _logger.debug("Restore values of variable: {}".format(
                        param.name))
                    t_value = param.value().get_tensor()
                    t_backup = sub_layer._buffers[backup_name].value(
                    ).get_tensor()

                    p = t_value._place()
                    if p.is_cpu_place():
                        place = paddle.CPUPlace()
                    elif p.is_cuda_pinned_place():
                        place = paddle.CUDAPinnedPlace()
                    else:
                        p = core.Place()
                        p.set_place(t_value._place())
                        place = paddle.CUDAPlace(p.gpu_device_id())

                    t_value.set(np.array(t_backup).astype("float32"), place)
                    if "_origin_groups" in sub_layer.__dict__:
                        sub_layer._groups = sub_layer._origin_groups
                    del sub_layer._buffers[backup_name]
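The device-resolution branch above (CPU place, CUDA-pinned place, or CUDA place derived from the tensor's low-level place) is repeated verbatim in every example below. Here is a minimal sketch of factoring it into a helper; the name _tensor_place is hypothetical and only assumes the same paddle.fluid.core calls already used in these examples:

import paddle
from paddle.fluid import core

def _tensor_place(t_value):
    # Map the low-level place of a tensor back to a paddle Place object so
    # that Tensor.set() writes the new value to the original device.
    p = t_value._place()
    if p.is_cpu_place():
        return paddle.CPUPlace()
    if p.is_cuda_pinned_place():
        return paddle.CUDAPinnedPlace()
    p = core.Place()
    p.set_place(t_value._place())
    return paddle.CUDAPlace(p.gpu_device_id())

With such a helper, the restore step reduces to t_value.set(np.array(t_backup).astype("float32"), _tensor_place(t_value)).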
Example #2
    def _prune_opt(self, param_name, dims, bool_mask, opt):
        if opt is None:
            return
        for k, v in opt._accumulators.items():
            var_tmp = v.get(param_name)
            # NOTE: var_tmp.shape == [1] skips scalar accumulators such as
            # beta1_pow_acc in the Adam optimizer; a one-value variable does
            # not need to be pruned.
            if var_tmp is None or var_tmp.shape == [1]:
                if var_tmp is not None:
                    _logger.debug("Skip pruning {} with shape {}".format(
                        var_tmp.name, var_tmp.shape))
                continue
            t_value = var_tmp.value().get_tensor()
            value = np.array(t_value).astype("float32")

            pruned_value = np.apply_along_axis(lambda data: data[bool_mask],
                                               dims, value)

            p = t_value._place()
            if p.is_cpu_place():
                place = paddle.CPUPlace()
            elif p.is_cuda_pinned_place():
                place = paddle.CUDAPinnedPlace()
            else:
                p = core.Place()
                p.set_place(t_value._place())
                place = paddle.CUDAPlace(p.gpu_device_id())

            t_value.set(pruned_value, place)
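As a plain NumPy illustration of the pruning step above (independent of Paddle), np.apply_along_axis with a boolean mask drops the masked entries along the chosen dimension; shapes here are chosen only for illustration:

import numpy as np

value = np.arange(12, dtype="float32").reshape(3, 4)  # e.g. an accumulator of shape [3, 4]
bool_mask = np.array([True, False, True])             # keep entries 0 and 2 along dim 0

pruned = np.apply_along_axis(lambda data: data[bool_mask], 0, value)
print(pruned.shape)  # (2, 4): the masked-out slice along dim 0 is removed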
Example #3
    def lazy_apply(self, model):
        for name, sub_layer in model.named_sublayers():
            for param in sub_layer.parameters(include_sublayers=False):
                if param.name in self._masks:
                    for _mask in self._masks[param.name]:
                        dims = _mask.dims
                        mask = _mask.mask
                        t_value = param.value().get_tensor()
                        value = np.array(t_value).astype("float32")
                        # The name of a buffer cannot contain "."
                        backup_name = param.name.replace(".", "_") + "_backup"
                        if backup_name not in sub_layer._buffers:
                            sub_layer.register_buffer(backup_name,
                                                      paddle.to_tensor(value))
                            _logger.debug(
                                "Backup values of {} into buffers.".format(
                                    param.name))
                        expand_mask_shape = [1] * len(value.shape)
                        expand_mask_shape[dims] = value.shape[dims]
                        _logger.debug("Expanded mask shape: {}".format(
                            expand_mask_shape))
                        expand_mask = mask.reshape(expand_mask_shape).astype(
                            "float32")

                        p = t_value._place()
                        if p.is_cpu_place():
                            place = paddle.CPUPlace()
                        elif p.is_cuda_pinned_place():
                            place = paddle.CUDAPinnedPlace()
                        else:
                            p = core.Place()
                            p.set_place(t_value._place())
                            place = paddle.CUDAPlace(p.gpu_device_id())

                        t_value.set(value * expand_mask, place)
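The expanded mask used by lazy_apply simply broadcasts a per-channel mask over the remaining dimensions, so masked channels are zeroed in place rather than removed. A small NumPy sketch of that broadcast, with shapes chosen only for illustration:

import numpy as np

value = np.ones((4, 3, 2, 2), dtype="float32")  # e.g. conv weights [out, in, kh, kw]
mask = np.array([1, 0, 1, 0])                   # per-output-channel mask, dims == 0

expand_mask_shape = [1] * value.ndim
expand_mask_shape[0] = value.shape[0]           # -> [4, 1, 1, 1]
expand_mask = mask.reshape(expand_mask_shape).astype("float32")

masked = value * expand_mask                    # channels 1 and 3 are zeroed
print(masked[1].max(), masked[0].max())         # 0.0 1.0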
Example #4
    def restore(self, model):
        for name, sub_layer in model.named_sublayers():
            for param in sub_layer.parameters(include_sublayers=False):
                backup_name = "_".join(
                    [param.name.replace(".", "_"), "backup"])
                if backup_name in sub_layer._buffers:
                    _logger.debug("Restore values of variable: {}".format(
                        param.name))
                    t_value = param.value().get_tensor()
                    t_backup = sub_layer._buffers[backup_name].value(
                    ).get_tensor()

                    p = t_value._place()
                    if p.is_cpu_place():
                        place = paddle.CPUPlace()
                    elif p.is_cuda_pinned_place():
                        place = paddle.CUDAPinnedPlace()
                    else:
                        p = core.Place()
                        p.set_place(t_value._place())
                        place = paddle.CUDAPlace(p.gpu_device_id())

                    t_value.set(np.array(t_backup).astype("float32"), place)

                    if isinstance(sub_layer, paddle.nn.layer.conv.Conv2D):
                        if sub_layer._groups > 1:
                            _logger.debug(
                                "Update groups of conv form {} to {}".format(
                                    sub_layer._groups,
                                    t_value.shape()[0]))
                            sub_layer._groups = t_value.shape()[0]
                    del sub_layer._buffers[backup_name]
Example #5
    def _restore_opt(self, param_name, sub_layer, opt):
        if opt is None:
            return
        for k, v in opt._accumulators.items():
            var_tmp = v.get(param_name)
            if var_tmp is None: continue
            backup_name = var_tmp.name.replace(".", "_") + "_backup"
            if backup_name in sub_layer._buffers:
                _logger.debug("Restore values of variable: {}".format(
                    var_tmp.name))
                t_value = var_tmp.value().get_tensor()
                t_backup = sub_layer._buffers[backup_name].value().get_tensor()

                p = t_value._place()
                if p.is_cpu_place():
                    place = paddle.CPUPlace()
                elif p.is_cuda_pinned_place():
                    place = paddle.CUDAPinnedPlace()
                else:
                    p = core.Place()
                    p.set_place(t_value._place())
                    place = paddle.CUDAPlace(p.gpu_device_id())

                t_value.set(np.array(t_backup).astype("float32"), place)
                del sub_layer._buffers[backup_name]
Example #6
    def test_standalone_executor_statistics(self):
        if os.getenv("FLAGS_static_executor_perfstat_filepath") is None:
            return

        paddle.seed(2020)
        main_program, startup_program, fetch_list = build_program()
        fetch_list = [x.name for x in fetch_list]

        p = core.Place()
        p.set_place(self.place)
        executor = StandaloneExecutor(p, startup_program.desc,
                                      main_program.desc, core.Scope())

        helper_profiler = profiler.Profiler(
            targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2))
        helper_profiler.start()
        for i in range(self.iter_n):
            executor.run({}, fetch_list)
            helper_profiler.step()
        helper_profiler.stop()

        perfstat_filepath = os.environ[
            'FLAGS_static_executor_perfstat_filepath']
        self.assertTrue(os.path.exists(perfstat_filepath))
        with open(perfstat_filepath, 'r') as load_f:
            stat_res = json.load(load_f)
            self.assertTrue(len(stat_res) > 0)

        os.remove(perfstat_filepath)
        shutil.rmtree('./profiler_log')
Example #7
    def imperative_apply(self, model, opt=None):
        """
        Prune the values of variables imperatively. It is only valid when
        pruning along one dimension.
        """

        for name, sub_layer in model.named_sublayers(include_self=True):
            for param in sub_layer.parameters(include_sublayers=False):
                if param.name in self._masks:
                    for _mask in self._masks[param.name]:
                        dims = _mask.dims
                        assert (isinstance(dims, int))
                        mask = _mask.mask
                        bool_mask = np.array(mask).astype(bool)
                        t_value = param.value().get_tensor()
                        value = np.array(t_value).astype("float32")
                        groups = _mask._op.attr('groups')
                        if dims == 1 and groups is not None and groups > 1 and len(
                                value.shape) == 4:
                            filter_size = value.shape[1]
                            except_num = np.sum(bool_mask)
                            assert (except_num % filter_size == 0)
                            new_groups = int(except_num / filter_size)
                            sub_layer._origin_groups = sub_layer._groups
                            sub_layer._groups = new_groups
                            _logger.info(
                                "change groups from {} to {} for {}.".format(
                                    groups, new_groups, param.name))
                            continue

                        # The name of a buffer cannot contain "."
                        backup_name = param.name.replace(".", "_") + "_backup"
                        if backup_name not in sub_layer._buffers:
                            sub_layer.register_buffer(backup_name,
                                                      paddle.to_tensor(value))
                            _logger.debug(
                                "Backup values of {} into buffers.".format(
                                    param.name))
                        # save optimizer accumulators into layer buffer
                        self._buffer_opt(param.name, sub_layer, opt)

                        pruned_value = np.apply_along_axis(
                            lambda data: data[bool_mask], dims, value)
                        self._prune_opt(param.name, dims, bool_mask, opt)

                        p = t_value._place()
                        if p.is_cpu_place():
                            place = paddle.CPUPlace()
                        elif p.is_cuda_pinned_place():
                            place = paddle.CUDAPinnedPlace()
                        else:
                            p = core.Place()
                            p.set_place(t_value._place())
                            place = paddle.CUDAPlace(p.gpu_device_id())
                        t_value.set(pruned_value, place)

                    # for training
                    if param.trainable:
                        param.clear_gradient()
Example #8
    def imperative_apply(self, model):
        """
        Prune the values of variables imperatively. It is only valid when
        pruning along one dimension.
        """

        for name, sub_layer in model.named_sublayers():
            for param in sub_layer.parameters(include_sublayers=False):
                if param.name in self._masks:
                    for _mask in self._masks[param.name]:
                        dims = _mask.dims
                        mask = _mask.mask
                        assert len(
                            dims
                        ) == 1, "Imperative mode only support for pruning on one dimension, but get dims {} when pruning parameter {}".format(
                            dims, param.name)
                        t_value = param.value().get_tensor()
                        value = np.array(t_value).astype("float32")
                        # The name of a buffer cannot contain "."
                        backup_name = param.name.replace(".", "_") + "_backup"
                        if backup_name not in sub_layer._buffers:
                            sub_layer.register_buffer(backup_name,
                                                      paddle.to_tensor(value))
                            _logger.debug(
                                "Backup values of {} into buffers.".format(
                                    param.name))
                        bool_mask = np.array(mask).astype(bool)
                        pruned_value = np.apply_along_axis(
                            lambda data: data[bool_mask], dims[0], value)
                        p = t_value._place()
                        if p.is_cpu_place():
                            place = paddle.CPUPlace()
                        elif p.is_cuda_pinned_place():
                            place = paddle.CUDAPinnedPlace()
                        else:
                            p = core.Place()
                            p.set_place(t_value._place())
                            place = paddle.CUDAPlace(p.gpu_device_id())

                        t_value.set(pruned_value, place)
                        if isinstance(sub_layer, paddle.nn.layer.conv.Conv2D
                                      ) and sub_layer._groups > 1 and len(
                                          param.shape) == 4:
                            assert param.shape[
                                1] == 1, "It just supports depthwise conv2d when groups > 1."
                            new_groups = int(bool_mask.sum() *
                                             sub_layer._groups /
                                             len(bool_mask))
                            _logger.debug(
                                "Update groups of depthwise conv2d form {} to {}"
                                .format(sub_layer._groups, new_groups))
                            sub_layer._origin_groups = sub_layer._groups
                            sub_layer._groups = new_groups

                    # for training
                    if param.trainable:
                        param.clear_gradient()
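The depthwise-conv branch above shrinks groups in proportion to the number of surviving channels. A quick numeric check of that formula, with values chosen only for illustration:

import numpy as np

groups = 8                                      # depthwise conv: groups == in_channels
bool_mask = np.array([True] * 6 + [False] * 2)  # 6 of the 8 channels are kept

new_groups = int(bool_mask.sum() * groups / len(bool_mask))
print(new_groups)  # 6: groups shrink together with the pruned channels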
Example #9
def _set_var(var, ndarray):
    t = global_scope().find_var(var.name).get_tensor()
    p = t._place()
    if p.is_cpu_place():
        place = core.CPUPlace()
    elif p.is_cuda_pinned_place():
        place = core.CUDAPinnedPlace()
    else:
        p = core.Place()
        p.set_place(t._place())
        place = core.CUDAPlace(p.gpu_device_id())
    t.set(ndarray, place)
Example #10
    def run_new_executor(self):
        paddle.seed(2020)
        main_program, startup_program, fetch_list = build_program()
        fetch_list = [x.name for x in fetch_list]

        p = core.Place()
        p.set_place(self.place)
        inter_core = StandaloneExecutor(p, startup_program.desc,
                                        main_program.desc, core.Scope())
        outs = []
        for i in range(self.iter_n):
            outs.append(
                np.array(inter_core.run({}, fetch_list)._move_to_list()[0]))
        return outs
Example #11
    def _construct_grad_feed_map_from_forward(self, place, fwd_res,
                                              grad_op_desc, op_grad_to_var):
        """Generate grad_feed_map for grad_program.

        Since we don't really check gradient accuracy, but only the consistency
        between inplace and non-inplace execution, we use the forward outputs
        (and sometimes the inputs) to construct the grad inputs.

        Args:
            place (CPUPlace | CUDAPlace): The place where the op runs. 
            fwd_res (tuple): The outputs of its forward op, in the same form as returns of _calc_outputs() when for_inplace_test is True.
                i.e., tuple(fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc)
            grad_op_desc (OpDesc): The OpDesc of grad op.
            op_grad_to_var (dict): The relation of variables in grad op and its fwd_op. 

        Returns:
            grad_feed_map (dict): The feed_map of grad_op.
        """
        fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc = fwd_res
        p = core.Place()
        p.set_place(place)
        grad_feed_map = {}
        for arg in grad_op_desc.input_arg_names():
            if arg in fwd_feed_map.keys():
                grad_feed_map[arg] = fwd_feed_map[arg]._copy(p)
            else:
                fwd_var_name = op_grad_to_var.get(arg, None)
                if fwd_var_name is None:
                    fwd_var_name = arg

                for i, out_name in enumerate(fwd_fetch_list):
                    if out_name == fwd_var_name:
                        # don't feed variables whose tensors hold no buffer (shape contains 0 like shape = [0,2,5] and holder_ is NULL), like XShape in reshape2 op.
                        # get them from global_scope directly since we have set them persistable in fwd execution
                        if 0 in fwd_program.global_block().var(out_name).shape:
                            continue
                        else:
                            grad_feed_map[arg] = fwd_outs[i]._copy(p)
        return grad_feed_map
Example #12
def grad(outputs,
         inputs,
         grad_outputs=None,
         retain_graph=None,
         create_graph=False,
         only_inputs=True,
         allow_unused=False,
         no_grad_vars=None):
    ''' 
    .. note::
        **This API is ONLY available in imperative mode.**

    This API computes the sum of gradients of `outputs` with respect to each `inputs` .

    Parameters:
        outputs (Tensor|list(Tensor)|tuple(Tensor)): the output Tensor or 
            Tensor list/tuple of the graph to compute gradients.
        inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or 
            Tensor list/tuple of the graph to compute gradients. The returned
            values of this API are the gradients of `inputs` . 
        grad_outputs (Tensor|list(Tensor|None)|tuple(Tensor|None), optional): 
            initial gradient values of `outputs` . If `grad_outputs` is None, 
            the initial gradient values of `outputs` would be Tensors filled with 1; 
            if `grad_outputs` is not None, it must have the same length as `outputs` , 
            and in this case, the initial gradient value of the i-th `outputs` would
            be: (1) a Tensor filled with 1 when the i-th element of `grad_outputs` 
            is None; (2) the i-th element of `grad_outputs` when the i-th element of
            `grad_outputs` is a Tensor. Default None.
        retain_graph (bool, optional): whether to retain the forward graph which 
            is used to calculate the gradient. When it is True, the graph would 
            be retained, in which way users can calculate backward twice for the 
            same graph. When it is False, the graph would be freed. Default None,
            which means it is equal to `create_graph` . 
        create_graph (bool, optional): whether to create the gradient graphs of
            the computing process. When it is True, higher order derivatives are
            supported to compute; when it is False, the gradient graphs of the
            computing process would be discarded. Default False.
        only_inputs (bool, optional): whether to only compute the gradients of
            `inputs` . If it is False, the gradients of all remaining leaf 
            Tensors in the graph would be also computed and accumulated. 
            If it is True, only the gradients of `inputs` would be computed.
            Default True. only_inputs=False is under development, and it is
            not supported yet.    
        allow_unused (bool, optional): whether to raise error or return None if some 
            Tensors of `inputs` are unreachable in the graph. If some Tensors of 
            `inputs` are unreachable in the graph (i.e., their gradients are None),  
            error would be raised if allow_unused=False, or None would be returned as
            their gradients if allow_unused=True. Default False.
        no_grad_vars (Tensor|list(Tensor)|tuple(Tensor)|set(Tensor), optional): 
            the Tensors whose gradients are not needed to compute. Default None.

    Returns:
        list: a list of Tensors, whose length is the same as the Tensor number 
        inside `inputs`, and the i-th returned Tensor is the sum of gradients of 
        `outputs` with respect to the i-th `inputs`.

    Examples 1:
        .. code-block:: python

            import paddle

            def test_dygraph_grad(create_graph):
                x = paddle.ones(shape=[1], dtype='float32')
                x.stop_gradient = False
                y = x * x

                # Since y = x * x, dx = 2 * x
                dx = paddle.grad(
                        outputs=[y],
                        inputs=[x],
                        create_graph=create_graph,
                        retain_graph=True)[0]

                z = y + dx

                # If create_graph = False, the gradient of dx
                # would not be backpropagated. Therefore,
                # z = x * x + dx, and x.gradient() = 2 * x = 2.0

                # If create_graph = True, the gradient of dx
                # would be backpropagated. Therefore,
                # z = x * x + dx = x * x + 2 * x, and
                # x.gradient() = 2 * x + 2 = 4.0

                z.backward()
                return x.gradient()

            print(test_dygraph_grad(create_graph=False)) # [2.]
            print(test_dygraph_grad(create_graph=True)) # [4.]

    Examples 2:
        .. code-block:: python

            import paddle

            def test_dygraph_grad(grad_outputs=None):
                x = paddle.to_tensor(2.0)
                x.stop_gradient = False

                y1 = x * x
                y2 = x * 3 

                # If grad_outputs=None, dy1 = [1], dy2 = [1].
                # If grad_outputs=[g1, g2], then:
                #    - dy1 = [1] if g1 is None else g1
                #    - dy2 = [1] if g2 is None else g2

                # Since y1 = x * x, dx = 2 * x * dy1.
                # Since y2 = x * 3, dx = 3 * dy2.
                # Therefore, the final result would be:
                # dx = 2 * x * dy1 + 3 * dy2 = 4 * dy1 + 3 * dy2.

                dx = paddle.grad(
                    outputs=[y1, y2], 
                    inputs=[x],
                    grad_outputs=grad_outputs)[0]

                return dx.numpy()

            grad_value = paddle.to_tensor(4.0)
            # dy1 = [1], dy2 = [1]
            print(test_dygraph_grad(None)) # [7.]

            # dy1 = [1], dy2 = [4]
            print(test_dygraph_grad([None, grad_value])) # [16.]

            # dy1 = [4], dy2 = [1]
            print(test_dygraph_grad([grad_value, None])) # [19.]

            # dy1 = [3], dy2 = [4]
            grad_y1 = paddle.to_tensor(3.0)
            print(test_dygraph_grad([grad_y1, grad_value])) # [24.]
    '''
    def check_in_out(in_out_list, name):
        assert in_out_list is not None, "{} should not be None".format(name)

        if isinstance(in_out_list, (list, tuple)):
            assert len(in_out_list) > 0, "{} cannot be empty".format(name)
            for each_var in in_out_list:
                if _in_eager_without_dygraph_check():
                    assert isinstance(
                        each_var, core.eager.Tensor
                    ), "Elements of {} must be Tensor".format(name)
                else:
                    assert isinstance(
                        each_var, core.VarBase
                    ), "Elements of {} must be Variable".format(name)
            return in_out_list
        else:
            if _in_eager_without_dygraph_check():
                assert isinstance(
                    in_out_list, core.eager.Tensor
                ), "{} must be Tensor or list of Tensor".format(name)
            else:
                assert isinstance(
                    in_out_list, core.VarBase
                ), "{} must be Variable or list of Variable".format(name)
            return [in_out_list]

    outputs = check_in_out(outputs, 'outputs')
    inputs = check_in_out(inputs, 'inputs')

    if grad_outputs is not None:
        if not isinstance(grad_outputs, (list, tuple)):
            grad_outputs = [grad_outputs]

        for each_var in grad_outputs:
            if each_var is not None:
                if _in_eager_without_dygraph_check():
                    assert isinstance(
                        each_var, core.eager.Tensor
                    ), "grad_outputs must be None, a Variable or a list containing None or Variables"
                else:
                    assert isinstance(
                        each_var, core.VarBase
                    ), "grad_outputs must be None, a Variable or a list containing None or Variables"
    else:
        grad_outputs = []

    if len(grad_outputs) > 0:
        assert len(grad_outputs) == len(
            outputs), "The length of grad_outputs must be equal to outputs"

    if no_grad_vars is None:
        no_grad_vars = []
    elif isinstance(no_grad_vars, (core.VarBase, core.eager.Tensor)):
        no_grad_vars = [no_grad_vars]
    elif isinstance(no_grad_vars, (list, tuple, set)):
        no_grad_vars = list(no_grad_vars)
        for var in no_grad_vars:
            if _in_eager_without_dygraph_check():
                assert isinstance(
                    var,
                    core.eager.Tensor), "no_grad_vars can only contains Tensor"
            else:
                assert isinstance(
                    var,
                    core.VarBase), "no_grad_vars can only contains Variable"
    else:
        if _in_eager_without_dygraph_check():
            raise AssertionError(
                "no_grad_vars must be None, Tensor or list/tuple/set of Tensors"
            )
        else:
            raise AssertionError(
                "no_grad_vars must be None, Variable or list/tuple/set of Variables"
            )

    assert isinstance(create_graph, bool), "create_graph must be True or False"

    if retain_graph is None:
        retain_graph = create_graph

    assert isinstance(retain_graph,
                      bool), "retain_graph must be None, True or False"

    assert isinstance(allow_unused, bool), "allow_unused must be True or False"

    assert isinstance(only_inputs, bool), "only_inputs must be True or False"
    assert only_inputs, "only_inputs=False is not supported yet"

    if _in_eager_without_dygraph_check():
        return core.eager.run_partial_grad(outputs, inputs, grad_outputs,
                                           retain_graph, create_graph,
                                           only_inputs, allow_unused,
                                           no_grad_vars)
    else:
        place = core.Place()
        place.set_place(framework._current_expected_place())
        return core.dygraph_partial_grad(inputs, outputs, grad_outputs,
                                         no_grad_vars, place, create_graph,
                                         retain_graph, allow_unused,
                                         only_inputs)
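The allow_unused behavior documented above can be seen with a minimal dygraph snippet; this sketch is not part of the original docstring examples:

import paddle

x = paddle.to_tensor(2.0, stop_gradient=False)
unused = paddle.to_tensor(3.0, stop_gradient=False)
y = x * x  # `unused` never enters the graph

# With allow_unused=True, unreachable inputs get None gradients instead of an error.
dx, du = paddle.grad(outputs=[y], inputs=[x, unused], allow_unused=True)
print(float(dx))  # 4.0, since dy/dx = 2 * x
print(du)         # None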
Example #13
    def setUp(self):
        place = paddle.CUDAPlace(
            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
        self.place = core.Place()
        self.place.set_place(place)
Example #14
    def test(self):
        p = core.Place()
        p.set_place(paddle.NPUPlace(0))

        self.assertTrue(p.is_npu_place())
        self.assertEqual(p.npu_device_id(), 0)
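For comparison with the NPU checks above, the same core.Place wrapper can be pointed at CPU or GPU places; this is a minimal sketch assuming the is_gpu_place() predicate is exposed alongside the other place predicates used in these examples:

import paddle
from paddle.fluid import core

p = core.Place()
p.set_place(paddle.CPUPlace())
print(p.is_cpu_place())        # True

if core.is_compiled_with_cuda():
    p.set_place(paddle.CUDAPlace(0))
    print(p.is_gpu_place())    # True
    print(p.gpu_device_id())   # 0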