Example #1
    def __init__(self,
                 name=None,
                 channel_num=None,
                 quant_bits=8,
                 quant_axis=0,
                 dtype='float32',
                 quant_on_weight=False):
        assert quant_on_weight == True, "Channel-wise quantization can only be used for weights."
        super(FakeQuantChannelWiseAbsMax, self).__init__()
        self._quant_bits = quant_bits
        self._quant_axis = quant_axis
        self._dtype = dtype
        self._name = name
        self._channel_num = channel_num
        scale_prefix = "{}.scale".format(
            name) if name else 'quant_dequant.scale'
        self._scale_name = unique_name.generate(scale_prefix)
        if quant_on_weight:
            # one scale value per channel along the quantization axis
            scale_attr = ParamAttr(
                name=self._scale_name,
                initializer=Constant(0.0),
                trainable=False)
            self._scale = self.create_parameter(
                shape=[self._channel_num], attr=scale_attr, dtype=self._dtype)
            self._scale.stop_gradient = True
        else:
            self._scale = None
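
Every example on this page uses unique_name.generate to turn a readable prefix into a collision-free parameter name. A minimal sketch of that behaviour through the public paddle.utils.unique_name module (the exact numeric suffixes depend on how many names were generated before):

import paddle

# each call appends a fresh numeric suffix to the given prefix
print(paddle.utils.unique_name.generate("quant_dequant.scale"))  # e.g. quant_dequant.scale_0
print(paddle.utils.unique_name.generate("quant_dequant.scale"))  # e.g. quant_dequant.scale_1

# a guard scopes the name counters, so generation can be made reproducible
with paddle.utils.unique_name.guard():
    print(paddle.utils.unique_name.generate("quant_dequant.scale"))  # counter restarts here
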
Example #2
    def _margin_softmax(input, label, out_dim, param_attr, margin1, margin2,
                        margin3, scale, sample_ratio):
        input_norm = paddle.sqrt(
            paddle.sum(paddle.square(input), axis=1, keepdim=True))
        input = paddle.divide(input, input_norm)

        if param_attr is None:
            param_attr = paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierNormal(fan_in=0.0))
        weight = paddle.static.create_parameter(
            shape=[input.shape[1], out_dim],
            dtype='float32',
            name=unique_name.generate('final_fc_w'),
            attr=param_attr)

        if sample_ratio < 1.0:
            # partial fc sample process
            label, sampled_class_index = class_center_sample(
                label, out_dim, ratio=sample_ratio, ignore_label=-1)
            sampled_class_index.stop_gradient = True
            weight = paddle.gather(weight, sampled_class_index, axis=1)
            out_dim = paddle.shape(sampled_class_index)

        weight_norm = paddle.sqrt(
            paddle.sum(paddle.square(weight), axis=0, keepdim=True))
        weight = paddle.divide(weight, weight_norm)
        cos = paddle.matmul(input, weight)

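        # combined margin: cos(margin1 * theta + margin2) - margin3; e.g.
        # (1.0, 0.5, 0.0) gives the ArcFace margin, (1.0, 0.0, 0.35) the CosFace margin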
        theta = paddle.acos(cos)
        if margin1 != 1.0:
            theta = margin1 * theta
        if margin2 != 0.0:
            theta = theta + margin2
        margin_cos = paddle.cos(theta)
        if margin3 != 0.0:
            margin_cos = margin_cos - margin3

        one_hot = paddle.nn.functional.one_hot(label, num_classes=out_dim)
        diff = paddle.multiply(paddle.subtract(margin_cos, cos), one_hot)
        target_cos = paddle.add(cos, diff)
        logit = paddle.scale(target_cos, scale=scale)

        loss, prob = paddle.nn.functional.softmax_with_cross_entropy(
            logits=logit,
            label=paddle.reshape(label, (-1, 1)),
            return_softmax=True)
        avg_loss = paddle.mean(x=loss)

        one_hot.stop_gradient = True

        return avg_loss, prob
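
The margin block above only rewrites the logit of the target class: cos(margin1*theta + margin2) - margin3 replaces cos(theta) wherever the one-hot mask is 1. A NumPy sketch of the same transformation (shapes and values are illustrative):

import numpy as np

def combined_margin_logits(cos, label, margin1=1.0, margin2=0.5, margin3=0.0, scale=64.0):
    """Apply cos(margin1*theta + margin2) - margin3 to the target-class logits only."""
    theta = np.arccos(np.clip(cos, -1.0, 1.0))
    margin_cos = np.cos(margin1 * theta + margin2) - margin3
    one_hot = np.eye(cos.shape[1])[label]              # (batch, num_classes)
    target_cos = cos + (margin_cos - cos) * one_hot    # same as multiply/subtract/add above
    return scale * target_cos

cos = np.array([[0.9, 0.1], [0.2, 0.8]])               # cosine similarities, 2 samples, 2 classes
print(combined_margin_logits(cos, label=np.array([0, 1])))
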
Example #3
    def __init__(self,
                 name=None,
                 moving_rate=0.9,
                 quant_bits=8,
                 dtype='float32'):
        super(FakeQuantMovingAverageAbsMax, self).__init__()
        self._moving_rate = moving_rate
        self._quant_bits = quant_bits

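        # running output scale; `accum` (numerator) and `state` (denominator)
        # below track the moving average of max(|x|) across forward passes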
        scale_prefix = "{}.scale".format(
            name) if name else 'quant_dequant.scale'
        scale_attr = ParamAttr(
            name=unique_name.generate(scale_prefix),
            initializer=Constant(0.001),
            trainable=False)
        self._scale = self.create_parameter(
            shape=[1], attr=scale_attr, dtype=dtype)
        self._scale.stop_gradient = True

        state_prefix = "{}.state".format(
            name) if name else 'quant_dequant.state'
        state_attr = ParamAttr(
            name=unique_name.generate(state_prefix),
            initializer=Constant(1),
            trainable=False)
        self._state = self.create_parameter(
            shape=[1], attr=state_attr, dtype=dtype)
        self._state.stop_gradient = True

        accum_prefix = "{}.accum".format(
            name) if name else 'quant_dequant.accum'
        accum_attr = ParamAttr(
            name=unique_name.generate(accum_prefix),
            initializer=Constant(1),
            trainable=False)
        self._accum = self.create_parameter(
            shape=[1], attr=accum_attr, dtype=dtype)
        self._accum.stop_gradient = True
Example #4
    def __init__(self, name=None, moving_rate=0.9, dtype='float32'):
        r"""
        MovingAverageAbsMaxScale layer is used to calculate the output quantization
        scale of a Layer. Its computational formula is described below:

        :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)`
        :math:`Out = X`
        """
        super(MovingAverageAbsMaxScale, self).__init__()
        self._moving_rate = moving_rate

        scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
        scale_name = unique_name.generate(scale_prefix)
        scale_attr = ParamAttr(
            name=scale_name, initializer=Constant(0), trainable=False)
        self._scale = self.create_parameter(
            shape=[1], attr=scale_attr, dtype=dtype)
        self._scale.stop_gradient = True

        state_prefix = "{}.state".format(name) if name else 'outscale.state'
        state_attr = ParamAttr(
            name=unique_name.generate(state_prefix),
            initializer=Constant(0),
            trainable=False)
        self._state = self.create_parameter(
            shape=[1], attr=state_attr, dtype=dtype)
        self._state.stop_gradient = True

        accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
        accum_attr = ParamAttr(
            name=unique_name.generate(accum_prefix),
            initializer=Constant(0),
            trainable=False)
        self._accum = self.create_parameter(
            shape=[1], attr=accum_attr, dtype=dtype)
        self._accum.stop_gradient = True
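
The docstring's formula can be read as an exponential moving average whose numerator (accum) and denominator (state) are stored separately. A plain-Python sketch of one update step, assuming it runs once per forward pass:

import numpy as np

def update_moving_average_abs_max(x, state, accum, moving_rate=0.9):
    """One step of scale = (moving_rate*accum + max|x|) / (moving_rate*state + 1)."""
    accum = moving_rate * accum + np.abs(x).max()
    state = moving_rate * state + 1.0
    return accum / state, state, accum

scale, state, accum = 0.0, 0.0, 0.0
for step in range(3):
    x = np.random.randn(4, 8).astype('float32')
    scale, state, accum = update_moving_average_abs_max(x, state, accum)
    print(step, scale)
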
Example #5
    def __init__(self,
                 name=None,
                 quant_bits=8,
                 dtype='float32',
                 quant_on_weight=False):
        super(FakeQuantAbsMax, self).__init__()
        self._quant_bits = quant_bits
        self._name = name
        self._dtype = dtype
        scale_prefix = "{}.scale".format(
            name) if name else 'quant_dequant.scale'
        self._scale_name = unique_name.generate(scale_prefix)
        if quant_on_weight:
            scale_attr = ParamAttr(
                name=self._scale_name,
                initializer=Constant(0.001),
                trainable=False)
            self._scale = self.create_parameter(
                shape=[1], attr=scale_attr, dtype=self._dtype)
            self._scale.stop_gradient = True
        else:
            self._scale = None
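
For context, abs-max fake quantization keeps the tensor in floating point but rounds it onto a signed integer grid whose range comes from the abs-max scale. A NumPy sketch of the quantize/dequantize round trip (a simplification of the layer's forward pass, not the actual kernel):

import numpy as np

def fake_quant_dequant_abs_max(x, quant_bits=8):
    """Quantize with scale = max(|x|) onto [-bnt, bnt], then dequantize."""
    scale = np.abs(x).max()
    bnt = (1 << (quant_bits - 1)) - 1                  # 127 for 8 bits
    q = np.round(x / scale * bnt)
    return q / bnt * scale                             # same shape, small rounding error

w = np.random.randn(3, 3).astype('float32')
print(np.abs(w - fake_quant_dequant_abs_max(w)).max())
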
Example #6
def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
                  dist_context):
    """Parse op desc sequence and insert op in the block"""
    global HAS_SENT
    global HAS_RECV
    global HAS_ALLGATHER
    tensor_list = []
    partition_tensor_list = []
    if rank_id not in op_desc_seq.keys():
        return
    op_desc_list = op_desc_seq[rank_id]
    block = program.global_block()
    assert var_name in block.vars.keys(), \
        "Variable {} cannot be found in the program of rank {}.".format(var_name, rank_id)

    idx = None
    for index, op in list(enumerate(block.ops)):
        if op.desc.id == reshard_op.desc.id:
            idx = index
            break
    assert idx is not None, "The op for reshard cannot be found in the rank {} program.".format(
        rank_id)

    matched_op = block.ops[idx]
    source_tensor = block.vars[var_name]
    for op_desc in op_desc_list:
        if isinstance(op_desc, AllGatherOpDesc):  # noqa: F401
            if var_name not in HAS_ALLGATHER.keys():
                HAS_ALLGATHER[var_name] = []
            if not HAS_ALLGATHER[var_name] or op_desc.group not in list(
                    map(lambda x: x[0], HAS_ALLGATHER[var_name])):
                tensor_list, idx_offset = _insert_allgather_op(
                    block, idx, source_tensor, op_desc.group)
                idx += idx_offset
                tensor_name_list = [var.name for var in tensor_list]
                HAS_ALLGATHER[var_name].append(
                    [op_desc.group, tensor_name_list])
            else:
                for item in HAS_ALLGATHER[var_name]:
                    if op_desc.group == item[0]:
                        tensor_list = [
                            program.global_block().vars[var_name]
                            for var_name in item[1]
                        ]
                        break
            assert tensor_list, "The result of parsing allgather op should not be None."

        elif isinstance(op_desc, SendOpDesc):
            if var_name not in HAS_SENT.keys():
                HAS_SENT[var_name] = []
            if op_desc.dst not in HAS_SENT[var_name]:
                _insert_send_op(block, idx, source_tensor, op_desc.dst)
                idx += 1
                HAS_SENT[var_name].append(op_desc.dst)

        elif isinstance(op_desc, RecvOpDesc):
            if var_name not in HAS_RECV.keys():
                HAS_RECV[var_name] = {}
            if op_desc.src not in HAS_RECV[var_name].keys():
                partition_index = op_desc.partition_index
                shape = []
                for index in partition_index:
                    shape.append(index[1] - index[0])
                recv_tensor = block.create_var(
                    name=unique_name.generate(var_name + "@recv"),
                    shape=shape,
                    dtype=source_tensor.dtype)
                _insert_recv_op(block, idx, recv_tensor, op_desc.src)
                tensor_list.append(recv_tensor)
                idx += 1
                HAS_RECV[var_name][op_desc.src] = recv_tensor
            else:
                tensor_list.append(HAS_RECV[var_name][op_desc.src])

        elif isinstance(op_desc, ConcatOpDesc):
            partition_index_list = op_desc.partition_index_list
            idx_list = [idx]
            for index, tensor in enumerate(tensor_list):
                _concat_partitions_with_op(partition_tensor_list, tensor,
                                           partition_index_list[index], block,
                                           idx_list)
            idx = idx_list[0]

        elif isinstance(op_desc, SliceOpDesc):
            assert len(partition_tensor_list) == 1 or not partition_tensor_list
            to_slice_tensor = partition_tensor_list[0][0] if len(
                partition_tensor_list) == 1 else source_tensor
            new_name = unique_name.generate(var_name + "@RESHARD")
            target_tensor = _insert_slice_op(block,
                                             idx,
                                             to_slice_tensor,
                                             starts=op_desc.starts,
                                             ends=op_desc.ends,
                                             axes=op_desc.axes,
                                             new_var_name=new_name)

            tensor_attr = TensorDistributedAttribute()
            process_mesh = dist_context.get_op_dist_attr_for_program(
                matched_op).process_mesh
            dims_mapping = dist_context.get_op_dist_attr_for_program(
                matched_op).get_input_dims_mapping(var_name)
            tensor_attr.dims_mapping = dims_mapping
            tensor_attr.process_mesh = process_mesh
            dist_context.set_tensor_dist_attr_for_program(
                target_tensor, tensor_attr)

            # rename op input name according to new name
            for op in block.ops:
                for name in op.input_arg_names:
                    op_dist_attr = dist_context.get_op_dist_attr_for_program(
                        op)
                    if name == var_name and op_dist_attr is not None:
                        op_process_mesh = op_dist_attr.process_mesh
                        op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(
                            var_name)
                        if op_process_mesh == process_mesh and op_input_dims_mapping == dims_mapping:
                            op.desc._rename_input(name, target_tensor.name)
                            op_dist_attr.set_input_dims_mapping(
                                target_tensor.name, dims_mapping)
                            op_dist_attr.set_input_dist_attr(name, None)
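
In the RecvOpDesc branch above, the shape of the tensor to receive is derived from partition_index, which stores the [start, end) interval owned along each axis. A tiny illustration of that derivation (the interval values are made up):

# each entry of partition_index is the [start, end) interval along one axis
partition_index = [[0, 4], [2, 6]]
shape = [end - start for start, end in partition_index]
print(shape)  # [4, 4]
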
Example #7
    def __call__(self, var, block=None):
        """Initialize the input tensor with dirac initializer.

        Args:
            var(Tensor): Tensor that needs to be initialized.
            block(Block, optional): The block in which initialization ops
                   should be added. Used in static graph only, default None.

        Returns:
            The most critical op (scatter) in this initializer; the whole
            initialization inserts 7 to 8 ops in total.
        """
        block = self._check_block(block)
        assert isinstance(var, framework.Parameter)
        assert isinstance(block, framework.Block)
        check_variable_and_dtype(var, "Out",
                                 ['float16', 'bfloat16', 'float32', 'float64'],
                                 'Dirac')

        assert len(var.shape) in [3, 4, 5], \
            "Only Tensor with 3/4/5 dimensions can be initialized by Dirac"
        assert var.shape[0] % self._groups == 0, \
            "Tensor 0-dimension must be divisible by groups"

        if var.dtype != VarDesc.VarType.FP32:
            out_var = block.create_var(name=unique_name.generate(".".join(
                ['dirac', var.name, 'tmp'])),
                                       shape=var.shape,
                                       dtype=VarDesc.VarType.FP32,
                                       type=VarDesc.VarType.LOD_TENSOR,
                                       persistable=False)
        else:
            out_var = var
        op = None
        if framework.in_dygraph_mode():
            with fluid.dygraph.no_grad():
                _C_ops.fill_constant(out_var, 'value', float(0), 'force_cpu',
                                     False,
                                     'dtype', out_var.dtype, 'str_value',
                                     str(float(0)), 'shape', out_var.shape)
        else:
            block.append_op(type='fill_constant',
                            inputs={},
                            outputs={'Out': out_var},
                            attrs={
                                'value': float(0),
                                'dtype': out_var.dtype,
                                'shape': out_var.shape,
                            },
                            stop_gradient=True)

        origin_shape = var.shape
        num_per_group = origin_shape[0] // self._groups
        min_shape = min(num_per_group, origin_shape[1])

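        # build flat indices of the entries that should become 1.0:
        # output channel j + i * num_per_group, input channel j, spatial center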
        idx_list = []
        value_list = []
        strides = []
        prod = 1
        for dim in reversed(origin_shape):
            strides.insert(0, prod)
            prod *= dim
        for i in range(self._groups):
            for j in range(min_shape):
                value_list.append(1.0)
                offset = 0
                for (k, stride) in enumerate(strides):
                    if (k == 0):
                        offset += (j + i * num_per_group) * stride
                    elif (k == 1):
                        offset += j * stride
                    else:
                        offset += origin_shape[k] // 2 * stride
                idx_list.append(offset)
        if framework.in_dygraph_mode():
            with fluid.dygraph.no_grad():
                tmp_out, _ = _C_ops.reshape2(out_var, None, 'shape', [-1])
                tmp_out._share_underline_tensor_to(out_var)
        else:
            x_shape = block.create_var(name=unique_name.generate(".".join(
                [out_var.name, "XShape"])),
                                       dtype=out_var.dtype,
                                       shape=out_var.shape,
                                       type=VarDesc.VarType.LOD_TENSOR,
                                       persistable=False,
                                       stop_gradient=True)
            block.append_op(type="reshape2",
                            inputs={"X": out_var},
                            attrs={'shape': [-1]},
                            outputs={
                                "Out": out_var,
                                "XShape": x_shape
                            },
                            stop_gradient=True)

        index_tensor = block.create_var(
            name=unique_name.generate('scatter_index'),
            persistable=False,
            stop_gradient=True)

        if framework.in_dygraph_mode():
            with fluid.dygraph.no_grad():
                tmp_tensor = framework._varbase_creator()
                _C_ops.assign_value(tmp_tensor, 'shape', [len(idx_list)],
                                    'dtype', VarDesc.VarType.INT64,
                                    'int64_values', idx_list)
                tmp_tensor._share_underline_tensor_to(index_tensor)
        else:
            block.append_op(type='assign_value',
                            outputs={'Out': index_tensor},
                            attrs={
                                'dtype': VarDesc.VarType.INT64,
                                'shape': [len(idx_list)],
                                'int64_values': idx_list
                            },
                            stop_gradient=True)

        value_tensor = block.create_var(
            name=unique_name.generate('scatter_value'),
            persistable=False,
            stop_gradient=True)

        if framework.in_dygraph_mode():
            with fluid.dygraph.no_grad():
                tmp_tensor = framework._varbase_creator()
                _C_ops.assign_value(tmp_tensor, 'shape', [len(value_list)],
                                    'dtype', VarDesc.VarType.FP32,
                                    'fp32_values', value_list)
                tmp_tensor._share_underline_tensor_to(value_tensor)
        else:
            block.append_op(type='assign_value',
                            outputs={'Out': value_tensor},
                            attrs={
                                'dtype': VarDesc.VarType.FP32,
                                'shape': [len(value_list)],
                                'fp32_values': value_list
                            },
                            stop_gradient=True)

        if framework.in_dygraph_mode():
            with fluid.dygraph.no_grad():
                tmp_out = _C_ops.final_state_scatter(out_var, index_tensor,
                                                     value_tensor, True)
                tmp_out._share_underline_tensor_to(out_var)
                tmp_reshape_out, _ = _C_ops.reshape2(out_var, None, 'shape',
                                                     origin_shape)
                tmp_reshape_out._share_underline_tensor_to(out_var)
                if var.dtype != VarDesc.VarType.FP32:
                    tmp_cast_out = _C_ops.cast(out_var, 'in_dtype',
                                               out_var.dtype, 'out_dtype',
                                               var.dtype)
                    tmp_cast_out._share_underline_tensor_to(var)

        else:
            op = block.append_op(type="scatter",
                                 inputs={
                                     "X": out_var,
                                     "Ids": index_tensor,
                                     "Updates": value_tensor
                                 },
                                 attrs={'overwrite': True},
                                 outputs={"Out": out_var},
                                 stop_gradient=True)
            x_shape = block.create_var(name=unique_name.generate(".".join(
                [out_var.name, "XShape"])),
                                       dtype=out_var.dtype,
                                       shape=out_var.shape,
                                       type=VarDesc.VarType.LOD_TENSOR,
                                       persistable=False,
                                       stop_gradient=True)
            block.append_op(type="reshape2",
                            inputs={"X": out_var},
                            attrs={'shape': origin_shape},
                            outputs={
                                "Out": out_var,
                                "XShape": x_shape
                            },
                            stop_gradient=True)
            if var.dtype != VarDesc.VarType.FP32:
                block.append_op(type="cast",
                                inputs={"X": out_var},
                                outputs={"Out": var},
                                attrs={
                                    "in_dtype": out_var.dtype,
                                    "out_dtype": var.dtype
                                },
                                stop_gradient=True)
        if not in_dynamic_mode():
            var.op = op
        return op
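
The strides/offset loop above converts a multi-dimensional index into a flat index of the reshaped 1-D weight. A NumPy check of the positions it targets, assuming a 4-D conv weight of shape [out_channels, in_channels, kH, kW]:

import numpy as np

shape, groups = (4, 2, 3, 3), 2                        # [out_c, in_c, kH, kW]
num_per_group = shape[0] // groups
min_shape = min(num_per_group, shape[1])

w = np.zeros(shape, dtype='float32')
for i in range(groups):
    for j in range(min_shape):
        # the same positions the flat offsets in idx_list point at
        w[j + i * num_per_group, j, shape[2] // 2, shape[3] // 2] = 1.0
print(np.flatnonzero(w.reshape(-1)))                   # the offsets built by idx_list
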
Example #8
    def __call__(self, var, block=None):
        """Initialize the input tensor with orthogonal initializer.

        Args:
            var(Tensor): Tensor that needs to be initialized.
            block(Block, optional): The block in which initialization ops
                   should be added. Used in static graph only, default None.

        Returns:
            The last initialization op; the orthogonal initializer contains
            8 ops in total.
        """
        block = self._check_block(block)
        assert isinstance(var, framework.Parameter)
        assert isinstance(block, framework.Block)
        # 'qr' op only support float32/float64 now
        check_variable_and_dtype(var, "Out", ["float32", "float64"],
                                 "Orthogonal")

        self._seed = block.program.random_seed

        shape = var.shape
        assert len(shape) >= 2, \
            "Only Tensor with 2 or more dimensions can be initialized by Orthogonal"

        row = shape[0]
        col = 1
        for i in shape[1:]:
            col *= i

        flatten_shape = [max(row, col), min(row, col)]

        normal_var = block.create_var(name=unique_name.generate('.'.join(
            ['gaussian_random', 'tmp'])),
                                      dtype=var.dtype,
                                      persistable=False,
                                      stop_gradient=True)
        block.append_op(type='gaussian_random',
                        inputs={},
                        outputs={'Out': normal_var},
                        attrs={
                            'mean': 0.0,
                            'std': 1.0,
                            'shape': flatten_shape,
                            'seed': self._seed,
                            'dtype': var.dtype
                        },
                        stop_gradient=True)

        q = block.create_var(name=unique_name.generate('.'.join(
            ['qr', 'q', 'tmp'])),
                             dtype=normal_var.dtype,
                             persistable=False,
                             stop_gradient=True)
        r = block.create_var(name=unique_name.generate('.'.join(
            ['qr', 'r', 'tmp'])),
                             dtype=normal_var.dtype,
                             persistable=False,
                             stop_gradient=True)
        block.append_op(type='qr',
                        inputs={'X': [normal_var]},
                        outputs={
                            'Q': q,
                            'R': r,
                        },
                        attrs={'mode': 'reduced'},
                        stop_gradient=True)

        r_diag = block.create_var(name=unique_name.generate('.'.join(
            ['diag', 'tmp'])),
                                  dtype=r.dtype,
                                  persistable=False,
                                  stop_gradient=True)
        block.append_op(type='diag_v2',
                        inputs={'X': r},
                        outputs={'Out': r_diag},
                        attrs={
                            'offset': 0,
                            'padding_value': 0
                        },
                        stop_gradient=True)

        r_sign = r_diag
        block.append_op(type='sign',
                        inputs={'X': [r_diag]},
                        outputs={'Out': r_sign},
                        stop_gradient=True)

        block.append_op(type='elementwise_mul',
                        inputs={
                            'X': q,
                            'Y': r_sign
                        },
                        outputs={'Out': q},
                        attrs={},
                        stop_gradient=True)

        x_shape = block.create_var(name=unique_name.generate('.'.join(
            ['transpose', 'shape', 'tmp'])),
                                   dtype=q.dtype,
                                   persistable=False,
                                   stop_gradient=True)
        if row < col:
            q_transpose = block.create_var(name=unique_name.generate('.'.join(
                ['transpose', 'tmp'])),
                                           dtype=q.dtype,
                                           persistable=False,
                                           stop_gradient=True)
            block.append_op(type='transpose2',
                            inputs={'X': q},
                            outputs={
                                'Out': q_transpose,
                                'XShape': x_shape
                            },
                            attrs={'axis': [1, 0]},
                            stop_gradient=True)
            q = q_transpose

        block.append_op(type='reshape2',
                        inputs={'X': q},
                        outputs={
                            'Out': q,
                            "XShape": x_shape
                        },
                        attrs={'shape': var.shape},
                        stop_gradient=True)

        op = block.append_op(type='scale',
                             inputs={'X': q},
                             outputs={'Out': var},
                             attrs={
                                 'scale': self._gain,
                                 'bias': 0.0
                             })

        return op
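
The op sequence above (gaussian_random -> qr -> diag_v2 -> sign -> elementwise_mul -> optional transpose2 -> reshape2 -> scale) is the usual QR-based orthogonal initialization. A NumPy sketch of the same math, assuming the weight is flattened to 2-D exactly as in the code:

import numpy as np

def orthogonal_init(shape, gain=1.0):
    """QR-based orthogonal init; rows or columns of the result are orthonormal."""
    row, col = shape[0], int(np.prod(shape[1:]))
    a = np.random.normal(0.0, 1.0, (max(row, col), min(row, col)))
    q, r = np.linalg.qr(a, mode='reduced')
    q = q * np.sign(np.diag(r))                        # sign fix makes the decomposition unique
    if row < col:
        q = q.T
    return gain * q.reshape(shape)

w = orthogonal_init((3, 5))
print(np.round(w @ w.T, 6))                            # ~ identity: rows are orthonormal here
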