def __init__(self,
             name=None,
             channel_num=None,
             quant_bits=8,
             quant_axis=0,
             dtype='float32',
             quant_on_weight=False):
    assert quant_on_weight, "Channel-wise quantization can only be used for weight quantization."
    super(FakeQuantChannelWiseAbsMax, self).__init__()
    self._quant_bits = quant_bits
    self._quant_axis = quant_axis
    self._dtype = dtype
    self._name = name
    self._channel_num = channel_num
    scale_prefix = "{}.scale".format(
        name) if name else 'quant_dequant.scale'
    self._scale_name = unique_name.generate(scale_prefix)
    if quant_on_weight:
        scale_attr = ParamAttr(
            name=self._scale_name,
            initializer=Constant(0.0),
            trainable=False)
        self._scale = self.create_parameter(
            shape=[self._channel_num], attr=scale_attr, dtype=self._dtype)
        self._scale.stop_gradient = True
    else:
        self._scale = None
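# --- Illustrative sketch (not part of the class above) ---------------------
# FakeQuantChannelWiseAbsMax keeps one scale per channel along `quant_axis`.
# Minimal numpy sketch of that per-channel abs-max scale; the helper name and
# the numpy usage are assumptions for illustration only.
import numpy as np

def channel_wise_abs_max_scale(weight, quant_axis=0):
    """Per-channel scale: max(|w|) reduced over every axis except quant_axis."""
    reduce_axes = tuple(i for i in range(weight.ndim) if i != quant_axis)
    return np.abs(weight).max(axis=reduce_axes)

w = np.random.randn(8, 4, 3, 3).astype('float32')         # e.g. a conv weight
print(channel_wise_abs_max_scale(w, quant_axis=0).shape)   # (8,)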
def _margin_softmax(input, label, out_dim, param_attr, margin1, margin2,
                    margin3, scale, sample_ratio):
    # L2-normalize the input features.
    input_norm = paddle.sqrt(
        paddle.sum(paddle.square(input), axis=1, keepdim=True))
    input = paddle.divide(input, input_norm)

    if param_attr is None:
        param_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierNormal(fan_in=0.0))
    weight = paddle.static.create_parameter(
        shape=[input.shape[1], out_dim],
        dtype='float32',
        name=unique_name.generate('final_fc_w'),
        attr=param_attr)

    if sample_ratio < 1.0:
        # partial fc sample process
        label, sampled_class_index = class_center_sample(
            label, out_dim, ratio=sample_ratio, ignore_label=-1)
        sampled_class_index.stop_gradient = True
        weight = paddle.gather(weight, sampled_class_index, axis=1)
        out_dim = paddle.shape(sampled_class_index)

    # L2-normalize the class weights and compute cosine logits.
    weight_norm = paddle.sqrt(
        paddle.sum(paddle.square(weight), axis=0, keepdim=True))
    weight = paddle.divide(weight, weight_norm)
    cos = paddle.matmul(input, weight)

    # Combined margin: cos(margin1 * theta + margin2) - margin3.
    theta = paddle.acos(cos)
    if margin1 != 1.0:
        theta = margin1 * theta
    if margin2 != 0.0:
        theta = theta + margin2
    margin_cos = paddle.cos(theta)
    if margin3 != 0.0:
        margin_cos = margin_cos - margin3

    # Apply the margin only to the target-class logit, then rescale.
    one_hot = paddle.nn.functional.one_hot(label, num_classes=out_dim)
    diff = paddle.multiply(paddle.subtract(margin_cos, cos), one_hot)
    target_cos = paddle.add(cos, diff)
    logit = paddle.scale(target_cos, scale=scale)

    loss, prob = paddle.nn.functional.softmax_with_cross_entropy(
        logits=logit,
        label=paddle.reshape(label, (-1, 1)),
        return_softmax=True)
    avg_loss = paddle.mean(x=loss)

    one_hot.stop_gradient = True

    return avg_loss, prob
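# --- Illustrative sketch (not part of the function above) ------------------
# The margin block in _margin_softmax applies cos(margin1*theta + margin2) - margin3
# (ArcFace/CosFace-style) to the target-class cosine only. A scalar sketch of
# that transform; the margin values below are illustrative, not defaults.
import math

def combined_margin(cos_t, m1=1.0, m2=0.5, m3=0.0):
    theta = math.acos(cos_t)
    return math.cos(m1 * theta + m2) - m3

# The margin lowers the target-class logit, enforcing a larger angular gap.
print(combined_margin(0.8))  # noticeably smaller than 0.8 with m2 = 0.5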
def __init__(self, name=None, moving_rate=0.9, quant_bits=8, dtype='float32'):
    super(FakeQuantMovingAverageAbsMax, self).__init__()
    self._moving_rate = moving_rate
    self._quant_bits = quant_bits

    scale_prefix = "{}.scale".format(
        name) if name else 'quant_dequant.scale'
    scale_attr = ParamAttr(
        name=unique_name.generate(scale_prefix),
        initializer=Constant(0.001),
        trainable=False)
    self._scale = self.create_parameter(
        shape=[1], attr=scale_attr, dtype=dtype)
    self._scale.stop_gradient = True

    state_prefix = "{}.state".format(
        name) if name else 'quant_dequant.state'
    state_attr = ParamAttr(
        name=unique_name.generate(state_prefix),
        initializer=Constant(1),
        trainable=False)
    self._state = self.create_parameter(
        shape=[1], attr=state_attr, dtype=dtype)
    self._state.stop_gradient = True

    accum_prefix = "{}.accum".format(
        name) if name else 'quant_dequant.accum'
    accum_attr = ParamAttr(
        name=unique_name.generate(accum_prefix),
        initializer=Constant(1),
        trainable=False)
    self._accum = self.create_parameter(
        shape=[1], attr=accum_attr, dtype=dtype)
    self._accum.stop_gradient = True
def __init__(self, name=None, moving_rate=0.9, dtype='float32'):
    r"""
    MovingAverageAbsMaxScale layer is used to calculate the output quantization
    scale of a Layer. Its computational formula is described as below:

    :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)`
    :math:`Out = X`
    """
    super(MovingAverageAbsMaxScale, self).__init__()
    self._moving_rate = moving_rate

    scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
    scale_name = unique_name.generate(scale_prefix)
    scale_attr = ParamAttr(
        name=scale_name, initializer=Constant(0), trainable=False)
    self._scale = self.create_parameter(
        shape=[1], attr=scale_attr, dtype=dtype)
    self._scale.stop_gradient = True

    state_prefix = "{}.state".format(name) if name else 'outscale.state'
    state_attr = ParamAttr(
        name=unique_name.generate(state_prefix),
        initializer=Constant(0),
        trainable=False)
    self._state = self.create_parameter(
        shape=[1], attr=state_attr, dtype=dtype)
    self._state.stop_gradient = True

    accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
    accum_attr = ParamAttr(
        name=unique_name.generate(accum_prefix),
        initializer=Constant(0),
        trainable=False)
    self._accum = self.create_parameter(
        shape=[1], attr=accum_attr, dtype=dtype)
    self._accum.stop_gradient = True
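# --- Illustrative sketch (not part of the class above) ---------------------
# Plain-Python version of the docstring formula
#     scale = (moving_rate * accum + max(|x|)) / (moving_rate * state + 1),
# with accum/state carried across batches (both start at 0, matching the
# constructor). Function and variable names are assumptions for illustration.
def moving_average_abs_max(batches, moving_rate=0.9, accum=0.0, state=0.0):
    scale = None
    for x in batches:
        abs_max = max(abs(v) for v in x)
        accum = moving_rate * accum + abs_max
        state = moving_rate * state + 1
        scale = accum / state
    return scale

print(moving_average_abs_max([[0.5, -2.0], [1.0, 3.5], [-1.5, 0.2]]))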
def __init__(self, name=None, quant_bits=8, dtype='float32',
             quant_on_weight=False):
    super(FakeQuantAbsMax, self).__init__()
    self._quant_bits = quant_bits
    self._name = name
    self._dtype = dtype  # referenced below when creating the scale parameter
    scale_prefix = "{}.scale".format(
        name) if name else 'quant_dequant.scale'
    self._scale_name = unique_name.generate(scale_prefix)
    if quant_on_weight:
        scale_attr = ParamAttr(
            name=self._scale_name,
            initializer=Constant(0.001),
            trainable=False)
        self._scale = self.create_parameter(
            shape=[1], attr=scale_attr, dtype=self._dtype)
        self._scale.stop_gradient = True
    else:
        self._scale = None
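# --- Illustrative sketch (not part of the class above) ---------------------
# The abs-max fake-quant/dequant round trip that a layer like FakeQuantAbsMax
# performs with quant_bits=8; this sketches the arithmetic only, not the
# layer's actual forward pass.
import numpy as np

def fake_quant_dequant_abs_max(x, quant_bits=8):
    scale = np.abs(x).max()                 # per-tensor abs-max scale
    bnt = (1 << (quant_bits - 1)) - 1       # 127 for 8 bits
    q = np.round(x / scale * bnt)           # snap to the integer grid
    return q * scale / bnt                  # dequantized approximation

x = np.random.randn(16).astype('float32')
print(np.abs(x - fake_quant_dequant_abs_max(x)).max())  # small quantization error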
def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
                  dist_context):
    """Parse op desc sequence and insert op in the block"""
    global HAS_SENT
    global HAS_RECV
    global HAS_ALLGATHER
    tensor_list = []
    partition_tensor_list = []
    if rank_id not in op_desc_seq.keys():
        return
    op_desc_list = op_desc_seq[rank_id]
    block = program.global_block()
    assert var_name in block.vars.keys(
    ), "The {} cannot be found in the {} program.".format(var_name, rank_id)

    idx = None
    for index, op in list(enumerate(block.ops)):
        if op.desc.id == reshard_op.desc.id:
            idx = index
            break
    assert idx is not None, "The op for reshard cannot be found in the rank {} program.".format(
        rank_id)

    matched_op = block.ops[idx]
    source_tensor = block.vars[var_name]
    for op_desc in op_desc_list:
        if isinstance(op_desc, AllGatherOpDesc):  # noqa: F401
            if var_name not in HAS_ALLGATHER.keys():
                HAS_ALLGATHER[var_name] = []
            if not HAS_ALLGATHER[var_name] or op_desc.group not in list(
                    map(lambda x: x[0], HAS_ALLGATHER[var_name])):
                tensor_list, idx_offset = _insert_allgather_op(
                    block, idx, source_tensor, op_desc.group)
                idx += idx_offset
                tensor_name_list = [var.name for var in tensor_list]
                HAS_ALLGATHER[var_name].append(
                    [op_desc.group, tensor_name_list])
            else:
                for item in HAS_ALLGATHER[var_name]:
                    if op_desc.group == item[0]:
                        tensor_list = [
                            program.global_block().vars[var_name]
                            for var_name in item[1]
                        ]
                        break
            assert tensor_list, "The result of parsing allgather op should not be None."

        elif isinstance(op_desc, SendOpDesc):
            if var_name not in HAS_SENT.keys():
                HAS_SENT[var_name] = []
            if op_desc.dst not in HAS_SENT[var_name]:
                _insert_send_op(block, idx, source_tensor, op_desc.dst)
                idx += 1
                HAS_SENT[var_name].append(op_desc.dst)

        elif isinstance(op_desc, RecvOpDesc):
            if var_name not in HAS_RECV.keys():
                HAS_RECV[var_name] = {}
            if op_desc.src not in HAS_RECV[var_name].keys():
                partition_index = op_desc.partition_index
                shape = []
                for index in partition_index:
                    shape.append(index[1] - index[0])
                recv_tensor = block.create_var(
                    name=unique_name.generate(var_name + "@recv"),
                    shape=shape,
                    dtype=source_tensor.dtype)
                _insert_recv_op(block, idx, recv_tensor, op_desc.src)
                tensor_list.append(recv_tensor)
                idx += 1
                HAS_RECV[var_name][op_desc.src] = recv_tensor
            else:
                tensor_list.append(HAS_RECV[var_name][op_desc.src])

        elif isinstance(op_desc, ConcatOpDesc):
            partition_index_list = op_desc.partition_index_list
            idx_list = [idx]
            for index, tensor in enumerate(tensor_list):
                _concat_partitions_with_op(partition_tensor_list, tensor,
                                           partition_index_list[index],
                                           block, idx_list)
            idx = idx_list[0]

        elif isinstance(op_desc, SliceOpDesc):
            assert len(partition_tensor_list) == 1 or not partition_tensor_list
            to_slice_tensor = partition_tensor_list[0][0] if len(
                partition_tensor_list) == 1 else source_tensor
            new_name = unique_name.generate(var_name + "@RESHARD")
            target_tensor = _insert_slice_op(
                block,
                idx,
                to_slice_tensor,
                starts=op_desc.starts,
                ends=op_desc.ends,
                axes=op_desc.axes,
                new_var_name=new_name)

            tensor_attr = TensorDistributedAttribute()
            process_mesh = dist_context.get_op_dist_attr_for_program(
                matched_op).process_mesh
            dims_mapping = dist_context.get_op_dist_attr_for_program(
                matched_op).get_input_dims_mapping(var_name)
            tensor_attr.dims_mapping = dims_mapping
            tensor_attr.process_mesh = process_mesh
            dist_context.set_tensor_dist_attr_for_program(target_tensor,
                                                          tensor_attr)

            # rename op input name according to new name
            for op in block.ops:
                for name in op.input_arg_names:
                    op_dist_attr = dist_context.get_op_dist_attr_for_program(
                        op)
                    if name == var_name and op_dist_attr is not None:
                        op_process_mesh = op_dist_attr.process_mesh
                        op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(
                            var_name)
                        if op_process_mesh == process_mesh and op_input_dims_mapping == dims_mapping:
                            op.desc._rename_input(name, target_tensor.name)
                            op_dist_attr.set_input_dims_mapping(
                                target_tensor.name, dims_mapping)
                            op_dist_attr.set_input_dist_attr(name, None)
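# --- Illustrative sketch (not part of the function above) ------------------
# In the RecvOpDesc branch, the receive buffer's shape is derived from
# op_desc.partition_index, a list of [start, end) pairs per dimension.
# The helper name and the sample index below are made up for illustration.
def shape_from_partition_index(partition_index):
    return [end - start for start, end in partition_index]

# e.g. rows 0..4 and columns 8..16 of a larger tensor
print(shape_from_partition_index([[0, 4], [8, 16]]))  # [4, 8]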
def __call__(self, var, block=None):
    """Initialize the input tensor with dirac initializer.

    Args:
        var(Tensor): Tensor that needs to be initialized.
        block(Block, optional): The block in which initialization ops
            should be added. Used in static graph only, default None.

    Returns:
        The most critical OP(scatter) in this initializer, which contains 7~8 ops in total.
    """
    block = self._check_block(block)
    assert isinstance(var, framework.Parameter)
    assert isinstance(block, framework.Block)
    check_variable_and_dtype(var, "Out",
                             ['float16', 'bfloat16', 'float32', 'float64'],
                             'Dirac')

    assert len(var.shape) in [
        3, 4, 5
    ], "Only Tensor with 3/4/5 dimensions can be initialized by Dirac"
    assert (var.shape[0] % self._groups
            ) == 0, "Tensor 0-dimension must be divisible by groups"

    if var.dtype != VarDesc.VarType.FP32:
        out_var = block.create_var(name=unique_name.generate(".".join(
            ['dirac', var.name, 'tmp'])),
                                   shape=var.shape,
                                   dtype=VarDesc.VarType.FP32,
                                   type=VarDesc.VarType.LOD_TENSOR,
                                   persistable=False)
    else:
        out_var = var

    op = None
    if framework.in_dygraph_mode():
        with fluid.dygraph.no_grad():
            _C_ops.fill_constant(out_var, 'value', float(0), 'force_cpu',
                                 False, 'dtype', out_var.dtype, 'str_value',
                                 str(float(0)), 'shape', out_var.shape)
    else:
        block.append_op(type='fill_constant',
                        inputs={},
                        outputs={'Out': out_var},
                        attrs={
                            'value': float(0),
                            'dtype': out_var.dtype,
                            'shape': out_var.shape,
                        },
                        stop_gradient=True)

    origin_shape = var.shape
    num_per_group = origin_shape[0] // self._groups
    min_shape = min(num_per_group, origin_shape[1])

    idx_list = []
    value_list = []
    strides = []
    prod = 1
    for dim in reversed(origin_shape):
        strides.insert(0, prod)
        prod *= dim
    for i in range(self._groups):
        for j in range(min_shape):
            value_list.append(1.0)
            offset = 0
            for (k, stride) in enumerate(strides):
                if (k == 0):
                    offset += (j + i * num_per_group) * stride
                elif (k == 1):
                    offset += j * stride
                else:
                    offset += origin_shape[k] // 2 * stride
            idx_list.append(offset)

    if framework.in_dygraph_mode():
        with fluid.dygraph.no_grad():
            tmp_out, _ = _C_ops.reshape2(out_var, None, 'shape', [-1])
            tmp_out._share_underline_tensor_to(out_var)
    else:
        x_shape = block.create_var(name=unique_name.generate(".".join(
            [out_var.name, "XShape"])),
                                   dtype=out_var.dtype,
                                   shape=out_var.shape,
                                   type=VarDesc.VarType.LOD_TENSOR,
                                   persistable=False,
                                   stop_gradient=True)
        block.append_op(type="reshape2",
                        inputs={"X": out_var},
                        attrs={'shape': [-1]},
                        outputs={
                            "Out": out_var,
                            "XShape": x_shape
                        },
                        stop_gradient=True)

    index_tensor = block.create_var(
        name=unique_name.generate('scatter_index'),
        persistable=False,
        stop_gradient=True)

    if framework.in_dygraph_mode():
        with fluid.dygraph.no_grad():
            tmp_tensor = framework._varbase_creator()
            _C_ops.assign_value(tmp_tensor, 'shape', [len(idx_list)],
                                'dtype', VarDesc.VarType.INT64,
                                'int64_values', idx_list)
            tmp_tensor._share_underline_tensor_to(index_tensor)
    else:
        block.append_op(type='assign_value',
                        outputs={'Out': index_tensor},
                        attrs={
                            'dtype': VarDesc.VarType.INT64,
                            'shape': [len(idx_list)],
                            'int64_values': idx_list
                        },
                        stop_gradient=True)

    value_tensor = block.create_var(
        name=unique_name.generate('scatter_value'),
        persistable=False,
        stop_gradient=True)

    if framework.in_dygraph_mode():
        with fluid.dygraph.no_grad():
            tmp_tensor = framework._varbase_creator()
            _C_ops.assign_value(tmp_tensor, 'shape', [len(value_list)],
                                'dtype', VarDesc.VarType.FP32,
                                'fp32_values', value_list)
            tmp_tensor._share_underline_tensor_to(value_tensor)
    else:
        block.append_op(type='assign_value',
                        outputs={'Out': value_tensor},
                        attrs={
                            'dtype': VarDesc.VarType.FP32,
                            'shape': [len(value_list)],
                            'fp32_values': value_list
                        },
                        stop_gradient=True)

    if framework.in_dygraph_mode():
        with fluid.dygraph.no_grad():
            tmp_out = _C_ops.final_state_scatter(out_var, index_tensor,
                                                 value_tensor, True)
            tmp_out._share_underline_tensor_to(out_var)
            tmp_reshape_out, _ = _C_ops.reshape2(out_var, None, 'shape',
                                                 origin_shape)
            tmp_reshape_out._share_underline_tensor_to(out_var)
            if var.dtype != VarDesc.VarType.FP32:
                tmp_cast_out = _C_ops.cast(out_var, 'in_dtype',
                                           out_var.dtype, 'out_dtype',
                                           var.dtype)
                tmp_cast_out._share_underline_tensor_to(var)
    else:
        op = block.append_op(type="scatter",
                             inputs={
                                 "X": out_var,
                                 "Ids": index_tensor,
                                 "Updates": value_tensor
                             },
                             attrs={'overwrite': True},
                             outputs={"Out": out_var},
                             stop_gradient=True)
        x_shape = block.create_var(name=unique_name.generate(".".join(
            [out_var.name, "XShape"])),
                                   dtype=out_var.dtype,
                                   shape=out_var.shape,
                                   type=VarDesc.VarType.LOD_TENSOR,
                                   persistable=False,
                                   stop_gradient=True)
        block.append_op(type="reshape2",
                        inputs={"X": out_var},
                        attrs={'shape': origin_shape},
                        outputs={
                            "Out": out_var,
                            "XShape": x_shape
                        },
                        stop_gradient=True)
        if var.dtype != VarDesc.VarType.FP32:
            block.append_op(type="cast",
                            inputs={"X": out_var},
                            outputs={"Out": var},
                            attrs={
                                "in_dtype": out_var.dtype,
                                "out_dtype": var.dtype
                            },
                            stop_gradient=True)

    if not in_dynamic_mode():
        var.op = op
    return op
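# --- Illustrative usage sketch (not part of the initializer above) ---------
# Assuming the public wrapper paddle.nn.initializer.Dirac dispatches to this
# __call__ (as in recent Paddle releases), a conv whose weight is Dirac-initialized
# and whose bias is disabled acts as an identity mapping on its input channels.
import paddle

weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Dirac())
conv = paddle.nn.Conv2D(4, 4, kernel_size=3, padding=1,
                        weight_attr=weight_attr, bias_attr=False)

x = paddle.rand([1, 4, 8, 8])
print(paddle.allclose(conv(x), x))  # True: only the kernel centers hold 1.0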
def __call__(self, var, block=None):
    """Initialize the input tensor with orthogonal initializer.

    Args:
        var(Tensor): Tensor that needs to be initialized.
        block(Block, optional): The block in which initialization ops
            should be added. Used in static graph only, default None.

    Returns:
        The last initialization op, which contains 8 ops in the orthogonal initializer.
    """
    block = self._check_block(block)
    assert isinstance(var, framework.Parameter)
    assert isinstance(block, framework.Block)
    # 'qr' op only support float32/float64 now
    check_variable_and_dtype(var, "Out", ["float32", "float64"],
                             "Orthogonal")

    self._seed = block.program.random_seed

    shape = var.shape
    assert len(
        shape
    ) >= 2, "Only Tensor with 2 or more dimensions can be initialized by Orthogonal"

    row = shape[0]
    col = 1
    for i in shape[1:]:
        col *= i

    flatten_shape = [max(row, col), min(row, col)]

    normal_var = block.create_var(name=unique_name.generate('.'.join(
        ['gaussian_random', 'tmp'])),
                                  dtype=var.dtype,
                                  persistable=False,
                                  stop_gradient=True)
    block.append_op(type='gaussian_random',
                    inputs={},
                    outputs={'Out': normal_var},
                    attrs={
                        'mean': 0.0,
                        'std': 1.0,
                        'shape': flatten_shape,
                        'seed': self._seed,
                        'dtype': var.dtype
                    },
                    stop_gradient=True)

    q = block.create_var(name=unique_name.generate('.'.join(
        ['qr', 'q', 'tmp'])),
                         dtype=normal_var.dtype,
                         persistable=False,
                         stop_gradient=True)
    r = block.create_var(name=unique_name.generate('.'.join(
        ['qr', 'r', 'tmp'])),
                         dtype=normal_var.dtype,
                         persistable=False,
                         stop_gradient=True)
    block.append_op(type='qr',
                    inputs={'X': [normal_var]},
                    outputs={
                        'Q': q,
                        'R': r,
                    },
                    attrs={'mode': 'reduced'},
                    stop_gradient=True)

    r_diag = block.create_var(name=unique_name.generate('.'.join(
        ['diag', 'tmp'])),
                              dtype=r.dtype,
                              persistable=False,
                              stop_gradient=True)
    block.append_op(type='diag_v2',
                    inputs={'X': r},
                    outputs={'Out': r_diag},
                    attrs={
                        'offset': 0,
                        'padding_value': 0
                    },
                    stop_gradient=True)

    r_sign = r_diag
    block.append_op(type='sign',
                    inputs={'X': [r_diag]},
                    outputs={'Out': r_sign},
                    stop_gradient=True)

    block.append_op(type='elementwise_mul',
                    inputs={
                        'X': q,
                        'Y': r_sign
                    },
                    outputs={'Out': q},
                    attrs={},
                    stop_gradient=True)

    x_shape = block.create_var(name=unique_name.generate('.'.join(
        ['transpose', 'shape', 'tmp'])),
                               dtype=q.dtype,
                               persistable=False,
                               stop_gradient=True)
    if row < col:
        q_transpose = block.create_var(name=unique_name.generate('.'.join(
            ['transpose', 'tmp'])),
                                       dtype=q.dtype,
                                       persistable=False,
                                       stop_gradient=True)
        block.append_op(type='transpose2',
                        inputs={'X': q},
                        outputs={
                            'Out': q_transpose,
                            'XShape': x_shape
                        },
                        attrs={'axis': [1, 0]},
                        stop_gradient=True)
        q = q_transpose

    block.append_op(type='reshape2',
                    inputs={'X': q},
                    outputs={
                        'Out': q,
                        "XShape": x_shape
                    },
                    attrs={'shape': var.shape},
                    stop_gradient=True)

    op = block.append_op(type='scale',
                         inputs={'X': q},
                         outputs={'Out': var},
                         attrs={
                             'scale': self._gain,
                             'bias': 0.0
                         })

    return op
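# --- Illustrative usage sketch (not part of the initializer above) ---------
# Assuming the public wrapper paddle.nn.initializer.Orthogonal dispatches to
# this __call__, a Linear weight of shape [4, 10] ends up with orthonormal
# rows (row < col triggers the transpose branch), so W @ W^T is close to I.
import paddle

weight_attr = paddle.ParamAttr(
    initializer=paddle.nn.initializer.Orthogonal(gain=1.0))
linear = paddle.nn.Linear(4, 10, weight_attr=weight_attr)

w = linear.weight  # shape [4, 10]
print(paddle.allclose(paddle.matmul(w, w, transpose_y=True),
                      paddle.eye(4), atol=1e-5))  # rows are orthonormal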