def create_quantize_graph(config_file, graph=None):
    """Quantize a whole graph by rerouting op inputs through quantize ops.

    Forward ops (name without "gradient", type listed in
    ``config.forward_quant_ops``) must have exactly two inputs; the first is
    quantized with the op's "input" config and the second with its "weight"
    config. Backward ops (name containing "gradient", type listed in
    ``config.backward_quant_ops``) get every input quantized with the
    "Gradient"/"gradient" config, except inputs whose name contains "ShapeN".
    Ops already recorded in the "quant_layers" collection are skipped, and
    every op processed here is added to that collection.

    :param config_file: quantization configuration, handed to ``Config``
    :param graph: graph to quantize; when omitted, the default graph at call
        time is used
    :return: the same graph, with quantization ops added
    """
    # Resolve the default lazily. A `graph=tf.get_default_graph()` default is
    # evaluated only once, when the function is defined, so every later call
    # would silently reuse whatever graph was the default at import time.
    if graph is None:
        graph = tf.get_default_graph()

    config = Config(config_file)
    all_ops = graph.get_operations()
    # Already-quantized tensors keyed by source tensor name, to avoid
    # quantizing the same tensor more than once.
    quant_tensors = {}
    quantize_func = quantize_for_train if config.is_training else quantize_for_eval
    # NOTE(review): tf.get_collection / tf.add_to_collection operate on the
    # current default graph, which may differ from `graph` when a caller passes
    # one explicitly - confirm this is intended.
    quant_layers = set(tf.get_collection("quant_layers"))
    tf.logging.info(
        "-------------------------Quantize-------------------------")

    # Walk all ops and quantize the inputs of forward ops.
    for op in all_ops:
        if ("gradient" in op.name
                or op.type not in config.forward_quant_ops
                or op.name in quant_layers):
            continue
        inp1, inp2 = op.inputs._inputs
        tf.logging.info("Forward Quant:%s", op.name)
        # Build the quantize ops in the same control-flow context as the op
        # whose inputs they replace.
        ctxt = util.GetOutputContext(op)
        graph._set_control_flow_context(ctxt)
        quant_inp1 = quantize_func(inp1, config.get_config(op.type, op.name, "input")) \
            if inp1.name not in quant_tensors else quant_tensors[inp1.name]
        quant_inp2 = quantize_func(inp2, config.get_config(op.type, op.name, "weight")) \
            if inp2.name not in quant_tensors else quant_tensors[inp2.name]
        quant_tensors[inp1.name] = quant_inp1
        quant_tensors[inp2.name] = quant_inp2
        tf.contrib.graph_editor.reroute_ts(
            [quant_inp1, quant_inp2], op.inputs._inputs, can_modify=op)
        tf.add_to_collection("quant_layers", op.name)

    # Walk all ops again and quantize the inputs of backward (gradient) ops.
    # NOTE(review): the control-flow context is not updated in this loop, so
    # these quantize ops are created in whatever context the last forward op
    # left behind - confirm this is intended.
    for op in all_ops:
        if ("gradient" not in op.name
                or op.type not in config.backward_quant_ops
                or op.name in quant_layers):
            continue
        tf.logging.info("Backward Quant:%s", op.name)
        new_inputs = []
        for inp in op.inputs._inputs:
            if "ShapeN" not in inp.name:
                quant_inp = quantize_func(inp, config.get_config("Gradient", op.name, "gradient")) \
                    if inp.name not in quant_tensors else quant_tensors[inp.name]
                quant_tensors[inp.name] = quant_inp
                new_inputs.append(quant_inp)
            else:
                # Inputs named "...ShapeN..." are passed through unquantized.
                new_inputs.append(inp)
        tf.contrib.graph_editor.reroute_ts(
            new_inputs, op.inputs._inputs, can_modify=op)
        tf.add_to_collection("quant_layers", op.name)

    # Leave the graph without an active control-flow context.
    graph._set_control_flow_context(None)
    return graph
def _MergeGrad(op, grad, _):
  """Compute the gradient of a Merge op by emitting Switch op(s).

  The context of the op feeding the Merge's first input selects the strategy:

  * WhileContext: switch `grad` on the pivot of the current gradient context.
  * CondContext: switch `grad` on the cond predicate. If the current gradient
    context carries a `grad_state` (the cond lives inside a loop), the
    predicate is replaced by its per-iteration value accumulated in the
    forward pass.
  * anything else: emit one Switch per Merge input, predicated on
    `op.outputs[1] == i`, and return output 1 of each Switch.
  """
  # pylint: disable=protected-access
  forward_ctxt = control_flow_util.GetOutputContext(op.inputs[0].op)
  backprop_ctxt = ops.get_default_graph()._get_control_flow_context()

  if isinstance(forward_ctxt, WhileContext):
    return control_flow_ops._SwitchRefOrTensor(grad, backprop_ctxt.pivot)

  if not isinstance(forward_ctxt, CondContext):
    input_count = len(op.inputs)
    # Build every comparison before any Switch so ops are created in the same
    # order as before.
    selectors = [
        math_ops.equal(op.outputs[1], idx) for idx in xrange(input_count)
    ]
    return [
        control_flow_ops._SwitchRefOrTensor(grad, selector)[1]
        for selector in selectors
    ]

  switch_pred = forward_ctxt.pred
  if backprop_ctxt and backprop_ctxt.grad_state:
    # The Merge belongs to a cond nested in a loop: backprop needs the
    # predicate value of every iteration, so the forward pass accumulates the
    # values and the backprop Switch is driven by the accumulated ones.
    loop_state = backprop_ctxt.grad_state
    accumulated_pred = loop_state.history_map.get(switch_pred.name)
    if accumulated_pred is None:
      # First use of this predicate: record its value at every iteration.
      loop_grad_ctxt = loop_state.grad_context
      loop_grad_ctxt.Exit()
      pred_history = loop_state.AddForwardAccumulator(switch_pred)
      loop_grad_ctxt.Enter()
      # Add the stack pop op. If the predicate's op is in an (outer)
      # CondContext, the stack pop will be guarded with a switch.
      accumulated_pred = loop_state.AddBackpropAccumulatedValue(
          pred_history, switch_pred)
      loop_state.history_map[switch_pred.name] = accumulated_pred
    switch_pred = accumulated_pred
  return control_flow_ops._SwitchRefOrTensor(
      grad, switch_pred, name="cond_grad")
def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
  """Gradient for concat op.

  Args:
    op: An operation.
    grad: `Tensor` or `IndexedSlices` representing the gradients with respect
      to each output of the op.
    start_value_index: An integer index of the first value in the op.inputs.
    end_value_index: An integer index one past the last value in the
      op.inputs (it is used as the exclusive end of a slice).
    dim_index: An integer index of concat_dim or axis parameter in op.inputs.

  Returns:
    A list with one entry per input of the op: the partial gradients with
    respect to each value input, and `None` in the position of the
    concat_dim/axis input.

  Raises:
    ValueError: if concat_dim/axis is not statically known (IndexedSlices
      gradients only).
    TypeError: if `grad` is neither a `Tensor` nor an `IndexedSlices`.
  """

  def _CreateDenseMaskAndBegin(sizes, concat_dim):
    """Create variables for iteratively slicing a dense gradients tensor."""
    # Since shape is 1-D, shape_of_shape = [rank-of-inputs]
    shape_of_shape = array_ops.shape(sizes[0])
    # Make a vector of length equal to the input's dimensions,
    # with 0's everywhere and 1 in the concat dim position.
    # Note: Can't use sparse_to_dense since it isn't GPU-capable (for now)
    mask = array_ops.concat([
        array_ops.fill(array_ops.expand_dims(concat_dim, 0), 0), [1],
        array_ops.fill(shape_of_shape - concat_dim - 1, 0)
    ], 0)
    begin = array_ops.fill(shape_of_shape, 0)
    return mask, begin

  def _ExtractInputShapes(inputs):
    """Extract the shapes of a set of input tensors."""
    if context.executing_eagerly():
      return array_ops.shape_n(inputs)
    sizes = []
    fully_known = True
    for x in inputs:
      input_shape = array_ops.shape(x)
      if not isinstance(input_shape,
                        ops.Tensor) or input_shape.op.type != "Const":
        fully_known = False
        break
      sizes.append(input_shape)

    # Use the per-input shapes only if every one of them was a constant;
    # otherwise fall back to a single shape_n call.
    if fully_known:
      return sizes
    else:
      return array_ops.shape_n(inputs)

  # Degenerate concatenation (one value plus the dim input), just return grad.
  # The result must be a list with one entry per op input. The previous form,
  # `grad + [None]`, applied `+` between the gradient and a Python list
  # instead of building that list.
  if len(op.inputs) == 2:
    return [grad, None] if end_value_index <= dim_index else [None, grad]

  concat_dim = op.inputs[dim_index]
  input_values = op.inputs[start_value_index:end_value_index]

  out_grads = []
  if isinstance(grad, ops.Tensor):
    if context.executing_eagerly():
      # Using mod here for convenience since concat_dim is already verified
      # in concat implementation to be within the allowed [-rank, rank) range.
      non_neg_concat_dim = (
          concat_dim._numpy().item(0) % input_values[0]._rank())  # pylint: disable=protected-access
      # All inputs are guaranteed to be EagerTensors in eager mode
      sizes = pywrap_tensorflow.TFE_Py_TensorShapeSlice(
          input_values, non_neg_concat_dim)
      out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
    else:
      if constant_op.is_constant(concat_dim):
        # If concat_dim is a constant defined in a different context,
        # then we duplicate it in the current context to avoid passing it
        # through an Enter node.
        # This is a small optimization in general, but it is required when
        # compiling with XLA, as XLA needs the concat input to be folded into a
        # constant.
        grad_context = control_flow_util.GetOutputContext(grad.op)
        dim_context = control_flow_util.GetOutputContext(concat_dim.op)
        if dim_context != grad_context:
          value = tensor_util.constant_value(concat_dim)
          concat_dim = constant_op.constant(value=value, dtype=concat_dim.dtype)

      # Using mod here for convenience since concat_dim is already verified
      # in concat implementation to be within the allowed [-rank, rank) range.
      non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])

      # Get the inputs' tensor shapes
      sizes = _ExtractInputShapes(input_values)
      # The magic number of 16 was found through benchmarking a range of sizes
      # on CPUs and a Maxwell TitanX.  A speedup was seen in a large majority of
      # cases when switching implementations at N=16, but it is possible that
      # there will be a small number of performance regressions.
      if len(sizes) > 16:
        # extract the size of each input along the concat dimension
        sizes = array_ops.squeeze(
            array_ops.slice(array_ops.stack(sizes, axis=1),
                            [non_neg_concat_dim, 0], [1, -1]))
        out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
      else:
        offset = gen_array_ops.concat_offset(non_neg_concat_dim, sizes)
        for (begin, size) in zip(offset, sizes):
          out_grads.append(array_ops.slice(grad, begin, size))
  elif isinstance(grad, ops.IndexedSlices):
    # Using mod here for convenience since concat_dim is already verified
    # in concat implementation to be within the allowed [-rank, rank) range.
    non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])
    concat_dim_static = tensor_util.constant_value(concat_dim)
    if concat_dim_static is None:
      raise ValueError("Can only compute IndexedSlices gradient with "
                       "statically-known concat_dim")
    if concat_dim_static < 0:
      rank = tensor_util.constant_value(array_ops.rank(input_values[0]))
      if rank is None:
        raise ValueError(
            "Can only compute IndexedSlices gradient with "
            "negative concat_dim when first value rank is "
            "statically-known.")
      concat_dim_static %= rank
    # Get the inputs' tensor shapes
    sizes = [array_ops.shape(x) for x in input_values]
    if concat_dim_static > 0:
      # IndexedSlices, non_neg_concat_dim > 0. Each input gets IndexedSlices
      # gradients with all the indices, but with grad.values sliced accordingly.
      # This is like the Tensor case, except shape(grad.values)[0] is not equal
      # to shape(sizes[i])[0], since only a subset of the dim-0 values are
      # stored.
      mask, begin = _CreateDenseMaskAndBegin(sizes, non_neg_concat_dim)
      for size in sizes:
        new_values = array_ops.slice(
            grad.values, begin,
            array_ops.concat(
                [[-1], array_ops.slice(size, [1], [-1])], 0))
        out_grads.append(
            ops.IndexedSlices(new_values, grad.indices, size))
        # Lint complains begin = begin + ...
        begin = math_ops.add(begin, size * mask)
    else:
      # IndexedSlices, concat_dim == 0. Each input gets IndexedSlices gradients
      # only for the relevant indices.
      start = constant_op.constant(0, dtype=grad.indices.dtype)
      for size in sizes:
        size_concat_dim = array_ops.gather(size, non_neg_concat_dim)
        if size_concat_dim.dtype != grad.indices.dtype:
          size_concat_dim = math_ops.cast(size_concat_dim,
                                          dtype=grad.indices.dtype)
        end = start + size_concat_dim
        # Compute the 1-D Tensor of indices relevant for this input.
        indices_to_select = array_ops.squeeze(array_ops.where(
            math_ops.logical_and(grad.indices >= start,
                                 grad.indices < end)), axis=[1])
        new_indices = array_ops.gather(grad.indices, indices_to_select) - start
        new_values = array_ops.gather(grad.values, indices_to_select)
        out_grads.append(
            ops.IndexedSlices(new_values, new_indices, size))
        start = end
  else:
    raise TypeError("Expected Tensor or IndexedSlices, got %s" % type(grad))

  # Put the `None` for the concat_dim/axis input on whichever side of the
  # value gradients that input sits.
  return (out_grads + [None] if end_value_index <= dim_index
          else [None] + out_grads)
def AddForwardAccumulator(self, value, dead_branch=False): """Add an accumulator for each forward tensor that is needed in backprop. This is added to the forward loop at the first time when a tensor in the forward loop is used by backprop gradient computation loop. We create an accumulator that accumulates the value of tensor at each iteration. Called in the control flow context where gradients() is called. The pseudocode is: ``` acc = stack(); while (_pivot) { acc = stack_push(acc, value); } ``` We make sure that the stack push op in one iteration is executed before next iteration. This is achieved by adding a control edge from `forward_index.op.inputs[0].op` to the push op, and another control edge from the push op to either `forward_index.op` or `forward_sync`. Args: value: The source tensor in forward that is to be accumulated. dead_branch: True iff the tensor is on a dead branch of a cond. Returns: The stack that contains the accumulated history of the tensor. Raises: TypeError: For internal errors involving the value condition context. ValueError: If `value` is inside a XLA scope and a valid max size for the stack can't be found. """ # curr_ctxt is the context that tf.gradients was called in. with self._forward_index.graph.as_default(): curr_ctxt = ops.get_default_graph()._get_control_flow_context() # pylint: disable=protected-access with ops.control_dependencies(None): if curr_ctxt: curr_ctxt.Enter() with ops.colocate_with(value): # We only need to pass maximum_iterations to the stack if # we're inside an XLA context. if not util.IsInXLAContext(value.op): max_size = constant_op.constant(-1, dtypes.int32) else: max_size = _GetMaxSizeFromNestedMaximumIterations( value, self.forward_context) acc = gen_data_flow_ops.stack_v2( max_size=max_size, elem_type=value.dtype.base_dtype, name="f_acc") if curr_ctxt: curr_ctxt.Exit() # Make acc available in the forward context. 
enter_acc = self.forward_context.AddValue(acc) # Add the stack_push op in the context of value.op. swap_enabled = self.forward_context.swap_memory value_ctxt = util.GetOutputContext(value.op) if value_ctxt == self.forward_context: # value is not nested in the forward context. self.forward_context.Enter() push = gen_data_flow_ops.stack_push_v2( enter_acc, value, swap_memory=swap_enabled) self.forward_context.Exit() # Protect stack push and order it before forward_index. self.forward_index.op._add_control_input(push.op) else: # value is in a cond context within the forward context. if not isinstance(value_ctxt, control_flow_ops.CondContext): raise TypeError("value_ctxt is not a CondContext: %s" % value_ctxt) if dead_branch: # The special case for creating a zero tensor for a dead # branch of a switch. See _ControlFlowState.ZerosLike(). value_ctxt.outer_context.Enter() push = gen_data_flow_ops.stack_push_v2( enter_acc, value, swap_memory=swap_enabled) value_ctxt.outer_context.Exit() push.op._set_control_flow_context(value_ctxt) else: value_ctxt.Enter() push = gen_data_flow_ops.stack_push_v2( enter_acc, value, swap_memory=swap_enabled) value_ctxt.Exit() # Protect stack push and order it before forward_sync. self.forward_sync._add_control_input(push.op) # Order stack push after the successor of forward_index add_op = self.forward_index.op.inputs[0].op push.op._add_control_input(add_op) return acc