def register(self, candidate, name=None):
  """Registers a Python object "candidate" for the given "name".

  Args:
    candidate: The candidate object to add to the registry.
    name: An optional string specifying the registry key for the candidate.
      If None, candidate.__name__ will be used.
  Raises:
    KeyError: If the same name is registered twice.
  """
  if not name:
    name = candidate.__name__
  if name in self._registry:
    (filename, line_number, function_name, _) = (
        self._registry[name][_LOCATION_TAG])
    raise KeyError("Registering two %s with name '%s' !"
                   "(Previous registration was in %s %s:%d)" %
                   (self._name, name, function_name, filename, line_number))
  logging.vlog(1, "Registering %s (%s) in %s.", name, candidate, self._name)
  # Stack trace is [this_function, Register(), user_function, ...],
  # so the user function is at index 2.
  stack = traceback.extract_stack()
  self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: stack[2]}
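# A minimal, self-contained sketch of how a registry like the one above is
# typically used. The `SimpleRegistry` class and the gradient-function example
# below are illustrative stand-ins, not the original implementation (which also
# records the registration location via traceback).
class SimpleRegistry(object):
  """Maps string names to Python objects, rejecting duplicate names."""

  def __init__(self, name):
    self._name = name
    self._registry = {}

  def register(self, candidate, name=None):
    name = name or candidate.__name__
    if name in self._registry:
      raise KeyError("Registering two %s with name '%s'!" % (self._name, name))
    self._registry[name] = candidate
    return candidate

  def lookup(self, name):
    return self._registry[name]


# Example: registering a gradient-like function under an explicit name.
_grad_registry = SimpleRegistry("gradient functions")
_grad_registry.register(lambda op, grad: 2 * grad, name="Square")
assert _grad_registry.lookup("Square") is not None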
def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx):
  """Computes the theoretical Jacobian for dy/dx.

  Computes the theoretical Jacobian using the ops generated by
  compute_gradient().

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    dy: the tensor "dy".
    dy_shape: the dimensions of dy as a tuple or an array of ints.
    dx: Tensor or IndexedSlices representing dx.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size"
    rows and "dy_size" columns where "x_size" is the number of elements in x
    and "dy_size" is the number of elements in dy.
  """
  # Complex vectors are treated as vectors of twice as many reals.
  if x.dtype.is_complex:
    x_shape = tuple(x_shape) + (2,)
  dy_factor = 2 if dy.dtype.is_complex else 1

  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients.
  dy_size = _product(dy_shape) * dy_factor

  jacobian = np.zeros((x_size, dy_size),
                      dtype=x.dtype.real_dtype.as_numpy_dtype)
  # For each entry of dy, we set it to 1 and everything else to 0 and compute
  # the backprop -- this will give us one column of the Jacobian matrix.
  dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype)
  dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype)
  sess = ops.get_default_session()
  for col in range(dy_size):
    dy_data_flat[col] = 1
    if isinstance(dx, ops.IndexedSlices):
      backprop_indices, backprop_values = sess.run(
          [dx.indices, dx.values], feed_dict={x: x_data, dy: dy_data})
      for i, v in zip(backprop_indices, backprop_values):
        r_begin = i * x_val_size
        r_end = r_begin + x_val_size
        jacobian[r_begin:r_end, col] += v.flat
    else:
      assert isinstance(dx, ops.Tensor), "dx = " + str(dx)
      backprop = sess.run(dx, feed_dict={x: x_data, dy: dy_data})
      jacobian[:, col] = backprop.ravel().view(jacobian.dtype)
    dy_data_flat[col] = 0

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)

  return jacobian
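# A small, self-contained numpy sketch of the column-by-column idea used in
# _compute_theoretical_jacobian above: seed dy with a one-hot vector and
# "backprop" it to obtain one column of the (x_size x dy_size) Jacobian.
# Here the op is the linear map y = W.dot(x), whose backprop for a given dy is
# W.T.dot(dy); all names below are illustrative, not TensorFlow APIs.
import numpy as np

W = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])          # y has 2 elements, x has 3.
x_size, y_size = W.shape[1], W.shape[0]

jacobian = np.zeros((x_size, y_size))
for col in range(y_size):
  dy = np.zeros(y_size)
  dy[col] = 1.0                          # One-hot seed for output `col`.
  jacobian[:, col] = W.T.dot(dy)         # Backprop gives dy[col]/dx for all x.

# For a linear map, the Jacobian in this row/column convention is just W.T.
assert np.allclose(jacobian, W.T)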
def _close_on_stop(self, sess, cancel_op, coord):
  """Close the queue when the Coordinator requests stop.

  Args:
    sess: A Session.
    cancel_op: The Operation to run.
    coord: Coordinator.
  """
  coord.wait_for_stop()
  try:
    sess.run(cancel_op)
  except Exception as e:
    # Intentionally ignore errors from cancel_op.
    logging.vlog(1, "Ignored exception: %s", str(e))
def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size"
    rows and "y_size" columns where "x_size" is the number of elements in x
    and "y_size" is the number of elements in y.
  """
  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
  y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
  x_dtype = x.dtype.real_dtype.as_numpy_dtype
  y_dtype = y.dtype.real_dtype.as_numpy_dtype

  # Make sure we have the right types.
  x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
  scale = np.asarray(1 / (2 * delta), dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
  # For each entry of x, we slightly perturb it by adding and subtracting a
  # delta and then compute the difference between the outputs. This will give
  # us one row of the Jacobian matrix.
  for row in range(x_size):
    x_pos = x_data.copy()
    x_pos.ravel().view(x_dtype)[row] += delta
    y_pos = y.eval(feed_dict={x: x_pos})
    x_neg = x_data.copy()
    x_neg.ravel().view(x_dtype)[row] -= delta
    y_neg = y.eval(feed_dict={x: x_neg})
    diff = scale * (y_pos - y_neg)
    jacobian[row, :] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian
def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size"
    rows and "y_size" columns where "x_size" is the number of elements in x
    and "y_size" is the number of elements in y.
  """
  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
  y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
  x_dtype = x.dtype.real_dtype.as_numpy_dtype
  y_dtype = y.dtype.real_dtype.as_numpy_dtype

  # Make sure we have the right types.
  x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
  scale = np.asarray(2 * delta, dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
  # For each entry of x, we slightly perturb it by adding and subtracting a
  # delta and then compute the difference between the outputs. This will give
  # us one row of the Jacobian matrix.
  for row in range(x_size):
    x_pos = x_data.copy()
    x_neg = x_data.copy()
    x_pos.ravel().view(x_dtype)[row] += delta
    y_pos = y.eval(feed_dict={x: x_pos})
    x_neg.ravel().view(x_dtype)[row] -= delta
    y_neg = y.eval(feed_dict={x: x_neg})
    diff = (y_pos - y_neg) / scale
    jacobian[row, :] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian
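# A self-contained numpy sketch of the central-difference scheme used by the
# two _compute_numeric_jacobian variants above (they differ only in whether the
# 1/(2*delta) factor is applied as a multiplication or a division). The helper
# name `numeric_jacobian` and the test function `f` are illustrative.
import numpy as np

def numeric_jacobian(f, x, delta=1e-5):
  """Approximates J[i, j] = d f(x)[j] / d x[i] by central differences."""
  y = f(x)
  jacobian = np.zeros((x.size, y.size))
  for row in range(x.size):
    x_pos, x_neg = x.copy(), x.copy()
    x_pos.flat[row] += delta
    x_neg.flat[row] -= delta
    jacobian[row, :] = (f(x_pos) - f(x_neg)).ravel() / (2 * delta)
  return jacobian

W = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
x0 = np.array([0.5, -1.0, 2.0])
# For the linear map x -> W.dot(x), the numeric Jacobian should match W.T,
# which is also what the theoretical (backprop-based) Jacobian gives.
assert np.allclose(numeric_jacobian(lambda x: W.dot(x), x0), W.T, atol=1e-6)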
def _run(self, sess, enqueue_op, coord=None):
  """Execute the enqueue op in a loop, close the queue in case of error.

  Args:
    sess: A Session.
    enqueue_op: The Operation to run.
    coord: Optional Coordinator object for reporting errors and checking
      for stop conditions.
  """
  decremented = False
  try:
    while True:
      if coord and coord.should_stop():
        break
      try:
        sess.run(enqueue_op)
      except errors.OutOfRangeError:
        # This exception indicates that a queue was closed.
        with self._lock:
          self._runs -= 1
          decremented = True
          if self._runs == 0:
            try:
              sess.run(self._close_op)
            except Exception as e:
              # Intentionally ignore errors from close_op.
              logging.vlog(1, "Ignored exception: %s", str(e))
          return
  except Exception as e:
    # This catches all other exceptions.
    if coord:
      coord.request_stop(e)
    else:
      logging.error("Exception in QueueRunner: %s", str(e))
      with self._lock:
        self._exceptions_raised.append(e)
      raise
  finally:
    # Make sure we account for all terminations: normal or errors.
    if not decremented:
      with self._lock:
        self._runs -= 1
def _ComputeNumericJacobian(x, x_shape, x_data, y, y_shape, delta):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size"
    rows and "y_size" columns where "x_size" is the number of elements in x
    and "y_size" is the number of elements in y.
  """
  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _Product(x_shape)
  y_size = _Product(y_shape)

  jacobian = np.zeros((x_size, y_size), dtype=x_data.dtype)
  # For each entry of x, we slightly perturb it by adding and subtracting a
  # delta and then compute the difference between the outputs. This will give
  # us one row of the Jacobian matrix.
  for row in range(0, x_size):
    x_pos = x_data.copy()
    x_pos.flat[row] += delta
    y_pos = y.eval(feed_dict={x: x_pos})
    x_neg = x_data.copy()
    x_neg.flat[row] -= delta
    y_neg = y.eval(feed_dict={x: x_neg})
    diff = (y_pos - y_neg) / (2 * delta)
    jacobian[row, :] = diff.reshape(y_size)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian
def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
  """Get the aggregated gradients for op.

  Args:
    grads: The map of memoized gradients.
    op: The op to get gradients for.
    loop_state: An object for maintaining the state of the while loops in the
      graph. It is of type ControlFlowState. None if the graph contains no
      while loops.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of gradients, one per output of `op`. If the gradient for a
    particular output is a list, this function aggregates it before returning.

  Raises:
    TypeError: if the incoming grads are not Tensors or IndexedSlices.
    ValueError: if the arguments are invalid.
  """
  if aggregation_method is None:
    aggregation_method = AggregationMethod.DEFAULT
  if aggregation_method not in [
      AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
      AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  ]:
    raise ValueError("Invalid aggregation_method specified %s." %
                     aggregation_method)
  out_grads = _GetGrads(grads, op)
  for i, out_grad in enumerate(out_grads):
    if loop_state:
      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
        assert control_flow_ops.IsLoopSwitch(op)
        continue
    # Grads have to be Tensors or IndexedSlices.
    if not all([
        isinstance(g, (ops.Tensor, ops.IndexedSlices)) for g in out_grad if g
    ]):
      raise TypeError("gradients have to be either all Tensors "
                      "or all IndexedSlices")
    # Aggregate multiple gradients, and convert [] to None.
    if out_grad:
      if all([isinstance(g, ops.Tensor) for g in out_grad if g]):
        tensor_shape = _AccumulatorShape(out_grad)
        if len(out_grad) < 2:
          used = "nop"
          out_grads[i] = out_grad[0]
        elif (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
              and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
          # The benefit of using AccumulateN is that its inputs can be combined
          # in any order and this can allow the expression to be evaluated with
          # a smaller memory footprint.  When used with gpu_allocator_retry,
          # it is possible to compute a sum of terms which are much larger than
          # total GPU memory.
          # AccumulateN can currently only be used if we know the shape for
          # an accumulator variable.  If this is not known, or if we only have
          # 2 grads then we fall through to the "tree" case below.
          used = "accumulate_n"
          out_grads[i] = math_ops.accumulate_n(out_grad)
        elif aggregation_method in [
            AggregationMethod.EXPERIMENTAL_TREE,
            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
        ]:
          # Aggregate all gradients by doing pairwise sums: this may
          # reduce performance, but it can improve memory because the
          # gradients can be released earlier.
          #
          # TODO(vrv): Consider replacing this with a version of
          # tf.AddN() that eagerly frees its inputs as soon as they are
          # ready, so the order of this tree does not become a problem.
          used = "tree"
          with ops.name_scope(op.name + "_gradient_sum"):
            running_sum = out_grad[0]
            for grad in out_grad[1:]:
              running_sum = math_ops.add_n([running_sum, grad])
            out_grads[i] = running_sum
        else:
          used = "add_n"
          out_grads[i] = math_ops.add_n(out_grad)
        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                     tensor_shape, used)
      else:
        out_grad = math_ops._as_indexed_slices_list([g for g in out_grad if g])
        out_grad = [_HandleNestedIndexedSlices(x) for x in out_grad]
        # Form IndexedSlices out of the concatenated values and indices.
        out_grads[i] = ops.IndexedSlices(
            array_ops.concat(0, [x.values for x in out_grad]),
            array_ops.concat(0, [x.indices for x in out_grad]),
            out_grad[0].dense_shape)
    else:
      out_grads[i] = []
  return out_grads
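# A small numpy sketch of the dense aggregation strategies chosen in
# _AggregatedGrads above: summing all gradient terms in one op ("add_n") versus
# a pairwise running sum (the "tree" branch), which trades a single large op
# for a chain of small ones so intermediate inputs can be released earlier.
# Purely illustrative; numpy stands in for the TensorFlow ops.
import numpy as np

grad_terms = [np.full((2, 2), float(i)) for i in range(1, 5)]

# "add_n"-style: one sum over all terms at once.
add_n_result = np.sum(grad_terms, axis=0)

# "tree"-style: pairwise running sum, as in the EXPERIMENTAL_TREE branch.
running_sum = grad_terms[0]
for grad in grad_terms[1:]:
  running_sum = running_sum + grad

# Both strategies produce the same aggregated gradient.
assert np.allclose(add_n_result, running_sum)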
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys` is a list
  of `Tensor`, holding the gradients received by the `ys`. The list must be
  the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial derivatives of
  `ys` with respect to `xs`.  It returns a list of `Tensor` of length
  `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds the
  initial gradients for each y in `ys`.  When `grad_ys` is None, we fill in a
  tensor of '1's of the shape of y for each y in `ys`.  A user can provide
  their own initial `grad_ys` to compute the derivatives using a different
  initial gradient for each y (e.g., if one wanted to weight the gradient
  differently for each value in each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys`
      and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with the
      corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned for an
      operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not have a
      registered gradient function.
    ValueError: if the arguments are invalid.
  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)
  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops,
                                              from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)

    if loop_state:
      # The "unused" exits of the loops are added to ys. As an example,
      # people often write:
      #   v1, _ = While(p, b, [x1, x2])
      #   result = gradients(v1, x1)
      # The exit node of x2 is not included by the betweenness analysis.
      # But we need it if x2 is involved in computing v1. So we add it
      # back in backprop with a zeros_like gradient.
      loop_exits = loop_state.GetAllLoopExits()
      for y in loop_exits:
        if pending_count[y.op._id] == 0 and y.op._id not in to_ops_set:
          if _IsFloat(y):
            # Floating-point outputs get a zero gradient.
            _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # Generate gradient subgraph for op.
      op = queue.popleft()
      with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
        if loop_state:
          loop_state.EnterGradWhileContext(op)
        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)

        grad_fn = None
        # pylint: disable=protected-access
        is_func_call = ops.get_default_graph()._is_function(op.type)
        if not is_func_call and any(out_grads) and op._id not in stop_ops:
          # pylint: enable=protected-access
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))

        if (grad_fn or is_func_call) and any(out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if not out_grad and _IsFloat(op.outputs[i]):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = array_ops.zeros_like(op.outputs[i])
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              wrapped_op = op
              if loop_state:
                wrapped_op = loop_state.MakeWrapper(op)
              if is_func_call:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                f_in = [x for x in op.inputs] + out_grads
                f_types = [x.dtype for x in op.inputs]
                # pylint: disable=protected-access
                in_grads = _AsList(
                    functional_ops._symbolic_gradient(f_in, f_types, op.type))
                # pylint: enable=protected-access
              else:
                in_grads = _AsList(grad_fn(wrapped_op, *out_grads))
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(tuple(filter(None, in_grads))) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
          logging.vlog(1, "Gradient for '" + op.name + "'")
          logging.vlog(1, "  in  --> %s",
                       ", ".join([x.name for x in out_grads if x]))
          logging.vlog(1, "  out --> %s",
                       ", ".join([x.name for x in in_grads if x]))
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad:
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op)

      # Update pending count for the inputs of op.
      # pylint: disable=protected-access
      for x in op.inputs:
        pending_count[x.op._id] -= 1
        ready = (pending_count[x.op._id] == 0)
        if loop_state and not ready:
          ready = (pending_count[x.op._id] > 0 and
                   control_flow_ops.IsLoopSwitch(x.op))
        if ready:
          queue.append(x.op)
      for x in op.control_inputs:
        pending_count[x._id] -= 1
        if pending_count[x._id] == 0:
          queue.append(x)
      # pylint: enable=protected-access
  return [_GetGrad(grads, x) for x in xs]
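# A self-contained sketch of the traversal that gradients() performs above:
# visit ops in reverse topological order using a pending count of unvisited
# consumers, accumulating each node's output gradient before calling its
# "gradient function". The tiny graph, the chain-rule callbacks, and all names
# here are hypothetical stand-ins for the TensorFlow machinery.
import collections

# Tiny graph: x -> a = 2*x -> y = a + x   (so dy/dx = 3).
consumers = {"x": ["a", "y"], "a": ["y"], "y": []}
inputs = {"y": ["a", "x"], "a": ["x"], "x": []}
# Gradient of each op w.r.t. each of its inputs, given its output gradient.
grad_fns = {
    "y": lambda g: {"a": g, "x": g},      # y = a + x
    "a": lambda g: {"x": 2.0 * g},        # a = 2 * x
}

pending = {op: len(consumers[op]) for op in consumers}
grads = collections.defaultdict(float)
grads["y"] = 1.0                          # Initial grad_ys, analogous to ones.

queue = collections.deque(op for op in pending if pending[op] == 0)  # ["y"]
while queue:
  op = queue.popleft()
  if op in grad_fns:
    for t_in, g in grad_fns[op](grads[op]).items():
      grads[t_in] += g                    # Aggregate gradients per input.
  for t_in in inputs[op]:
    pending[t_in] -= 1                    # Enqueue ops whose consumers are done.
    if pending[t_in] == 0:
      queue.append(t_in)

assert grads["x"] == 3.0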
def _AggregatedGrads(grads, op, has_control_flow, aggregation_method=None):
  """Get the aggregated gradients for op.

  Args:
    grads: The map of memoized gradients.
    op: The op to get gradients for.
    has_control_flow: True iff the graph contains control flow ops.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of gradients, one per output of `op`. If the gradient for a
    particular output is a list, this function aggregates it before returning.

  Raises:
    TypeError: if the incoming grads are not Tensors or IndexedSlices.
    ValueError: if the arguments are invalid.
  """
  if aggregation_method is None:
    aggregation_method = AggregationMethod.DEFAULT
  if aggregation_method not in [AggregationMethod.ADD_N,
                                AggregationMethod.EXPERIMENTAL_TREE,
                                AggregationMethod.EXPERIMENTAL_ACCUMULATE_N]:
    raise ValueError("Invalid aggregation_method specified.")
  out_grads = _GetGrads(grads, op)
  for i, out_grad in enumerate(out_grads):
    if has_control_flow:
      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
        assert op.type == "Switch"
        continue
    # Grads have to be Tensors or IndexedSlices.
    if not all([isinstance(g, (ops.Tensor, ops.IndexedSlices))
                for g in out_grad if g]):
      raise TypeError("gradients have to be either all Tensors "
                      "or all IndexedSlices")
    # Aggregate multiple gradients, and convert [] to None.
    if out_grad:
      if all([isinstance(g, ops.Tensor) for g in out_grad if g]):
        tensor_shape = _AccumulatorShape(out_grad)
        if len(out_grad) < 2:
          used = "nop"
          out_grads[i] = out_grad[0]
        elif (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
              and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
          # The benefit of using AccumulateN is that its inputs can be combined
          # in any order and this can allow the expression to be evaluated with
          # a smaller memory footprint.  When used with gpu_allocator_retry,
          # it is possible to compute a sum of terms which are much larger than
          # total GPU memory.
          # AccumulateN can currently only be used if we know the shape for
          # an accumulator variable.  If this is not known, or if we only have
          # 2 grads then we fall through to the "tree" case below.
          used = "accumulate_n"
          out_grads[i] = math_ops.accumulate_n(out_grad)
        elif aggregation_method in [AggregationMethod.EXPERIMENTAL_TREE,
                                    AggregationMethod.EXPERIMENTAL_ACCUMULATE_N]:
          # Aggregate all gradients by doing pairwise sums: this may
          # reduce performance, but it can improve memory because the
          # gradients can be released earlier.
          #
          # TODO(vrv): Consider replacing this with a version of
          # tf.AddN() that eagerly frees its inputs as soon as they are
          # ready, so the order of this tree does not become a problem.
          used = "tree"
          with ops.name_scope(op.name + "_gradient_sum"):
            running_sum = out_grad[0]
            for grad in out_grad[1:]:
              running_sum = math_ops.add_n([running_sum, grad])
            out_grads[i] = running_sum
        else:
          used = "add_n"
          out_grads[i] = math_ops.add_n(out_grad)
        logging.vlog(2, "  _AggregatedGrads %d x %s using %s",
                     len(out_grad), tensor_shape, used)
      else:
        out_grad = math_ops._as_indexed_slices_list([g for g in out_grad if g])
        out_grad = [_HandleNestedIndexedSlices(x) for x in out_grad]
        # Form IndexedSlices out of the concatenated values and indices.
        out_grads[i] = ops.IndexedSlices(
            array_ops.concat(0, [x.values for x in out_grad]),
            array_ops.concat(0, [x.indices for x in out_grad]),
            out_grad[0].dense_shape)
    else:
      out_grads[i] = []
  return out_grads
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys` is a list
  of `Tensor`, holding the gradients received by the `ys`. The list must be
  the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial derivatives of
  `ys` with respect to `xs`.  It returns a list of `Tensor` of length
  `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds the
  initial gradients for each y in `ys`.  When `grad_ys` is None, we fill in a
  tensor of '1's of the shape of y for each y in `ys`.  A user can provide
  their own initial `grad_ys` to compute the derivatives using a different
  initial gradient for each y (e.g., if one wanted to weight the gradient
  differently for each value in each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys`
      and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with the
      corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned for an
      operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not have a
      registered gradient function.
    ValueError: if the arguments are invalid.
  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)
  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, has_control_flow = _PendingCount(ops.get_default_graph(),
                                                    to_ops, from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)

    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # Generate gradient subgraph for op.
      op = queue.popleft()
      with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
        if has_control_flow:
          control_flow_ops.EnterGradWhileContext(op)
        out_grads = _AggregatedGrads(grads, op, has_control_flow,
                                     aggregation_method)
        grad_fn = None
        if any(out_grads) and op._id not in stop_ops:
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))
        if grad_fn and any(out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not out_grad and
                dtypes.as_dtype(op.outputs[i].dtype).base_dtype in
                (dtypes.float32, dtypes.float64)):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              out_grads[i] = array_ops.zeros_like(op.outputs[i])
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              op_wrapper = op
              if has_control_flow:
                op_wrapper = control_flow_ops.MakeWrapper(op)
              in_grads = _AsList(grad_fn(op_wrapper, *out_grads))
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(in_grads) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
          logging.vlog(1, "Gradient for '" + op.name + "'")
          logging.vlog(1, "  in  --> %s",
                       ", ".join([x.name for x in out_grads if x]))
          logging.vlog(1, "  out --> %s",
                       ", ".join([x.name for x in in_grads if x]))
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad:
            _SetGrad(grads, t_in, in_grad)
        if has_control_flow:
          control_flow_ops.ExitGradWhileContext(op)

      # Update pending count for the inputs of op.
      for x in op.inputs:
        pending_count[x.op._id] -= 1
        ready = (pending_count[x.op._id] == 0)
        if has_control_flow and not ready:
          ready = (pending_count[x.op._id] > 0 and
                   control_flow_ops.IsLoopSwitch(x.op))
        if ready:
          queue.append(x.op)
      for x in op.control_inputs:
        pending_count[x._id] -= 1
        if pending_count[x._id] == 0:
          queue.append(x)
  return [_GetGrad(grads, x) for x in xs]
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys` is a list
  of `Tensor`, holding the gradients received by the `ys`. The list must be
  the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial derivatives of
  `ys` with respect to `xs`.  It returns a list of `Tensor` of length
  `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds the
  initial gradients for each y in `ys`.  When `grad_ys` is None, we fill in a
  tensor of '1's of the shape of y for each y in `ys`.  A user can provide
  their own initial `grad_ys` to compute the derivatives using a different
  initial gradient for each y (e.g., if one wanted to weight the gradient
  differently for each value in each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys`
      and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with the
      corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned for an
      operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not have a
      registered gradient function.
    ValueError: if the arguments are invalid.
  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)
  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops,
                                              from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)

    if loop_state:
      # The "unused" exits of the loops are added to ys. As an example,
      # people often write:
      #   v1, _ = While(p, b, [x1, x2])
      #   result = gradients(v1, x1)
      # The exit node of x2 is not included by the betweenness analysis.
      # But we need it if x2 is involved in computing v1. So we add it
      # back in backprop with a zeros_like gradient.
      loop_exits = loop_state.GetAllLoopExits()
      for y in loop_exits:
        if pending_count[y.op._id] == 0 and y.op._id not in to_ops_set:
          if _IsFloat(y):
            # Floating-point outputs get a zero gradient.
            _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # Generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        # pylint: disable=protected-access
        is_func_call = ops.get_default_graph()._is_function(op.type)
        if not is_func_call and any(
            isinstance(g, ops.Tensor) or g for g in out_grads) and (
                op._id not in stop_ops):
          # pylint: enable=protected-access
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and any(
            isinstance(g, ops.Tensor) or g for g in out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and
                not out_grad) and _IsFloat(op.outputs[i]):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if is_func_call:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                f_in = [x for x in op.inputs] + out_grads
                f_types = [x.dtype for x in op.inputs]
                # pylint: disable=protected-access
                in_grads = _AsList(functional_ops._symbolic_gradient(
                    f_in, f_types, op.type))
                # pylint: enable=protected-access
              else:
                in_grads = _AsList(grad_fn(op, *out_grads))
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(
                  [x for x in in_grads if x is not None]) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
          logging.vlog(1, "Gradient for '" + op.name + "'")

          def _FilterGrad(x):
            if x is None:
              return False
            if isinstance(x, (list, tuple)):
              return bool(x)
            else:
              return True

          logging.vlog(1, "  in  --> %s",
                       ", ".join([x.name for x in out_grads if _FilterGrad(x)]))
          logging.vlog(1, "  out --> %s",
                       ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad is not None:
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op.
      # pylint: disable=protected-access
      for x in op.inputs:
        pending_count[x.op._id] -= 1
        ready = (pending_count[x.op._id] == 0)
        if loop_state and not ready:
          ready = (pending_count[x.op._id] > 0 and
                   control_flow_ops.IsLoopSwitch(x.op))
        if ready:
          queue.append(x.op)
      for x in op.control_inputs:
        pending_count[x._id] -= 1
        if pending_count[x._id] == 0:
          queue.append(x)
      # pylint: enable=protected-access
  return [_GetGrad(grads, x) for x in xs]
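# A usage sketch of the gradients() API documented above, written against the
# graph-mode TensorFlow 0.x/1.x-era interface this code comes from; the import
# path and Session usage below are assumptions about that era, not part of the
# original source.
#
#   import tensorflow as tf
#
#   x = tf.placeholder(tf.float32, shape=[])
#   y = x * x
#   dy_dx, = tf.gradients(y, [x])        # Returns [sum(dy/dx)] for each x.
#
#   with tf.Session() as sess:
#     print(sess.run(dy_dx, feed_dict={x: 3.0}))   # Expect 6.0.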