def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False): """""" # Error checking if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH]: raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, " + "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if var_list is None: var_list = variables.trainable_variables() for x_tm1 in var_list: if not isinstance(x_tm1, variables.Variable): raise TypeError("Argument is not a tf.Variable: %s" % x_tm1) if not var_list: raise ValueError("No variables to optimize") # The actual stuff var_refs = [x_tm1.ref() for x_tm1 in var_list] grads = gradients.gradients(loss, var_refs, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes([x_tm1 for g_t, x_tm1 in grads_and_vars if g_t is not None]) return grads_and_vars
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): """Compute gradients of `loss` for the variables in `var_list`. This is the first part of `minimize()`. It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable". Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if there is no gradient for the given variable. Args: loss: A Tensor containing the value to minimize. var_list: Optional list of `tf.Variable` to update to minimize `loss`. Defaults to the list of variables collected in the graph under the key `GraphKey.TRAINABLE_VARIABLES`. gate_gradients: How to gate the computation of gradients. Can be `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. Returns: A list of (gradient, variable) pairs. Variable is always present, but gradient can be `None`. Raises: TypeError: If `var_list` contains anything else than `Variable` objects. ValueError: If some arguments are invalid. """ if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH]: raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if grad_loss is not None: self._assert_valid_dtypes([grad_loss]) if var_list is None: var_list = ( variables.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) processors = [_get_processor(v) for v in var_list] if not var_list: raise ValueError("No variables to optimize.") var_refs = [p.target() for p in processors] grads = gradients.gradients( loss, var_refs, grad_ys=grad_loss, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes([v for g, v in grads_and_vars if g is not None]) return grads_and_vars
def in_grad_function(): with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with ops.get_default_graph()._original_op(op): # pylint: enable=protected-access if grad_fn: # If grad_fn was found, do not use SymbolicGradient even for # functions. in_grads = _AsList(grad_fn(op, *out_grads)) else: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. f_in = [x for x in op.inputs] + out_grads f_types = [x.dtype for x in op.inputs] # pylint: disable=protected-access in_grads = _AsList( functional_ops._symbolic_gradient( f_in, f_types, op.type)) # pylint: enable=protected-access _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len( [x for x in in_grads if x is not None]) > 1: in_grads = control_flow_ops.tuple(in_grads) _LogOpGradients(op, out_grads, in_grads) in_grads = [ x if x is not None else tf.zeros( tf.shape(op.inputs[i]), dtype=op.inputs[i].dtype) for i, x in enumerate(in_grads) ] return in_grads
def testTuplesOfTuplesAreStreamed(self): with ops.device("/device:IPU:0"): with variable_scope.variable_scope("vs", use_resource=True): pa = array_ops.placeholder(np.int64, [2, 2], name="a") pb = array_ops.placeholder(np.int64, [2, 2], name="b") pc = array_ops.placeholder(np.int64, [2, 2], name="c") c = control_flow_ops.tuple((pa + pc, pb + pc)) with ops.device('cpu'): report = gen_ipu_ops.ipu_event_trace() tu.configure_ipu_system(True, True, True) with tu.ipu_session() as sess: sess.run(report) in0 = np.full((2, 2), 7) in1 = np.full((2, 2), 6) in2 = np.full((2, 2), 5) fd = { pa: in0, pb: in1, pc: in2, } out = sess.run(c, fd) self.assertEqual(len(out), 2) self.assertAllClose(out, (np.full((2, 2), 12), np.full( (2, 2), 11))) rep = sess.run(report) io_evts = tu.extract_all_io_events(rep) # No io_events implies the data was streamed self.assertEqual(len(list(io_evts)), 0)
def testIndexedSlices(self): for v1_first in [True, False]: with self.test_session(): v1 = tf.Variable(np.array([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]]).astype(np.float32)) v1_at_1 = tf.IndexedSlices( control_flow_ops.with_dependencies([v1.initializer], v1.ref()), tf.constant([1]) ) v2 = tf.Variable(np.array([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]]).astype(np.float32)) v2_at_1 = tf.IndexedSlices( control_flow_ops.with_dependencies([v2.initializer], v2.ref()), tf.constant([1]) ) st1, st2 = control_flow_ops.tuple([v1_at_1, v2_at_1]) g1 = tf.gather(st1.values, st1.indices) g2 = tf.gather(st2.values, st2.indices) # v1 is not initialized. with self.assertRaisesOpError("Attempting to use uninitialized value"): v1.eval() # v2 is not initialized. with self.assertRaisesOpError("Attempting to use uninitialized value"): v2.eval() if v1_first: # Getting g1 initializes v2. self.assertAllClose([[10.0, 11.0]], g1.eval()) self.assertAllClose([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]], v2.eval()) else: # Getting g2 initializes v1. self.assertAllClose([[10.1, 11.1]], g2.eval()) self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]], v1.eval())
def testTensors(self): for v1_first in [True, False]: with self.test_session(): v1 = tf.Variable([1.0]) add1 = tf.add( control_flow_ops.with_dependencies([v1.initializer], v1.ref()), 2.0) v2 = tf.Variable([10.0]) add2 = tf.add( control_flow_ops.with_dependencies([v2.initializer], v2.ref()), 20.0) t1, _, t2 = control_flow_ops.tuple([add1, None, add2]) # v1 is not initialized. with self.assertRaisesOpError("Attempting to use uninitialized value"): v1.eval() # v2 is not initialized. with self.assertRaisesOpError("Attempting to use uninitialized value"): v2.eval() if v1_first: # Getting t1 initializes v2. self.assertAllClose([3.0], t1.eval()) self.assertAllClose([10.0], v2.eval()) else: # Getting t2 initializes v1. self.assertAllClose([30.0], t2.eval()) self.assertAllClose([1.0], v1.eval())
def testTuplesOfTuplesAreStreamed(self): with self.session() as sess: with ops.device("/device:IPU:0"): with variable_scope.variable_scope("vs", use_resource=True): pa = array_ops.placeholder(np.int64, [2, 2], name="a") pb = array_ops.placeholder(np.int64, [2, 2], name="b") pc = array_ops.placeholder(np.int64, [2, 2], name="c") c = control_flow_ops.tuple((pa + pc, pb + pc)) report = tu.ReportJSON(self, sess) report.reset() in0 = np.full((2, 2), 7) in1 = np.full((2, 2), 6) in2 = np.full((2, 2), 5) fd = { pa: in0, pb: in1, pc: in2, } out = sess.run(c, fd) self.assertEqual(len(out), 2) self.assertAllClose(out, (np.full((2, 2), 12), np.full((2, 2), 11))) report.parse_log() report.assert_host_to_device_event_names( [], "No io events implies the data was streamed") report.assert_device_to_host_event_names( [], "No io events implies the data was streamed")
def grad_fn(inputs, variables, outputs, output_grads): """Recompute outputs for gradient computation.""" del outputs # Recompute outputs with framework_ops.control_dependencies(output_grads): if use_data_dep_: inputs = _force_data_dependency(output_grads, inputs) with contrib_framework_ops.arg_scope(cached_arg_scope[0]): with variable_scope.variable_scope(cached_vs[0], reuse=True): outputs = fn(*inputs) if not (isinstance(outputs, list) or isinstance(outputs, tuple)): outputs = [outputs] outputs = list(outputs) grads = gradients_impl.gradients(outputs, inputs + variables, output_grads) if tupleize_grads: if use_data_dep_: grads = _tuple_with_data_dep(grads) else: grads = control_flow_ops.tuple(grads) grad_inputs = grads[:len(inputs)] grad_vars = grads[len(inputs):] return grad_inputs, grad_vars
def _update(self, var, fn, *args, **kwargs): # TODO(jhseu): Consider supporting grouped==False. assert isinstance(var, values.TPUMirroredVariable) if values._enclosing_tpu_context() is not None: # pylint: disable=protected-access return fn(var, *args, **kwargs) # Otherwise, we revert to MirroredStrategy behavior and update each variable # directly. updates = {} for d, v in var._index.items(): # pylint: disable=protected-access name = "update_%d" % self._device_index.get(d) with ops.device(d), distribute_lib.UpdateContext( d), ops.name_scope(name): # If args and kwargs are not mirrored, the value is returned as is. updates[d] = fn(v, *values.select_device_mirrored(d, args), **values.select_device_mirrored(d, kwargs)) # Make a single control dependency to keep the variables mirrored. If one # assignment is fetched, then run all assignments. sorted_keys = sorted(updates.keys()) update_tuple = control_flow_ops.tuple( [updates[d] for d in sorted_keys]) for i, d in enumerate(sorted_keys): updates[d] = update_tuple[i] return values.regroup(updates, values.Mirrored)
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None): """Compute gradients of `loss` for the variables in `var_list`. This is the first part of `minimize()`. It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable". Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if there is no gradient for the given variable. Args: loss: A Tensor containing the value to minimize. var_list: Optional list of variables.Variable to update to minimize `loss`. Defaults to the list of variables collected in the graph under the key `GraphKey.TRAINABLE_VARIABLES`. gate_gradients: How to gate the computation of gradients. Can be `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. Returns: A list of (gradient, variable) pairs. Raises: TypeError: If `var_list` contains anything else than `Variable` objects. ValueError: If some arguments are invalid. """ if gate_gradients not in [ Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH ]: raise ValueError( "gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if var_list is None: var_list = variables.trainable_variables() for var in var_list: if not isinstance(var, variables.Variable): raise TypeError("Argument is not a variables.Variable: %s" % var) if not var_list: raise ValueError("No variables to optimize") grads = gradients.gradients( loss, var_list, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes( [v for g, v in grads_and_vars if g is not None]) return grads_and_vars
def compute_gradient_Hs(self, y_pred, var_list=None, gate_gradients=Optimizer.GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): """Computes the EKF linearize measurment matrix H for the variables in `var_list`. This is the first part of `minimize()`. It returns a list of (H, variable) pairs where "H" is the derivative of the measurment function h with respect to "variable" for "variable". Args: y_pred: The prediction of the network NOTE THAT THIS SHOULD BE A TENSOR AND NOT A SCALAR LIKE IN OTHER OPTIMIZERS. var_list: Optional list of `tf.Variable` to update to minimize `loss`. Defaults to the list of variables collected in the graph under the key `GraphKey.TRAINABLE_VARIABLES`. gate_gradients: How to gate the computation of gradients. Can be `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. Returns: A list of (H, variable) pairs. Variable is always present, but H can be `None`. Raises: TypeError: If `var_list` contains anything else than `Variable` objects. ValueError: If some arguments are invalid. """ if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH]: raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([y_pred]) if grad_loss is not None: self._assert_valid_dtypes([grad_loss]) if var_list is None: var_list = ( variables.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) if not var_list: raise ValueError("No variables to optimize.") Hs = self.calc_H(gen_array_ops.reshape(y_pred, [-1]), var_list, self.y_dim, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) if gate_gradients == Optimizer.GATE_GRAPH: Hs = control_flow_ops.tuple(Hs) grads_and_vars = list(zip(Hs, var_list)) self._assert_valid_dtypes([v for g, v in grads_and_vars if g is not None]) return grads_and_vars
def _rev_layer_forward(xs, f, g, f_side_input, g_side_input, gate_outputs=False): """Forward for 1 reversible layer.""" x1, x2 = xs y1 = x1 + (f(x2, f_side_input) if f_side_input else f(x2)) y2 = x2 + (g(y1, g_side_input) if g_side_input else g(y1)) if gate_outputs: return control_flow_ops.tuple([y1, y2]) else: return (y1, y2)
def body_wrapper(*inputs): """Wrapper around `body` that handles infeed queues and control deps.""" inputs = list(inputs) # Discards the dummy output added for arity-0 loops. if input_arity == 0: inputs = [] # Runs `body` with the dequeue_ops appended. if infeed_queue: dequeue_ops = infeed_queue._dequeue() else: dequeue_ops = [] body_args, body_kwargs = _body_arguments(dequeue_ops) outputs = body(*(inputs + body_args), **body_kwargs) # If the computation only returned one value, make it a tuple. if not isinstance(outputs, (list, tuple)): outputs = (outputs, ) outputs = [ o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o) for o in outputs ] # Separates the returned Operations and Tensors. output_operations = [ o for o in outputs if isinstance(o, ops.Operation) ] output_tensors = [ o for o in outputs if not isinstance(o, ops.Operation) ] if outputs != output_tensors + output_operations: raise ValueError( "IPU loop body must return zero or more Tensor values " "followed by zero or more Operations.") output_types = [op.dtype for op in output_tensors] if input_types != output_types: raise TypeError( "Mismatch between input types and output types for loop " "body: {} vs {}.".format(input_types, output_types)) # Add a dummy output, if needed. if not output_tensors: output_tensors = [array_ops.constant(0)] if output_operations: output_tensors = control_flow_ops.tuple( output_tensors, control_inputs=output_operations) return output_tensors[0] if len( output_tensors) == 1 else output_tensors
def _recomputing_grad_fn(compute_fn, original_args, original_vars, output_grads, grad_fn_variables, use_data_dep, tupleize_grads, arg_scope, var_scope, has_is_recompute_kwarg): """Grad fn for recompute_grad.""" variables = grad_fn_variables or [] # Identity ops around the inputs ensures correct gradient graph-walking. inputs = [array_ops.identity(x) for x in list(original_args)] # Recompute outputs # Use a control dependency to ensure that the recompute is not eliminated by # CSE and that it happens on the backwards pass. ctrl_dep_grads = [g for g in output_grads if g is not None] with framework_ops.control_dependencies(ctrl_dep_grads): if use_data_dep: inputs = _force_data_dependency(output_grads, inputs) # Re-enter scopes with arg_scope_lib.arg_scope(arg_scope): with variable_scope.variable_scope(var_scope, reuse=True): # Re-call the function and ensure that the touched variables are the # same as in the first call. with backprop.GradientTape() as tape: fn_kwargs = {} if has_is_recompute_kwarg: fn_kwargs["is_recomputing"] = True outputs = compute_fn(*inputs, **fn_kwargs) recompute_vars = set( _as_ref(v) for v in tape.watched_variables()) if original_vars != recompute_vars: raise ValueError(_WRONG_VARS_ERR) if not isinstance(outputs, (list, tuple)): outputs = [outputs] outputs = list(outputs) # Compute gradients grads = _gradients( outputs, inputs + variables, output_grads, stop_gradients=inputs) if tupleize_grads: if use_data_dep: grads = _tuple_with_data_dep(grads) else: grads = control_flow_ops.tuple(grads) grad_inputs = grads[:len(inputs)] grad_vars = grads[len(inputs):] return grad_inputs, grad_vars
def _recomputing_grad_fn(compute_fn, original_args, original_vars, output_grads, grad_fn_variables, use_data_dep, tupleize_grads, arg_scope, var_scope, has_is_recompute_kwarg): """Grad fn for recompute_grad.""" variables = grad_fn_variables or [] # Identity ops around the inputs ensures correct gradient graph-walking. inputs = [array_ops.identity(x) for x in list(original_args)] # Recompute outputs # Use a control dependency to ensure that the recompute is not eliminated by # CSE and that it happens on the backwards pass. ctrl_dep_grads = [g for g in output_grads if g is not None] with framework_ops.control_dependencies(ctrl_dep_grads): if use_data_dep: inputs = _force_data_dependency(output_grads, inputs) # Re-enter scopes with contrib_framework_ops.arg_scope(arg_scope): with variable_scope.variable_scope(var_scope, reuse=True): # Re-call the function and ensure that the touched variables are the # same as in the first call. with backprop.GradientTape() as tape: fn_kwargs = {} if has_is_recompute_kwarg: fn_kwargs["is_recomputing"] = True outputs = compute_fn(*inputs, **fn_kwargs) recompute_vars = set(tape.watched_variables()) if original_vars != recompute_vars: raise ValueError(_WRONG_VARS_ERR) if not isinstance(outputs, (list, tuple)): outputs = [outputs] outputs = list(outputs) # Compute gradients grads = gradients_impl.gradients(outputs, inputs + variables, output_grads) if tupleize_grads: if use_data_dep: grads = _tuple_with_data_dep(grads) else: grads = control_flow_ops.tuple(grads) grad_inputs = grads[:len(inputs)] grad_vars = grads[len(inputs):] return grad_inputs, grad_vars
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None): """Compute gradients of `loss` for the variables in `var_list`. This is the first part of `minimize()`. It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable". Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if there is no gradient for the given variable. Args: loss: A Tensor containing the value to minimize. var_list: Optional list of tf.Variable to update to minimize `loss`. Defaults to the list of variables collected in the graph under the key `GraphKey.TRAINABLE_VARIABLES`. gate_gradients: How to gate the computation of gradients. Can be `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. Returns: A list of (gradient, variable) pairs. Raises: TypeError: If `var_list` contains anything else than `Variable` objects. ValueError: If some arguments are invalid. """ if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH]: raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if var_list is None: var_list = variables.trainable_variables() for var in var_list: if not isinstance(var, variables.Variable): raise TypeError("Argument is not a tf.Variable: %s" % var) if not var_list: raise ValueError("No variables to optimize") grads = gradients.gradients( loss, var_list, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes([v for g, v in grads_and_vars if g is not None]) return grads_and_vars
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP): """Compute gradients of "loss" for the variables in "var_list". This is the first part of minimize(). It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable". Note that "gradient" can be a Tensor, a IndexedSlices, or None if there is no gradient for the given variable. Args: loss: A Tensor containing the value to minimize. var_list: Optional list of variables.Variable to update to minimize "loss". Defaults to the list of variables collected in the graph under the key GraphKey.TRAINABLE_VARIABLES. gate_gradients: How to gate the computation of gradients. Can be GATE_NONE, GATE_OP, or GATE_GRAPH. Returns: A list of (gradient, variable) pairs. Raises: TypeError: If var_list contains anything else than variables.Variable. ValueError: If some arguments are invalid. """ if gate_gradients not in [ Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH ]: raise ValueError( "gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if var_list is None: var_list = variables.trainable_variables() for var in var_list: if not isinstance(var, variables.Variable): raise TypeError("Argument is not a variables.Variable: %s" % var) grads = gradients.gradients( loss, var_list, gate_gradients=(gate_gradients == Optimizer.GATE_OP)) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes( [v for g, v in grads_and_vars if g is not None]) return grads_and_vars
def _rev_layer_backward(ys, grad_ys, f, g, f_vars, f_side_input, g_vars, g_side_input): """Backprop for 1 layer.""" y1, y2 = ys grad_y1, grad_y2 = grad_ys # Reconstruct intermediates and inputs (x1, x2) # stop_gradients required on fn inputs to prevent infinite recursion into this # grad function on the calls to gradients. y1_stop = array_ops.stop_gradient(y1) g_side_input = [array_ops.stop_gradient(t) for t in g_side_input] gy1 = g(y1_stop, g_side_input) if g_side_input else g(y1_stop) x2 = y2 - gy1 x2_stop = array_ops.stop_gradient(x2) f_side_input = [array_ops.stop_gradient(t) for t in f_side_input] fx2 = f(x2_stop, f_side_input) if f_side_input else f(x2_stop) x1 = y1 - fx2 # Compute gradients wrt to inputs # dL/dy2 * dG(y1)/y1 grad_gy1_y2 = gradients_impl.gradients(gy1, y1_stop, grad_y2)[0] grad_x1 = grad_y1 + grad_gy1_y2 grad_x2 = ( gradients_impl.gradients(fx2, x2_stop, grad_y1)[0] + grad_y2 + gradients_impl.gradients(fx2, x2_stop, grad_gy1_y2)[0]) # Compute gradients wrt to vars and side inputs in f and g grads1 = gradients_impl.gradients(gy1, g_vars + g_side_input, grad_y2) grad_g_vars, grad_g_side = grads1[:len(g_vars)], grads1[len(g_vars):] grads2 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_y1) grad_f_y1, grad_f_side1 = grads2[:len(f_vars)], grads2[len(f_vars):] grads3 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_gy1_y2) grad_f_y2, grad_f_side2 = grads3[:len(f_vars)], grads3[len(f_vars):] grad_f_vars = _acc_grads(grad_f_y1, grad_f_y2) grad_f_side = _acc_grads(grad_f_side1, grad_f_side2) # Put returns in a tuple to ensure a constant memory budget (i.e. don't want # the subsequent layer to start computing and consuming memory based on a # subset of these values). outputs = ((x1, x2), (grad_x1, grad_x2), (grad_f_vars, grad_f_side), (grad_g_vars, grad_g_side)) tupled = control_flow_ops.tuple(nest.flatten(outputs)) return nest.pack_sequence_as(outputs, tupled)
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP): """Compute gradients of "loss" for the variables in "var_list". This is the first part of minimize(). It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable". Note that "gradient" can be a Tensor, a IndexedSlices, or None if there is no gradient for the given variable. Args: loss: A Tensor containing the value to minimize. var_list: Optional list of variables.Variable to update to minimize "loss". Defaults to the list of variables collected in the graph under the key GraphKey.TRAINABLE_VARIABLES. gate_gradients: How to gate the computation of gradients. Can be GATE_NONE, GATE_OP, or GATE_GRAPH. Returns: A list of (gradient, variable) pairs. Raises: TypeError: If var_list contains anything else than variables.Variable. ValueError: If some arguments are invalid. """ if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH]: raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if var_list is None: var_list = variables.trainable_variables() for var in var_list: if not isinstance(var, variables.Variable): raise TypeError("Argument is not a variables.Variable: %s" % var) grads = gradients.gradients( loss, var_list, gate_gradients=(gate_gradients == Optimizer.GATE_OP)) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes([v for g, v in grads_and_vars if g is not None]) return grads_and_vars
def testIndexedSlices(self): for v1_first in [True, False]: with self.test_session(): v1 = tf.Variable( np.array([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]]).astype(np.float32)) v1_at_1 = tf.IndexedSlices( control_flow_ops.with_dependencies([v1.initializer], v1), tf.constant([1])) v2 = tf.Variable( np.array([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]]).astype(np.float32)) v2_at_1 = tf.IndexedSlices( control_flow_ops.with_dependencies([v2.initializer], v2), tf.constant([1])) st1, st2 = control_flow_ops.tuple([v1_at_1, v2_at_1]) g1 = tf.gather(st1.values, st1.indices) g2 = tf.gather(st2.values, st2.indices) # v1 is not initialized. with self.assertRaisesOpError( "Attempting to use uninitialized value"): v1.eval() # v2 is not initialized. with self.assertRaisesOpError( "Attempting to use uninitialized value"): v2.eval() if v1_first: # Getting g1 initializes v2. self.assertAllClose([[10.0, 11.0]], g1.eval()) self.assertAllClose( [[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]], v2.eval()) else: # Getting g2 initializes v1. self.assertAllClose([[10.1, 11.1]], g2.eval()) self.assertAllClose( [[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]], v1.eval())
def _grad_fn(output_grads, variables=None): """Recompute outputs for gradient computation.""" variables = variables or [] if original_vars: assert variables, ( "Fn created variables but the variables were not " "passed to the gradient fn.") if set(variables) != original_vars: raise ValueError(_WRONG_VARS_ERR) inputs = [array_ops.identity(x) for x in list(args)] # Recompute outputs with framework_ops.control_dependencies(output_grads): if use_data_dep_: inputs = _force_data_dependency(output_grads, inputs) with contrib_framework_ops.arg_scope(arg_scope): with variable_scope.variable_scope(vs, reuse=True): with backprop.GradientTape() as tape: fn_kwargs = {} if has_is_recompute_kwarg: fn_kwargs["is_recomputing"] = True outputs = fn(*inputs, **fn_kwargs) recompute_vars = set(tape.watched_variables()) if original_vars != recompute_vars: raise ValueError(_WRONG_VARS_ERR) if not isinstance(outputs, (list, tuple)): outputs = [outputs] outputs = list(outputs) grads = gradients_impl.gradients(outputs, inputs + variables, output_grads) if tupleize_grads: if use_data_dep_: grads = _tuple_with_data_dep(grads) else: grads = control_flow_ops.tuple(grads) grad_inputs = grads[:len(inputs)] grad_vars = grads[len(inputs):] return grad_inputs, grad_vars
def _grad_fn(output_grads, variables=None): """Recompute outputs for gradient computation.""" variables = variables or [] if original_vars: assert variables, ("Fn created variables but the variables were not " "passed to the gradient fn.") if set(variables) != original_vars: raise ValueError(_WRONG_VARS_ERR) inputs = [array_ops.identity(x) for x in list(args)] # Recompute outputs with framework_ops.control_dependencies(output_grads): if use_data_dep_: inputs = _force_data_dependency(output_grads, inputs) with contrib_framework_ops.arg_scope(arg_scope): with variable_scope.variable_scope(vs, reuse=True): with backprop.GradientTape() as tape: fn_kwargs = {} if has_is_recompute_kwarg: fn_kwargs["is_recomputing"] = True outputs = fn(*inputs, **fn_kwargs) recompute_vars = set(tape.watched_variables()) if original_vars != recompute_vars: raise ValueError(_WRONG_VARS_ERR) if not isinstance(outputs, (list, tuple)): outputs = [outputs] outputs = list(outputs) grads = gradients_impl.gradients(outputs, inputs + variables, output_grads) if tupleize_grads: if use_data_dep_: grads = _tuple_with_data_dep(grads) else: grads = control_flow_ops.tuple(grads) grad_inputs = grads[:len(inputs)] grad_vars = grads[len(inputs):] return grad_inputs, grad_vars
def grad_fn(*output_grads, **kwargs): """Recompute outputs for gradient computation.""" variables = [] if original_vars: variables = kwargs["variables"] if set(variables) != original_vars: raise ValueError(_WRONG_VARS_ERR) del kwargs inputs = list(args) # Recompute outputs with framework_ops.control_dependencies(output_grads): if use_data_dep_: inputs = _force_data_dependency(output_grads, inputs) with contrib_framework_ops.arg_scope(arg_scope): with variable_scope.variable_scope(vs, reuse=True): with backprop.GradientTape() as tape: fn_kwargs = {} if has_is_recompute_kwarg: fn_kwargs["is_recomputing"] = True outputs = fn(*inputs, **fn_kwargs) recompute_vars = set(tape.watched_variables()) if original_vars != recompute_vars: raise ValueError(_WRONG_VARS_ERR) if not (isinstance(outputs, list) or isinstance(outputs, tuple)): outputs = [outputs] outputs = list(outputs) grads = gradients_impl.gradients(outputs, inputs + variables, output_grads) if tupleize_grads: if use_data_dep_: grads = _tuple_with_data_dep(grads) else: grads = control_flow_ops.tuple(grads) grad_inputs = grads[:len(inputs)] grad_vars = grads[len(inputs):] return grad_inputs, grad_vars
def compute_gradients_with_injected_short_circuiting(loss, var_list=None, gate_gradients=optimizer.Optimizer.GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, should_stop_queue=None, global_step=None, grad_loss=None): assert should_stop_queue is not None assert global_step is not None if gate_gradients not in [optimizer.Optimizer.GATE_NONE, optimizer.Optimizer.GATE_OP, optimizer.Optimizer.GATE_GRAPH]: raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) assert_valid_dtypes([loss]) if grad_loss is not None: assert_valid_dtypes([grad_loss]) if var_list is None: var_list = variables.trainable_variables() for var in var_list: if not isinstance(var, variables.Variable): raise TypeError("Argument is not a tf.Variable: %s" % var) if not var_list: raise ValueError("No variables to optimize") var_refs = [v._ref() for v in var_list] grads = gradients.gradients_short_circuited( loss, var_refs, grad_ys=grad_loss, gate_gradients=(gate_gradients == optimizer.Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops, should_stop_queue=should_stop_queue, global_step=global_step) if gate_gradients == optimizer.Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) assert_valid_dtypes([v for g, v in grads_and_vars if g is not None]) return grads_and_vars
def _update(self, var, fn, *args, **kwargs): # TODO(jhseu): Consider supporting grouped==False. assert isinstance(var, values.TPUMirroredVariable) if values._enclosing_tpu_context() is not None: # pylint: disable=protected-access return fn(var, *args, **kwargs) # Otherwise, we revert to MirroredStrategy behavior and update each variable # directly. updates = {} for d, v in var._index.items(): # pylint: disable=protected-access name = "update_%d" % self._device_index.get(d) with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name): # If args and kwargs are not mirrored, the value is returned as is. updates[d] = fn(v, *values.select_device_mirrored(d, args), **values.select_device_mirrored(d, kwargs)) # Make a single control dependency to keep the variables mirrored. If one # assignment is fetched, then run all assignments. sorted_keys = sorted(updates.keys()) update_tuple = control_flow_ops.tuple([updates[d] for d in sorted_keys]) for i, d in enumerate(sorted_keys): updates[d] = update_tuple[i] return values.regroup(updates, values.Mirrored)
def gradients(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None): """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. `gradients()` adds ops to the graph to output the partial derivatives of `ys` with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, we fill in a tensor of '1's of the shape of y for each y in `ys`. A user can provide their own initial `grad_ys` to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y). Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys` and holding the gradients computed for each y in `ys`. name: Optional name to use for grouping all the gradient ops together. defaults to 'gradients'. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. gate_gradients: If True, add a tuple around the gradients returned for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. Returns: A list of `sum(dy/dx)` for each x in `xs`. Raises: LookupError: if one of the operations between `x` and `y` does not have a registered gradient function. ValueError: if the arguments are invalid. """ ys = _AsList(ys) xs = _AsList(xs) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.op_scope(ys + xs + grad_ys, name, "gradients"): ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x") grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops, from_ops) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. # pylint: disable=protected-access ready = (pending_count[op._id] == 0) if ready and op._id not in to_ops_set: to_ops_set.add(op._id) queue.append(op) if loop_state: # The "unused" exits of the loops are added to ys. As an example, # people often write: # v1, _ = While(p, b, [x1, x2]) # result = gradients(v1, x1) # The exit node of x2 is not included by the betweenness analysis. # But we need it if x2 is involved in computing v1. So we add it # back in backprop with a zeros_like gradient. loop_exits = loop_state.GetAllLoopExits() for y in loop_exits: if pending_count[y.op._id] == 0 and y.op._id not in to_ops_set: if _IsFloat(y): # Floating-point outputs get a zero gradient. _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) # The set of 'from_ops'. stop_ops = _StopOps(from_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)): if loop_state: loop_state.EnterGradWhileContext(op) out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method) grad_fn = None # pylint: disable=protected-access is_func_call = ops.get_default_graph()._is_function(op.type) # pylint: enable=protected-access if not is_func_call and any( out_grads) and op._id not in stop_ops: # pylint: enable=protected-access # A grad_fn must be defined, either as a function or as None # for ops that do not have gradients. try: grad_fn = ops.get_gradient_function(op) except LookupError: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if (grad_fn or is_func_call) and any(out_grads): # NOTE: If _AggregatedGrads didn't compute a value for the i'th # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if not out_grad and _IsFloat(op.outputs[i]): # Only floating-point outputs get a zero gradient. Gradient # functions should ignore the gradient for other outputs. if loop_state: out_grads[i] = loop_state.ZerosLike(op, i) else: out_grads[i] = array_ops.zeros_like( op.outputs[i]) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with ops.get_default_graph()._original_op(op): # pylint: enable=protected-access wrapped_op = op if loop_state: wrapped_op = loop_state.MakeWrapper(op) if is_func_call: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. f_in = [x for x in op.inputs] + out_grads f_types = [x.dtype for x in op.inputs] # pylint: disable=protected-access in_grads = _AsList( functional_ops._symbolic_gradient( f_in, f_types, op.type)) # pylint: enable=protected-access else: in_grads = _AsList( grad_fn(wrapped_op, *out_grads)) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len( tuple(filter(None, in_grads))) > 1: in_grads = control_flow_ops.tuple(in_grads) logging.vlog(1, "Gradient for '" + op.name + "'") logging.vlog(1, " in --> %s", ", ".join([x.name for x in out_grads if x])) logging.vlog(1, " out --> %s", ", ".join([x.name for x in in_grads if x])) else: # If no grad_fn is defined or none of out_grads is available, # just propagates a list of None backwards. in_grads = [None] * len(op.inputs) for t_in, in_grad in zip(op.inputs, in_grads): if in_grad: _SetGrad(grads, t_in, in_grad) if loop_state: loop_state.ExitGradWhileContext(op) # update pending count for the inputs of op. # pylint: disable=protected-access for x in op.inputs: pending_count[x.op._id] -= 1 ready = (pending_count[x.op._id] == 0) if loop_state and not ready: ready = (pending_count[x.op._id] > 0 and control_flow_ops.IsLoopSwitch(x.op)) if ready: queue.append(x.op) for x in op.control_inputs: pending_count[x._id] -= 1 if pending_count[x._id] is 0: queue.append(x) # pylint: enable=protected-access return [_GetGrad(grads, x) for x in xs]
def body_wrapper(*inputs): """Wrapper around `body` that handles infeed queues and control deps.""" inputs = list(inputs) # Discards the dummy output added for arity-0 loops. if input_arity == 0: inputs = [] # Runs `body` with the dequeue_ops appended. if infeed_queue: number_of_shards = tpu_function.get_tpu_context().number_of_shards if number_of_shards is None: raise ValueError("Can't build training loop with infeed when there is " "no tpu_shard_context. Are you building a loop or " "graph directly rather than from inside tpu.rewrite, " "tpu.batch_parallel, tpu.shard, or tpu.replicate?") infeed_queue.set_number_of_shards(number_of_shards) dequeue_ops = [d for d in infeed_queue.generate_dequeue_op()] else: dequeue_ops = [] outputs = body(*(inputs + dequeue_ops)) # If the computation only returned one value, make it a tuple. if not isinstance(outputs, (list, tuple)): outputs = (outputs,) outputs = [ o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o) for o in outputs ] # Separates the returned Operations and Tensors. output_operations = [o for o in outputs if isinstance(o, ops.Operation)] output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)] if outputs != output_tensors + output_operations: raise ValueError( "TPU training loop body must return zero or more Tensor values " "followed by zero or more Operations.") output_types = [op.dtype for op in output_tensors] if input_types != output_types: raise TypeError( "Mismatch between input types and output types for training loop " "body: {} vs {}".format(input_types, output_types)) # Add the dequeue operations to output_operations to ensure they are run # by the loop, even if the programmer's loop body does not use them. output_operations += dequeue_ops # Add a dummy output, if needed. if not output_tensors: output_tensors = array_ops.constant(0) if output_operations: # TODO (phawkins): in principle this is too restrictive since it serializes id:1047 gh:1048 # the training loop steps. In practice it does not matter since this loop # will be compiled by XLA. return control_flow_ops.tuple(output_tensors, control_inputs=output_operations) else: return output_tensors
def gradients(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None): """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. `gradients()` adds ops to the graph to output the partial derivatives of `ys` with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, we fill in a tensor of '1's of the shape of y for each y in `ys`. A user can provide their own initial `grad_ys` to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y). Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys` and holding the gradients computed for each y in `ys`. name: Optional name to use for grouping all the gradient ops together. defaults to 'gradients'. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. gate_gradients: If True, add a tuple around the gradients returned for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. Returns: A list of `sum(dy/dx)` for each x in `xs`. Raises: LookupError: if one of the operations between `x` and `y` does not have a registered gradient function. ValueError: if the arguments are invalid. """ ys = _AsList(ys) xs = _AsList(xs) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.op_scope(ys + xs + grad_ys, name, "gradients"): ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x") grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] pending_count, has_control_flow = _PendingCount(ops.get_default_graph(), to_ops, from_ops) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. ready = (pending_count[op._id] == 0) if ready and op._id not in to_ops_set: # pylint: disable=protected-access to_ops_set.add(op._id) queue.append(op) # The set of 'from_ops'. stop_ops = _StopOps(from_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)): if has_control_flow: control_flow_ops.EnterGradWhileContext(op) out_grads = _AggregatedGrads(grads, op, has_control_flow, aggregation_method) grad_fn = None if any(out_grads) and op._id not in stop_ops: # A grad_fn must be defined, either as a function or as None # for ops that do not have gradients. try: grad_fn = ops.get_gradient_function(op) except LookupError: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if grad_fn and any(out_grads): # NOTE: If _AggregatedGrads didn't compute a value for the i'th # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if (not out_grad and dtypes.as_dtype(op.outputs[i].dtype).base_dtype in (dtypes.float32, dtypes.float64)): # Only floating-point outputs get a zero gradient. Gradient # functions should ignore the gradient for other outputs. out_grads[i] = array_ops.zeros_like(op.outputs[i]) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with ops.get_default_graph()._original_op(op): # pylint: enable=protected-access op_wrapper = op if has_control_flow: op_wrapper = control_flow_ops.MakeWrapper(op) in_grads = _AsList(grad_fn(op_wrapper, *out_grads)) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len(in_grads) > 1: in_grads = control_flow_ops.tuple(in_grads) logging.vlog(1, "Gradient for '" + op.name + "'") logging.vlog(1, " in --> %s", ", ".join([x.name for x in out_grads if x])) logging.vlog(1, " out --> %s", ", ".join([x.name for x in in_grads if x])) else: # If no grad_fn is defined or none of out_grads is available, # just propagates a list of None backwards. in_grads = [None] * len(op.inputs) for t_in, in_grad in zip(op.inputs, in_grads): if in_grad: _SetGrad(grads, t_in, in_grad) if has_control_flow: control_flow_ops.ExitGradWhileContext(op) # update pending count for the inputs of op. for x in op.inputs: pending_count[x.op._id] -= 1 ready = (pending_count[x.op._id] == 0) if has_control_flow and not ready: ready = (pending_count[x.op._id] > 0 and control_flow_ops.IsLoopSwitch(x.op)) if ready: queue.append(x.op) for x in op.control_inputs: pending_count[x._id] -= 1 if pending_count[x._id] is 0: queue.append(x) return [_GetGrad(grads, x) for x in xs]
def training_graph(self, input_data, input_labels, random_seed, data_spec, epoch=None): """Constructs a TF graph for training a random tree. Args: input_data: A tensor or SparseTensor or placeholder for input data. input_labels: A tensor or placeholder for labels associated with input_data. random_seed: The random number generator seed to use for this tree. 0 means use the current time as the seed. data_spec: A list of tf.dtype values specifying the original types of each column. epoch: A tensor or placeholder for the epoch the training data comes from. Returns: The last op in the random tree training graph. """ epoch = [0] if epoch is None else epoch sparse_indices = [] sparse_values = [] sparse_shape = [] if isinstance(input_data, ops.SparseTensor): sparse_indices = input_data.indices sparse_values = input_data.values sparse_shape = input_data.shape input_data = [] # Count extremely random stats. ( node_sums, node_squares, splits_indices, splits_sums, splits_squares, totals_indices, totals_sums, totals_squares, input_leaves, ) = self.training_ops.count_extremely_random_stats( input_data, sparse_indices, sparse_values, sparse_shape, data_spec, input_labels, self.variables.tree, self.variables.tree_thresholds, self.variables.node_to_accumulator_map, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, self.variables.start_epoch, epoch, num_classes=self.params.num_output_columns, regression=self.params.regression, ) node_update_ops = [] node_update_ops.append(state_ops.assign_add(self.variables.node_sums, node_sums)) splits_update_ops = [] splits_update_ops.append( self.training_ops.scatter_add_ndim(self.variables.candidate_split_sums, splits_indices, splits_sums) ) splits_update_ops.append( self.training_ops.scatter_add_ndim(self.variables.accumulator_sums, totals_indices, totals_sums) ) if self.params.regression: node_update_ops.append(state_ops.assign_add(self.variables.node_squares, node_squares)) splits_update_ops.append( self.training_ops.scatter_add_ndim( self.variables.candidate_split_squares, splits_indices, splits_squares ) ) splits_update_ops.append( self.training_ops.scatter_add_ndim(self.variables.accumulator_squares, totals_indices, totals_squares) ) # Sample inputs. update_indices, feature_updates, threshold_updates = self.training_ops.sample_inputs( input_data, sparse_indices, sparse_values, sparse_shape, self.variables.node_to_accumulator_map, input_leaves, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, split_initializations_per_input=(self.params.split_initializations_per_input), split_sampling_random_seed=random_seed, ) update_features_op = state_ops.scatter_update( self.variables.candidate_split_features, update_indices, feature_updates ) update_thresholds_op = state_ops.scatter_update( self.variables.candidate_split_thresholds, update_indices, threshold_updates ) # Calculate finished nodes. with ops.control_dependencies(splits_update_ops): children = array_ops.squeeze(array_ops.slice(self.variables.tree, [0, 0], [-1, 1]), squeeze_dims=[1]) is_leaf = math_ops.equal(constants.LEAF_NODE, children) leaves = math_ops.to_int32(array_ops.squeeze(array_ops.where(is_leaf), squeeze_dims=[1])) finished, stale = self.training_ops.finished_nodes( leaves, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, self.variables.start_epoch, epoch, num_split_after_samples=self.params.split_after_samples, min_split_samples=self.params.min_split_samples, ) # Update leaf scores. non_fertile_leaves = array_ops.boolean_mask( leaves, math_ops.less(array_ops.gather(self.variables.node_to_accumulator_map, leaves), 0) ) # TODO(gilberth): It should be possible to limit the number of non # fertile leaves we calculate scores for, especially since we can only take # at most array_ops.shape(finished)[0] of them. with ops.control_dependencies(node_update_ops): sums = array_ops.gather(self.variables.node_sums, non_fertile_leaves) if self.params.regression: squares = array_ops.gather(self.variables.node_squares, non_fertile_leaves) non_fertile_leaf_scores = self._variance(sums, squares) else: non_fertile_leaf_scores = self._weighted_gini(sums) # Calculate best splits. with ops.control_dependencies(splits_update_ops): split_indices = self.training_ops.best_splits( finished, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, regression=self.params.regression, ) # Grow tree. with ops.control_dependencies([update_features_op, update_thresholds_op]): ( tree_update_indices, tree_children_updates, tree_threshold_updates, tree_depth_updates, new_eot, ) = self.training_ops.grow_tree( self.variables.end_of_tree, self.variables.tree_depths, self.variables.node_to_accumulator_map, finished, split_indices, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, ) tree_update_op = state_ops.scatter_update(self.variables.tree, tree_update_indices, tree_children_updates) thresholds_update_op = state_ops.scatter_update( self.variables.tree_thresholds, tree_update_indices, tree_threshold_updates ) depth_update_op = state_ops.scatter_update( self.variables.tree_depths, tree_update_indices, tree_depth_updates ) # TODO(thomaswc): Only update the epoch on the new leaves. new_epoch_updates = epoch * array_ops.ones_like(tree_depth_updates) epoch_update_op = state_ops.scatter_update( self.variables.start_epoch, tree_update_indices, new_epoch_updates ) # Update fertile slots. with ops.control_dependencies([depth_update_op]): (node_map_updates, accumulators_cleared, accumulators_allocated) = self.training_ops.update_fertile_slots( finished, non_fertile_leaves, non_fertile_leaf_scores, self.variables.end_of_tree, self.variables.tree_depths, self.variables.accumulator_sums, self.variables.node_to_accumulator_map, stale, max_depth=self.params.max_depth, regression=self.params.regression, ) # Ensure end_of_tree doesn't get updated until UpdateFertileSlots has # used it to calculate new leaves. gated_new_eot, = control_flow_ops.tuple([new_eot], control_inputs=[node_map_updates]) eot_update_op = state_ops.assign(self.variables.end_of_tree, gated_new_eot) updates = [] updates.append(eot_update_op) updates.append(tree_update_op) updates.append(thresholds_update_op) updates.append(epoch_update_op) updates.append( state_ops.scatter_update( self.variables.node_to_accumulator_map, array_ops.squeeze(array_ops.slice(node_map_updates, [0, 0], [1, -1]), squeeze_dims=[0]), array_ops.squeeze(array_ops.slice(node_map_updates, [1, 0], [1, -1]), squeeze_dims=[0]), ) ) cleared_and_allocated_accumulators = array_ops.concat(0, [accumulators_cleared, accumulators_allocated]) # Calculate values to put into scatter update for candidate counts. # Candidate split counts are always reset back to 0 for both cleared # and allocated accumulators. This means some accumulators might be doubly # reset to 0 if the were released and not allocated, then later allocated. split_values = array_ops.tile( array_ops.expand_dims( array_ops.expand_dims( array_ops.zeros_like(cleared_and_allocated_accumulators, dtype=dtypes.float32), 1 ), 2, ), [1, self.params.num_splits_to_consider, self.params.num_output_columns], ) updates.append( state_ops.scatter_update( self.variables.candidate_split_sums, cleared_and_allocated_accumulators, split_values ) ) if self.params.regression: updates.append( state_ops.scatter_update( self.variables.candidate_split_squares, cleared_and_allocated_accumulators, split_values ) ) # Calculate values to put into scatter update for total counts. total_cleared = array_ops.tile( array_ops.expand_dims(math_ops.neg(array_ops.ones_like(accumulators_cleared, dtype=dtypes.float32)), 1), [1, self.params.num_output_columns], ) total_reset = array_ops.tile( array_ops.expand_dims(array_ops.zeros_like(accumulators_allocated, dtype=dtypes.float32), 1), [1, self.params.num_output_columns], ) accumulator_updates = array_ops.concat(0, [total_cleared, total_reset]) updates.append( state_ops.scatter_update( self.variables.accumulator_sums, cleared_and_allocated_accumulators, accumulator_updates ) ) if self.params.regression: updates.append( state_ops.scatter_update( self.variables.accumulator_squares, cleared_and_allocated_accumulators, accumulator_updates ) ) # Calculate values to put into scatter update for candidate splits. split_features_updates = array_ops.tile( array_ops.expand_dims(math_ops.neg(array_ops.ones_like(cleared_and_allocated_accumulators)), 1), [1, self.params.num_splits_to_consider], ) updates.append( state_ops.scatter_update( self.variables.candidate_split_features, cleared_and_allocated_accumulators, split_features_updates ) ) updates += self.finish_iteration() return control_flow_ops.group(*updates)
def create_op(self, *args, **kwargs): """Creates an `Operation`. For operations of the following form orig_value = op(*args, **kwargs) this function constructs the following subgraph : v = Variable() if v is not initialized: orig_value = op(*args, **kwargs) v.assign(orig_value) # Initializes v return orig_value else: return v The above transformation is not performed and the original op is returned as is if any of the following is true: * `_return_as_is` flag is set to true. * op_type is listed in _PASS_THROUGH_OPS * op has no outputs. * One of the op's return value has a ref type. Args: *args: Arguments for create_op() **kwargs: Keyword arguments for create_op(). Refer to tensorflow.python.framework.ops.Graph.create_op() for the mandatory and optional arguments. Returns: An Operation. Raises: UnimplementedError: if output type is a reference and the op's type is not one of the supported types in `_REF_OPS_WHITELIST`. """ op_type = kwargs['op_type'] if 'op_type' in kwargs else args[0] output_dtypes = kwargs['dtypes'] if 'dtypes' in kwargs else args[2] output_dtypes = [dtypes.as_dtype(d) for d in output_dtypes] if self._return_as_is or op_type in _PASS_THROUGH_OPS: return self._wrap(super(ImperativeGraph, self).create_op(*args, **kwargs)) if not output_dtypes: return self._wrap( super(ImperativeGraph, self).create_op(*args, **kwargs)) output_has_ref = any([dtype._is_ref_dtype for dtype in output_dtypes]) # pylint: disable=protected-access if output_has_ref: if op_type not in _REF_OPS_WHITELIST: raise errors.UnimplementedError(None, None, op_type + ' op not supported in ' 'imperative graph') ret = super(ImperativeGraph, self).create_op(*args, **kwargs) if self._in_variable_creation: if op_type == 'Assign': self.add_pending_init(ret) return self._wrap(ret) with self.return_as_is(): # Declares the variables to hold the output values of this op. op_output_var = [state_ops.variable_op_v2( tensor_shape.TensorShape(None), dtype, container=self._name) for dtype in output_dtypes] # Ops to free the resources used by the temporary cache variables. # The following two ops are created for each cache variable, # having no control dependencies on any other ops : # var_handle_op ----> destroy_resource_op for dtype, v in zip(output_dtypes, op_output_var): with ops.control_dependencies(None): self._variable_cleanup_ops += [ gen_resource_variable_ops.destroy_resource_op( gen_resource_variable_ops.var_handle_op( dtype, tensor_shape.TensorShape(None), container=self._name, shared_name=v.op.name), ignore_lookup_error=True)] # Create the conditional to run the original op only when the variable # corresponding to the first output is not initialized. inited = state_ops.is_variable_initialized(op_output_var[0]) v_f, v_t = control_flow_ops.ref_switch(op_output_var[0], inited) # pylint: disable=protected-access v_f_op = gen_array_ops._ref_identity(v_f) v_t_op = gen_array_ops._ref_identity(v_t) # pylint: enable=protected-access with ops.control_dependencies([v_f_op.op]): # Create the original op orig_op = self._wrap( super(ImperativeGraph, self).create_op(*args, **kwargs)) shapes = [val.get_shape() for val in orig_op.outputs] controls = [] for var, val in zip(op_output_var, orig_op.outputs): if (not val.get_shape().is_fully_defined() or val.get_shape().num_elements() > 0): assign_op = state_ops.assign(var, val, validate_shape=False) assign_op.set_shape(val.get_shape()) controls.append(assign_op) values = [] if len(controls) > 1: if control_flow_ops.IsSwitch(orig_op): # pylint: disable=protected-access controls = gen_control_flow_ops._ref_merge(controls) # pylint: enable=protected-access else: controls = control_flow_ops.tuple(controls) for var, val in zip(op_output_var, orig_op.outputs): with ops.control_dependencies(controls): with self.colocate_with(v_f_op): real_val = array_ops.identity(val) with ops.control_dependencies([v_t_op.op]): with self.colocate_with(v_t_op): stored_val = array_ops.identity(var) stored_val.set_shape(val.get_shape()) real_val, _ = control_flow_ops.merge([real_val, stored_val]) real_val.op.node_def.attr['_gradient_op_type'].CopyFrom( attr_value_pb2.AttrValue(s=compat.as_bytes(self._merge_op_type))) values.append(real_val) for i, _ in enumerate(shapes): values[i].set_shape(shapes[i]) self._outputs_map[orig_op.name] = values try: self._gradient_function_map[orig_op.name] = ops.get_gradient_function( orig_op) except (KeyError, LookupError): pass else: orig_op.node_def.attr['_gradient_op_type'].CopyFrom( attr_value_pb2.AttrValue( s=compat.as_bytes(self._imperative_op_type))) return MultiOutputOperation(values, orig_op)
def gradients(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None, stop_gradients=None): """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. `gradients()` adds ops to the graph to output the derivatives of `ys` with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, we fill in a tensor of '1's of the shape of y for each y in `ys`. A user can provide their own initial `grad_ys` to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y). `stop_gradients` is a `Tensor` or a list of tensors to be considered constant with respect to all `xs`. These tensors will not be backpropagated through, as though they had been explicitly disconnected using `stop_gradient`. Among other things, this allows computation of partial derivatives as opposed to total derivatives. For example: ```python a = tf.constant(0.) b = 2 * a g = tf.gradients(a + b, [a, b], stop_gradients=[a, b]) ``` Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the total derivatives `tf.gradients(a + b, [a, b])`, which take into account the influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is equivalent to: ```python a = tf.stop_gradient(tf.constant(0.)) b = tf.stop_gradient(2 * a) g = tf.gradients(a + b, [a, b]) ``` `stop_gradients` provides a way of stopping gradient after the graph has already been constructed, as compared to `tf.stop_gradient` which is used during graph construction. When the two approaches are combined, backpropagation stops at both `tf.stop_gradient` nodes and nodes in `stop_gradients`, whichever is encountered first. Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys` and holding the gradients computed for each y in `ys`. name: Optional name to use for grouping all the gradient ops together. defaults to 'gradients'. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. gate_gradients: If True, add a tuple around the gradients returned for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate through. Returns: A list of `sum(dy/dx)` for each x in `xs`. Raises: LookupError: if one of the operations between `x` and `y` does not have a registered gradient function. ValueError: if the arguments are invalid. RuntimeError: if called in Eager mode. """ if context.in_eager_mode(): raise RuntimeError("tf.gradients not supported in EAGER mode. Use " "functions in tf.contrib.eager.backprop instead.") ys = _AsList(ys) xs = _AsList(xs) stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.name_scope( name, "gradients", list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope: ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = [ x.handle if isinstance(x, resource_variable_ops.ResourceVariable) else x for x in xs ] xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs, name="x", as_ref=True) grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. if len(ys) > 1: ys = [array_ops.identity(y) if y.consumers() else y for y in ys] to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] stop_gradient_ops = [t.op for t in stop_gradients] pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. # pylint: disable=protected-access ready = (pending_count[op._id] == 0) if ready and op._id not in to_ops_set: to_ops_set.add(op._id) queue.append(op) # pylint: enable=protected-access if loop_state: loop_exits = loop_state.ProcessUnusedLoopExits( pending_count, to_ops_set) for y in loop_exits: if _IsTrainable(y): _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() with _maybe_colocate_with(op, colocate_gradients_with_ops): if loop_state: loop_state.EnterGradWhileContext(op, before=True) out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method) if loop_state: loop_state.ExitGradWhileContext(op, before=True) grad_fn = None # pylint: disable=protected-access func_call = None is_func_call = ops.get_default_graph()._is_function(op.type) has_out_grads = any( isinstance(g, ops.Tensor) or g for g in out_grads) if has_out_grads and (op._id not in stop_ops): if is_func_call: func_call = ops.get_default_graph()._get_function( op.type) grad_fn = func_call.python_grad_func # pylint: enable=protected-access else: # A grad_fn must be defined, either as a function or as None # for ops that do not have gradients. try: grad_fn = ops.get_gradient_function(op) except LookupError: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if loop_state: loop_state.EnterGradWhileContext(op, before=False) if (grad_fn or is_func_call) and has_out_grads: # NOTE: If _AggregatedGrads didn't compute a value for the i'th id:3537 gh:3538 # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if (not isinstance(out_grad, ops.Tensor) and not out_grad) and ( (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])): # Only trainable outputs or outputs for a function call that # will use SymbolicGradient get a zero gradient. Gradient # functions should ignore the gradient for other outputs. # TODO (apassos) gradients of resource handles might be an id:3152 gh:3153 # issue here because of zeros. if loop_state: out_grads[i] = loop_state.ZerosLike(op, i) else: out_grads[ i] = control_flow_ops.ZerosLikeOutsideLoop( op, i) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with ops.get_default_graph()._original_op(op): # pylint: enable=protected-access if grad_fn: # If grad_fn was found, do not use SymbolicGradient even for # functions. in_grads = _MaybeCompile( grad_scope, op, func_call, lambda: grad_fn(op, *out_grads)) else: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. in_grads = _MaybeCompile( grad_scope, op, func_call, lambda: _SymGrad(op, out_grads)) in_grads = _AsList(in_grads) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len( [x for x in in_grads if x is not None]) > 1: with ops.device(None): with ops.colocate_with( None, ignore_existing=True): in_grads = control_flow_ops.tuple( in_grads) _LogOpGradients(op, out_grads, in_grads) else: # If no grad_fn is defined or none of out_grads is available, # just propagate a list of None backwards. in_grads = [None] * len(op.inputs) for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)): if in_grad is not None: if (isinstance(in_grad, ops.Tensor) and t_in.dtype != dtypes.resource): try: in_grad.set_shape(t_in.get_shape()) except ValueError: raise ValueError( "Incompatible shapes between op input and calculated " "input gradient. Forward operation: %s. Input index: %d. " "Original input shape: %s. " "Calculated input gradient shape: %s" % (op.name, i, t_in.shape, in_grad.shape)) _SetGrad(grads, t_in, in_grad) if loop_state: loop_state.ExitGradWhileContext(op, before=False) # Update pending count for the inputs of op and enqueue ready ops. _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state) if loop_state: loop_state.PostProcessing() return [_GetGrad(grads, x) for x in xs]
def _GradientsHelper(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None, stop_gradients=None, unconnected_gradients=UnconnectedGradients.NONE, src_graph=None): """Implementation of gradients().""" if context.executing_eagerly(): raise RuntimeError( "tf.gradients is not supported when eager execution " "is enabled. Use tf.GradientTape instead.") if src_graph is None: src_graph = ops.get_default_graph() try: unconnected_gradients = UnconnectedGradients(unconnected_gradients) except ValueError: raise ValueError("Unknown value for unconnected_gradients: %r" % unconnected_gradients) # If src_graph is a _FuncGraph (i.e. a function body), gather it and all # ancestor graphs. This is necessary for correctly handling captured values. func_graphs = [] curr_graph = src_graph while _IsFunction(curr_graph): func_graphs.append(curr_graph) if isinstance(curr_graph, FuncGraph): curr_graph = curr_graph.outer_graph else: assert isinstance(curr_graph, framework_function._FuncGraph) # pylint: disable=protected-access curr_graph = curr_graph._outer_graph # pylint: disable=protected-access ys = _AsList(ys) xs = _AsList(xs) stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.name_scope( name, "gradients", list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope: # Get a uid for this call to gradients that can be used to help # cluster ops for compilation. gradient_uid = ops.get_default_graph().unique_name("uid") ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = [ x.handle if resource_variable_ops.is_resource_variable(x) else x for x in xs ] xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs, name="x", as_ref=True) xs_set = object_identity.ObjectIdentitySet(xs) grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops, gradient_uid) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] stop_gradient_ops = [t.op for t in stop_gradients] reachable_to_ops, pending_count, loop_state = _PendingCount( to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs_set) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. ready = (pending_count[op] == 0) if ready and op not in to_ops_set and op in reachable_to_ops: to_ops_set.add(op) queue.append(op) if loop_state: loop_exits = loop_state.ProcessUnusedLoopExits( pending_count, to_ops_set) for y in loop_exits: if backprop_util.IsTrainable(y): _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs_set) while queue: # generate gradient subgraph for op. op = queue.popleft() with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops): if loop_state: loop_state.EnterGradWhileContext(op, before=True) out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state, aggregation_method) if loop_state: loop_state.ExitGradWhileContext(op, before=True) grad_fn = None func_call = None is_partitioned_call = _IsPartitionedCall(op) # pylint: disable=protected-access is_func_call = (src_graph._is_function(op.type) or is_partitioned_call) # pylint: enable=protected-access has_out_grads = any( isinstance(g, ops.Tensor) or g for g in out_grads) if has_out_grads and (op not in stop_ops): try: grad_fn = ops.get_gradient_function(op) except LookupError: if is_func_call: if is_partitioned_call: func_call = src_graph._get_function( # pylint: disable=protected-access compat.as_bytes(op.get_attr("f").name)) else: func_call = src_graph._get_function(op.type) # pylint: disable=protected-access # Note that __defun is not set if the graph is # imported. If it's set, we prefer to access the original # defun. func_call = getattr(op, "__defun", func_call) grad_fn = func_call.python_grad_func else: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if loop_state: loop_state.EnterGradWhileContext(op, before=False) # NOTE(skyewm): We don't support computing gradients wrt a loop variable # unless it's within the context of a single iteration (i.e. the # gradient is wrt to the loop parameter in the body function, not wrt or # through the initial value). This means if we're in a while loop # context, we should never see a switch node from this context. # pylint: disable=protected-access if (control_flow_util.IsSwitch(op) and op._control_flow_context is not None and op._control_flow_context.IsWhileContext() and op._control_flow_context == ops.get_default_graph()._get_control_flow_context()): _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs_set) # pylint: enable=protected-access if (grad_fn or is_func_call) and has_out_grads: # NOTE: If _AggregatedGrads didn't compute a value for the i'th # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if (not isinstance(out_grad, ops.Tensor) and not out_grad) and ( (not grad_fn and is_func_call) or backprop_util.IsTrainable(op.outputs[i])): # Only trainable outputs or outputs for a function call that # will use SymbolicGradient get a zero gradient. Gradient # functions should ignore the gradient for other outputs. # TODO(apassos) gradients of resource handles might be an # issue here because of zeros. if loop_state: out_grads[i] = loop_state.ZerosLike(op, i) elif default_gradient.supports_default_grad( op.outputs[i]): # TODO(b/143286622): The supports_default_grad check is needed # because While op emits non-differentiable resource tensors # as outputs. Remove this check when that is not the case. out_grads[ i] = control_flow_state.ZerosLikeOutsideLoop( op, i) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with src_graph._original_op(op): # pylint: enable=protected-access if grad_fn: # If grad_fn was found, do not use SymbolicGradient even for # functions. in_grads = _MaybeCompile( grad_scope, op, func_call, lambda: grad_fn(op, *out_grads)) else: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. in_grads = _MaybeCompile( grad_scope, op, func_call, lambda: _SymGrad(op, out_grads)) in_grads = _AsList(in_grads) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len( [x for x in in_grads if x is not None]) > 1: with ops.device(None): with ops._colocate_with_for_gradient( # pylint: disable=protected-access None, gradient_uid, ignore_existing=True): in_grads = control_flow_ops.tuple( in_grads) _LogOpGradients(op, out_grads, in_grads) else: # If no grad_fn is defined or none of out_grads is available, # just propagate a list of None backwards. in_grads = [None] * len(_Inputs(op, xs_set)) # Note: we don't filter out eager inputs here because the inputs need to # line up with in_grads. for i, (t_in, in_grad) in enumerate( zip(_Inputs(op, xs_set), in_grads)): if in_grad is not None: if (isinstance(in_grad, ops.Tensor) and t_in.dtype != dtypes.resource): try: in_grad.set_shape(t_in.get_shape()) except ValueError: raise ValueError( "Incompatible shapes between op input and calculated " "input gradient. Forward operation: %s. Input index: %d. " "Original input shape: %s. " "Calculated input gradient shape: %s" % (op.name, i, t_in.shape, in_grad.shape)) if not isinstance(t_in, ops.EagerTensor): _SetGrad(grads, t_in, in_grad) if loop_state: loop_state.ExitGradWhileContext(op, before=False) # Update pending count for the inputs of op and enqueue ready ops. _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state, xs_set) if loop_state: loop_state.PostProcessing() return [_GetGrad(grads, x, unconnected_gradients) for x in xs]
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): """Compute gradients of `loss` for the variables in `var_list`. This is the first part of `minimize()`. It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable". Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if there is no gradient for the given variable. Args: loss: A Tensor containing the value to minimize. var_list: Optional list or tuple of `tf.Variable` to update to minimize `loss`. Defaults to the list of variables collected in the graph under the key `GraphKey.TRAINABLE_VARIABLES`. gate_gradients: How to gate the computation of gradients. Can be `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. Returns: A list of (gradient, variable) pairs. Variable is always present, but gradient can be `None`. Raises: TypeError: If `var_list` contains anything else than `Variable` objects. ValueError: If some arguments are invalid. RuntimeError: If called with eager execution enabled and if `grad_loss` is not `None` or `loss` is not callable. @compatibility(eager) When eager execution is enabled, `loss` should be a Python function that takes elements of `var_list` as arguments and computes the value to be minimized. If `var_list` is None, `loss` should take no arguments. Gradient computation is done with respect to the elements of `var_list` if not None, else with respect to any trainable variables created during the execution of the `loss` function. `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and `grad_loss` are ignored when eager execution is enabled. @end_compatibility """ if context.in_eager_mode(): if grad_loss is not None: raise RuntimeError( "`grad_loss` argument to Optimizer.compute_gradients " "not supported when eager execution is enabled.") if not callable(loss): raise RuntimeError( "`loss` passed to Optimizer.compute_gradients should " "be a function when eager execution is enabled.") # TODO(agarwal): consider passing parameters to the `loss` function. if var_list is None: return backprop.implicit_grad(loss)() else: var_list = nest.flatten(var_list) grads = backprop.gradients_function(loss)(*var_list) grads_and_vars = list(zip(grads, var_list)) return grads_and_vars if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH]: raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if grad_loss is not None: self._assert_valid_dtypes([grad_loss]) if var_list is None: var_list = ( variables.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) else: var_list = nest.flatten(var_list) # pylint: disable=protected-access var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS) # pylint: enable=protected-access processors = [_get_processor(v) for v in var_list] if not var_list: raise ValueError("No variables to optimize.") var_refs = [p.target() for p in processors] grads = gradients.gradients( loss, var_refs, grad_ys=grad_loss, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes( [v for g, v in grads_and_vars if g is not None and v.dtype != dtypes.resource]) return grads_and_vars
def execute(self, fn, *args, **kwargs): """Execute function `fn(*args, **kwargs)` inside the CriticalSection. Args: fn: The function to execute. Must return at least one tensor. *args: Additional positional arguments to `fn`. **kwargs: Additional keyword arguments to `fn`. Several keywords are reserved for `execute`. These are: - name; The name to use when creating the execute operation. - exclusive_resource_access; Whether the resources required by `fn` should be exclusive to this `CriticalSection`. Default: `True`. You may want to set this to `False` if you will be accessing a resource in read-only mode in two different CriticalSections. Returns: The tensors returned from `fn(*args, **kwargs)`. Raises: ValueError: If `fn` attempts to use this `CriticalSection` in any nested way. ValueError: If `exclusive_resource_access` is not provided (is `True`) and another `CriticalSection` has an execution requesting the same resources as in `*args`, `**kwargs`, and any additionaly captured inputs in `fn`. Note, even if `exclusive_resource_access` is `True`, if another execution in another `CriticalSection` was created without `exclusive_resource_access=True`, a `ValueError` will be raised. """ name = kwargs.pop("name", None) exclusive_resource_access = kwargs.pop("exclusive_resource_access", True) with ops.name_scope(name, "critical_section_execute", []): lock = gen_resource_variable_ops.mutex_lock(self._handle) with ops.control_dependencies([lock]): c_known_ops = set() c_captured_tensors = set() def add_op_internal(op): c_known_ops.add(op) for i in op.inputs: if i.op not in c_known_ops: c_captured_tensors.add(i) c = function.HelperContext(add_op_internal) with c: r = fn(*args, **kwargs) resource_inputs = set([ x for x in list(nest.flatten(args)) + nest.flatten(kwargs.values()) + list(c_captured_tensors) if tensor_util.is_tensor(x) and x.dtype == dtypes.resource]) if self._handle in resource_inputs: raise ValueError("The function fn attempts to access the " "CriticalSection in which it would be running. " "This is illegal and would cause deadlocks. " "CriticalSection: %s." % self._handle) if not context.executing_eagerly(): # Collections and op introspection does not work in eager # mode. This is generally ok; since eager mode (as of # writing) executes sequentially anyway. for sg in ops.get_collection(CRITICAL_SECTION_EXECUTIONS): sg_handle_name = ops.convert_to_tensor(sg.handle).name self_handle_name = ops.convert_to_tensor(self._handle).name if sg_handle_name == self_handle_name: # Other executions in the same critical section are allowed. continue if not (exclusive_resource_access or sg.exclusive_resource_access): # Neither execution requested exclusive access. continue resource_intersection = resource_inputs.intersection(sg.resources) if resource_intersection: raise ValueError( "This execution would access resources: %s. Either this " "lock (CriticalSection: %s) or lock '%s' " "(CriticalSection: %s) requested exclusive resource access " "of this resource. Did you mean to call execute with keyword " "argument exclusive_resource_access=False?" % (list(resource_intersection), self._handle.name, sg.op.name, sg.handle.name)) def identity(x): # pylint: disable=invalid-name if isinstance(x, tensor_array_ops.TensorArray): return x.identity() elif isinstance(x, ops.Operation): return control_flow_ops.group(x) elif context.executing_eagerly() and x is None: return None else: return array_ops.identity(x) r_flat = [identity(x) for x in nest.flatten(r)] with ops.control_dependencies(r_flat): # The identity must run on the same machine as self._handle with ops.colocate_with(self._handle): # Do not use array_ops.identity as there are special # optimizations within TensorFlow which seem to elide it # even when optimizations are disabled(!). ensure_lock_exists = gen_resource_variable_ops.consume_mutex_lock( lock) # Make sure that if any element of r is accessed, all of # them are executed together. r = nest.pack_sequence_as( r, control_flow_ops.tuple(nest.flatten(r))) with ops.control_dependencies([ensure_lock_exists]): outputs = nest.map_structure(identity, r) if not context.executing_eagerly(): signature = _ExecutionSignature( op=lock.op, handle=self._handle, resources=list(resource_inputs), exclusive_resource_access=exclusive_resource_access) ops.add_to_collections( CRITICAL_SECTION_EXECUTIONS, signature) return outputs
def gradients(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None): """Constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. `gradients()` adds ops to the graph to output the partial derivatives of `ys` with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, we fill in a tensor of '1's of the shape of y for each y in `ys`. A user can provide their own initial `grad_ys` to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y). Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys` and holding the gradients computed for each y in `ys`. name: Optional name to use for grouping all the gradient ops together. defaults to 'gradients'. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. gate_gradients: If True, add a tuple around the gradients returned for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. Returns: A list of `sum(dy/dx)` for each x in `xs`. Raises: LookupError: if one of the operations between `x` and `y` does not have a registered gradient function. ValueError: if the arguments are invalid. """ ys = _AsList(ys) xs = _AsList(xs) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.name_scope(name, "gradients", ys + xs + grad_ys): ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x") grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. # pylint: disable=protected-access ready = (pending_count[op._id] == 0) if ready and op._id not in to_ops_set: to_ops_set.add(op._id) queue.append(op) # pylint: enable=protected-access if loop_state: loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set) for y in loop_exits: if _IsTrainable(y): _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) # The set of 'from_ops'. stop_ops = _StopOps(from_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() with _maybe_colocate_with(op, colocate_gradients_with_ops): if loop_state: loop_state.EnterGradWhileContext(op, before=True) out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method) if loop_state: loop_state.ExitGradWhileContext(op, before=True) grad_fn = None # pylint: disable=protected-access is_func_call = ops.get_default_graph()._is_function(op.type) has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads) if has_out_grads and (op._id not in stop_ops): if is_func_call: grad_fn = ops.get_default_graph()._get_function( op.type).python_grad_func # pylint: enable=protected-access else: # A grad_fn must be defined, either as a function or as None # for ops that do not have gradients. try: grad_fn = ops.get_gradient_function(op) except LookupError: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if loop_state: loop_state.EnterGradWhileContext(op, before=False) if (grad_fn or is_func_call) and has_out_grads: # NOTE: If _AggregatedGrads didn't compute a value for the i'th # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if (not isinstance(out_grad, ops.Tensor) and not out_grad) and _IsTrainable(op.outputs[i]): # Only floating-point outputs get a zero gradient. Gradient # functions should ignore the gradient for other outputs. if loop_state: out_grads[i] = loop_state.ZerosLike(op, i) else: out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with ops.get_default_graph()._original_op(op): # pylint: enable=protected-access if grad_fn: # If grad_fn was found, do not use SymbolicGradient even for # functions. in_grads = grad_fn(op, *out_grads) else: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. in_grads = _SymGrad(op, out_grads) in_grads = _AsList(in_grads) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len( [x for x in in_grads if x is not None]) > 1: in_grads = control_flow_ops.tuple(in_grads) _LogOpGradients(op, out_grads, in_grads) else: # If no grad_fn is defined or none of out_grads is available, # just propagate a list of None backwards. in_grads = [None] * len(op.inputs) for t_in, in_grad in zip(op.inputs, in_grads): if in_grad is not None: if isinstance(in_grad, ops.Tensor): in_grad.set_shape(t_in.get_shape()) _SetGrad(grads, t_in, in_grad) if loop_state: loop_state.ExitGradWhileContext(op, before=False) # Update pending count for the inputs of op and enqueue ready ops. _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state) if loop_state: loop_state.PostProcessing() return [_GetGrad(grads, x) for x in xs]
def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients): """Implementation of gradients().""" if context.executing_eagerly(): raise RuntimeError("tf.gradients not supported when eager execution " "is enabled. Use tf.contrib.eager.GradientTape " "instead.") ys = _AsList(ys) xs = _AsList(xs) stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.name_scope( name, "gradients", list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope: # Get a uid for this call to gradients that can be used to help # cluster ops for compilation. gradient_uid = ops.get_default_graph().unique_name("uid") ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = [ x.handle if resource_variable_ops.is_resource_variable(x) else x for x in xs ] xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs, name="x", as_ref=True) grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops, gradient_uid) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. if len(ys) > 1: ys = [array_ops.identity(y) if y.consumers() else y for y in ys] to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] stop_gradient_ops = [t.op for t in stop_gradients] reachable_to_ops, pending_count, loop_state = _PendingCount( ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. # pylint: disable=protected-access ready = (pending_count[op._id] == 0) if ready and op._id not in to_ops_set and op._id in reachable_to_ops: to_ops_set.add(op._id) queue.append(op) # pylint: enable=protected-access if loop_state: loop_exits = loop_state.ProcessUnusedLoopExits( pending_count, to_ops_set) for y in loop_exits: if _IsTrainable(y): _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops): if loop_state: loop_state.EnterGradWhileContext(op, before=True) out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state, aggregation_method) if loop_state: loop_state.ExitGradWhileContext(op, before=True) grad_fn = None func_call = None # pylint: disable=protected-access is_func_call = ops.get_default_graph()._is_function(op.type) # pylint: enable=protected-access has_out_grads = any( isinstance(g, ops.Tensor) or g for g in out_grads) if has_out_grads and (op._id not in stop_ops): if is_func_call: func_call = ops.get_default_graph()._get_function( op.type) # Note that __defun is not set if the graph is # imported. If it's set, we prefer to access the original # defun. func_call = getattr(op, "__defun", func_call) grad_fn = func_call.python_grad_func else: # A grad_fn must be defined, either as a function or as None # for ops that do not have gradients. try: grad_fn = ops.get_gradient_function(op) except LookupError: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if loop_state: loop_state.EnterGradWhileContext(op, before=False) if (grad_fn or is_func_call) and has_out_grads: # NOTE: If _AggregatedGrads didn't compute a value for the i'th # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if (not isinstance(out_grad, ops.Tensor) and not out_grad) and ( (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])): # Only trainable outputs or outputs for a function call that # will use SymbolicGradient get a zero gradient. Gradient # functions should ignore the gradient for other outputs. # TODO(apassos) gradients of resource handles might be an # issue here because of zeros. if loop_state: out_grads[i] = loop_state.ZerosLike(op, i) else: out_grads[ i] = control_flow_ops.ZerosLikeOutsideLoop( op, i) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with ops.get_default_graph()._original_op(op): # pylint: enable=protected-access if grad_fn: # If grad_fn was found, do not use SymbolicGradient even for # functions. in_grads = _MaybeCompile( grad_scope, op, func_call, lambda: grad_fn(op, *out_grads)) else: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. in_grads = _MaybeCompile( grad_scope, op, func_call, lambda: _SymGrad(op, out_grads)) in_grads = _AsList(in_grads) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len( [x for x in in_grads if x is not None]) > 1: with ops.device(None): with ops._colocate_with_for_gradient( # pylint: disable=protected-access None, gradient_uid, ignore_existing=True): in_grads = control_flow_ops.tuple( in_grads) _LogOpGradients(op, out_grads, in_grads) else: # If no grad_fn is defined or none of out_grads is available, # just propagate a list of None backwards. in_grads = [None] * len(op.inputs) for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)): if in_grad is not None: if (isinstance(in_grad, ops.Tensor) and t_in.dtype != dtypes.resource): try: in_grad.set_shape(t_in.get_shape()) except ValueError: raise ValueError( "Incompatible shapes between op input and calculated " "input gradient. Forward operation: %s. Input index: %d. " "Original input shape: %s. " "Calculated input gradient shape: %s" % (op.name, i, t_in.shape, in_grad.shape)) _SetGrad(grads, t_in, in_grad) if loop_state: loop_state.ExitGradWhileContext(op, before=False) # Update pending count for the inputs of op and enqueue ready ops. _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state) if loop_state: loop_state.PostProcessing() return [_GetGrad(grads, x) for x in xs]
def execute(self, fn, exclusive_resource_access=True, name=None): """Execute function `fn()` inside the critical section. `fn` should not accept any arguments. To add extra arguments to when calling `fn` in the critical section, create a lambda: ```python critical_section.execute(lambda: fn(*my_args, **my_kwargs)) ``` Args: fn: The function to execute. Must return at least one tensor. exclusive_resource_access: Whether the resources required by `fn` should be exclusive to this `CriticalSection`. Default: `True`. You may want to set this to `False` if you will be accessing a resource in read-only mode in two different CriticalSections. name: The name to use when creating the execute operation. Returns: The tensors returned from `fn()`. Raises: ValueError: If `fn` attempts to lock this `CriticalSection` in any nested or lazy way that may cause a deadlock. ValueError: If `exclusive_resource_access == True` and another `CriticalSection` has an execution requesting the same resources as `fn``. Note, even if `exclusive_resource_access` is `True`, if another execution in another `CriticalSection` was created without `exclusive_resource_access=True`, a `ValueError` will be raised. """ with ops.name_scope(name, "critical_section_execute", []): # Ensure that mutex locking only happens *after* all args and # kwargs have been executed. This avoids certain types of deadlocks. lock = gen_resource_variable_ops.mutex_lock(self._handle) if not context.executing_eagerly(): # NOTE(ebrevdo): This is to ensure we don't pick up spurious # Operations created by other threads. with ops.get_default_graph()._lock: # pylint: disable=protected-access existing_ops = ops.get_default_graph().get_operations() with ops.control_dependencies([lock]): r = fn() # TODO(ebrevdo): If creating critical sections in a python loop, this # makes graph creation time quadratic. Revisit if this # becomes a problem. created_ops = (set(ops.get_default_graph().get_operations()) .difference(existing_ops)) else: with ops.control_dependencies([lock]): r = fn() if not context.executing_eagerly(): self._add_control_dependencies_to_lock(created_ops, lock.op) # captured_resources is a list of resources that are directly # accessed only by ops created during fn(), not by any # ancestors of those ops in the graph. captured_resources = set([ input_ for op in created_ops for input_ in op.inputs if input_.dtype == dtypes.resource ]) # NOTE(ebrevdo): The only time self._is_self_handle() is True # in this call is if one of the recently created ops, within # the execute(), themselves attempt to access the # CriticalSection. This will cause a deadlock. if any(self._is_self_handle(x) for x in captured_resources): raise ValueError("The function fn attempts to directly access the " "CriticalSection in which it would be running. " "This is illegal and would cause deadlocks.") self._check_multiple_access_to_resources( captured_resources, exclusive_resource_access) r_flat = [_identity(x) for x in nest.flatten(r)] with ops.control_dependencies(r_flat): # The identity must run on the same machine as self._handle with ops.colocate_with(self._handle): # Do not use array_ops.identity as there are special # optimizations within TensorFlow which seem to elide it # even when optimizations are disabled(!). ensure_lock_exists = gen_resource_variable_ops.consume_mutex_lock( lock) # Make sure that if any element of r is accessed, all of # them are executed together. r = nest.pack_sequence_as(r, control_flow_ops.tuple(nest.flatten(r))) with ops.control_dependencies([ensure_lock_exists]): outputs = nest.map_structure(_identity, r) if not context.executing_eagerly(): signature = _ExecutionSignature( op=lock.op, handle=self._handle, resources=list(captured_resources), exclusive_resource_access=exclusive_resource_access) ops.add_to_collections( CRITICAL_SECTION_EXECUTIONS, signature) return outputs
def compute_gradients( self, loss, var_list=None, # Copy-paste and black magic gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): if callable(loss): with backprop.GradientTape() as tape: if var_list is not None: tape.watch(var_list) loss_value = loss() if var_list is None: var_list = tape.watched_variables() grads = tape.gradient(loss_value, var_list, grad_loss) return list(zip(grads, var_list)) if gate_gradients not in [ optimizer.Optimizer.GATE_NONE, optimizer.Optimizer.GATE_OP, optimizer.Optimizer.GATE_GRAPH ]: raise ValueError( "gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if grad_loss is not None: self._assert_valid_dtypes([grad_loss]) if var_list is None: var_list = ( tf.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) else: var_list = nest.flatten(var_list) # pylint: disable=protected-access var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS) # pylint: enable=protected-access processors = [optimizer._get_processor(v) for v in var_list] if not var_list: raise ValueError("No variables to optimize.") var_refs = [p.target() for p in processors] act_refs = [] if (self._B == []): stddev = math_ops.cast(self._stddev_t, tf.float32) for v in var_refs: shape = [10] + v.get_shape().as_list() shape.reverse() if (self._stddev == 0): stddev = tf.rsqrt(math_ops.cast(v.shape[-1], tf.float32)) self._B.append( tf.Variable(tf.random_uniform(shape, minval=tf.multiply( -1., stddev), maxval=stddev), trainable=False, name=v.op.name + '/B') ) # random matrix, uniform distribution between [-stddev, stddev] act_name = "/".join(v.op.name.split("/")[:-1]) + "/activations" act_refs.append(tf.get_default_graph().get_operation_by_name( act_name).outputs[0]) f_grad = tf.reduce_mean(tf.gradients(loss, act_refs[-1])[0], axis=0) grads = [ tf.multiply( tf.reduce_sum(tf.transpose(tf.multiply(f_grad, self._B[i])), axis=0), tf.gradients( act_refs[i], var_refs[i], grad_ys=grad_loss, gate_gradients=( gate_gradients == optimizer.Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) [0]) for i in range(0, len(var_refs) - 2) ] grads.append( tf.gradients( loss, var_refs[-2], grad_ys=grad_loss, gate_gradients=(gate_gradients == optimizer.Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops)[0]) grads.append( tf.gradients( loss, var_refs[-1], gate_gradients=(gate_gradients == optimizer.Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops)[0] ) # Get the gradients for the final layer as tensorflow does normally(?) if gate_gradients == optimizer.Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes([ v for g, v in grads_and_vars if g is not None and v.dtype != dtypes.resource ]) return grads_and_vars
def gradients(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None, stop_gradients=None): """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. `gradients()` adds ops to the graph to output the derivatives of `ys` with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, we fill in a tensor of '1's of the shape of y for each y in `ys`. A user can provide their own initial `grad_ys` to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y). `stop_gradients` is a `Tensor` or a list of tensors to be considered constant with respect to all `xs`. These tensors will not be backpropagated through, as though they had been explicitly disconnected using `stop_gradient`. Among other things, this allows computation of partial derivatives as opposed to total derivatives. For example: ```python a = tf.constant(0.) b = 2 * a g = tf.gradients(a + b, [a, b], stop_gradients=[a, b]) ``` Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the total derivatives `tf.gradients(a + b, [a, b])`, which take into account the influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is equivalent to: ```python a = tf.stop_gradient(tf.constant(0.)) b = tf.stop_gradient(2 * a) g = tf.gradients(a + b, [a, b]) ``` `stop_gradients` provides a way of stopping gradient after the graph has already been constructed, as compared to `tf.stop_gradient` which is used during graph construction. When the two approaches are combined, backpropagation stops at both `tf.stop_gradient` nodes and nodes in `stop_gradients`, whichever is encountered first. Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys` and holding the gradients computed for each y in `ys`. name: Optional name to use for grouping all the gradient ops together. defaults to 'gradients'. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. gate_gradients: If True, add a tuple around the gradients returned for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate through. Returns: A list of `sum(dy/dx)` for each x in `xs`. Raises: LookupError: if one of the operations between `x` and `y` does not have a registered gradient function. ValueError: if the arguments are invalid. RuntimeError: if called in Eager mode. """ if context.in_eager_mode(): raise RuntimeError("tf.gradients not supported in EAGER mode. Use " "functions in tf.contrib.eager.backprop instead.") ys = _AsList(ys) xs = _AsList(xs) stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.name_scope( name, "gradients", list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope: ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = [ x.handle if isinstance(x, resource_variable_ops.ResourceVariable) else x for x in xs ] xs = ops.internal_convert_n_to_tensor_or_indexed_slices( xs, name="x", as_ref=True) grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. if len(ys) > 1: ys = [array_ops.identity(y) if y.consumers() else y for y in ys] to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] stop_gradient_ops = [t.op for t in stop_gradients] pending_count, loop_state = _PendingCount( ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. # pylint: disable=protected-access ready = (pending_count[op._id] == 0) if ready and op._id not in to_ops_set: to_ops_set.add(op._id) queue.append(op) # pylint: enable=protected-access if loop_state: loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set) for y in loop_exits: if _IsTrainable(y): _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() with _maybe_colocate_with(op, colocate_gradients_with_ops): if loop_state: loop_state.EnterGradWhileContext(op, before=True) out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method) if loop_state: loop_state.ExitGradWhileContext(op, before=True) grad_fn = None # pylint: disable=protected-access func_call = None is_func_call = ops.get_default_graph()._is_function(op.type) has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads) if has_out_grads and (op._id not in stop_ops): if is_func_call: func_call = ops.get_default_graph()._get_function(op.type) grad_fn = func_call.python_grad_func # pylint: enable=protected-access else: # A grad_fn must be defined, either as a function or as None # for ops that do not have gradients. try: grad_fn = ops.get_gradient_function(op) except LookupError: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if loop_state: loop_state.EnterGradWhileContext(op, before=False) if (grad_fn or is_func_call) and has_out_grads: # NOTE: If _AggregatedGrads didn't compute a value for the i'th # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if (not isinstance(out_grad, ops.Tensor) and not out_grad) and ( (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])): # Only trainable outputs or outputs for a function call that # will use SymbolicGradient get a zero gradient. Gradient # functions should ignore the gradient for other outputs. # TODO(apassos) gradients of resource handles might be an # issue here because of zeros. if loop_state: out_grads[i] = loop_state.ZerosLike(op, i) else: out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with ops.get_default_graph()._original_op(op): # pylint: enable=protected-access if grad_fn: # If grad_fn was found, do not use SymbolicGradient even for # functions. in_grads = _MaybeCompile(grad_scope, op, func_call, lambda: grad_fn(op, *out_grads)) else: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. in_grads = _MaybeCompile(grad_scope, op, func_call, lambda: _SymGrad(op, out_grads)) in_grads = _AsList(in_grads) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len([x for x in in_grads if x is not None]) > 1: with ops.device(None): with ops.colocate_with(None, ignore_existing=True): in_grads = control_flow_ops.tuple(in_grads) _LogOpGradients(op, out_grads, in_grads) else: # If no grad_fn is defined or none of out_grads is available, # just propagate a list of None backwards. in_grads = [None] * len(op.inputs) for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)): if in_grad is not None: if (isinstance(in_grad, ops.Tensor) and t_in.dtype != dtypes.resource): try: in_grad.set_shape(t_in.get_shape()) except ValueError: raise ValueError( "Incompatible shapes between op input and calculated " "input gradient. Forward operation: %s. Input index: %d. " "Original input shape: %s. " "Calculated input gradient shape: %s" % (op.name, i, t_in.shape, in_grad.shape)) _SetGrad(grads, t_in, in_grad) if loop_state: loop_state.ExitGradWhileContext(op, before=False) # Update pending count for the inputs of op and enqueue ready ops. _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state) if loop_state: loop_state.PostProcessing() return [_GetGrad(grads, x) for x in xs]
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): """Compute gradients of `loss` for the variables in `var_list`. This is the first part of `minimize()`. It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable". Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if there is no gradient for the given variable. Args: loss: A Tensor containing the value to minimize or a callable taking no arguments which returns the value to minimize. When eager execution is enabled it must be a callable. var_list: Optional list or tuple of `tf.Variable` to update to minimize `loss`. Defaults to the list of variables collected in the graph under the key `GraphKeys.TRAINABLE_VARIABLES`. gate_gradients: How to gate the computation of gradients. Can be `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. Returns: A list of (gradient, variable) pairs. Variable is always present, but gradient can be `None`. Raises: TypeError: If `var_list` contains anything else than `Variable` objects. ValueError: If some arguments are invalid. RuntimeError: If called with eager execution enabled and `loss` is not callable. @compatibility(eager) When eager execution is enabled, `gate_gradients`, `aggregation_method`, and `colocate_gradients_with_ops` are ignored. @end_compatibility """ if callable(loss): with backprop.GradientTape() as tape: if var_list is not None: tape.watch(var_list) loss_value = loss() # Scale loss if using a "mean" loss reduction and multiple replicas. # Have to be careful to call distribute_lib.get_loss_reduction() # *after* loss() is evaluated, so we know what loss reduction it uses. # TODO(josh11b): Test that we handle weight decay in a reasonable way. loss_value = self._scale_loss(loss_value) if var_list is None: var_list = tape.watched_variables() # TODO(jhseu): Figure out why GradientTape's gradients don't require loss # to be executed. with ops.control_dependencies([loss_value]): grads = tape.gradient(loss_value, var_list, grad_loss) return list(zip(grads, var_list)) # Non-callable/Tensor loss case if context.executing_eagerly(): raise RuntimeError( "`loss` passed to Optimizer.compute_gradients should " "be a function when eager execution is enabled.") # Scale loss if using a "mean" loss reduction and multiple replicas. loss = self._scale_loss(loss) if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH]: raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if grad_loss is not None: self._assert_valid_dtypes([grad_loss]) if var_list is None: var_list = ( variables.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) else: var_list = nest.flatten(var_list) # pylint: disable=protected-access var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS) # pylint: enable=protected-access processors = [_get_processor(v) for v in var_list] if not var_list: raise ValueError("No variables to optimize.") var_refs = [p.target() for p in processors] grads = gradients.gradients( loss, var_refs, grad_ys=grad_loss, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes( [v for g, v in grads_and_vars if g is not None and v.dtype != dtypes.resource]) return grads_and_vars
def execute(self, fn, *args, **kwargs): """Execute function `fn(*args, **kwargs)` inside the CriticalSection. Args: fn: The function to execute. Must return at least one tensor. *args: Additional positional arguments to `fn`. **kwargs: Additional keyword arguments to `fn`. Several keywords are reserved for `execute`. These are: - name; The name to use when creating the execute operation. - exclusive_resource_access; Whether the resources required by `fn` should be exclusive to this `CriticalSection`. Default: `True`. You may want to set this to `False` if you will be accessing a resource in read-only mode in two different CriticalSections. Returns: The tensors returned from `fn(*args, **kwargs)`. Raises: ValueError: If `fn` attempts to lock this `CriticalSection` in any nested or lazy way that may cause a deadlock. ValueError: If `exclusive_resource_access` is not provided (is `True`) and another `CriticalSection` has an execution requesting the same resources as in `*args`, `**kwargs`, and any additionaly captured inputs in `fn`. Note, even if `exclusive_resource_access` is `True`, if another execution in another `CriticalSection` was created without `exclusive_resource_access=True`, a `ValueError` will be raised. """ name = kwargs.pop("name", None) exclusive_resource_access = kwargs.pop("exclusive_resource_access", True) with ops.name_scope(name, "critical_section_execute", []): # Ensure that mutex locking only happens *after* all args and # kwargs have been executed. This avoids certain types of deadlocks. lock = gen_resource_variable_ops.mutex_lock(self._handle) if not context.executing_eagerly(): # NOTE (ebrevdo): This is to ensure we don't pick up spurious id:1153 # https://github.com/imdone/tensorflow/issues/1154 # Operations created by other threads. # with ops.get_default_graph()._lock: # pylint: disable=protected-access existing_ops = ops.get_default_graph().get_operations() with ops.control_dependencies([lock]): r = fn(*args, **kwargs) # TODO (ebrevdo): If creating critical sections in a python loop, this id:1258 # https://github.com/imdone/tensorflow/issues/1259 # makes graph creation time quadratic. Revisit if this # becomes a problem. created_ops = (set( ops.get_default_graph().get_operations()).difference( existing_ops)) else: with ops.control_dependencies([lock]): r = fn(*args, **kwargs) if not context.executing_eagerly(): self._add_control_dependencies_to_lock(created_ops, lock.op) # captured_resources is a list of resources that are directly # accessed only by ops created during fn(), not by any # ancestors of those ops in the graph. captured_resources = set([ input_ for op in created_ops for input_ in op.inputs if input_.dtype == dtypes.resource ]) # NOTE (ebrevdo): The only time self._is_self_handle() is True id:859 # https://github.com/imdone/tensorflow/issues/860 # in this call is if one of the recently created ops, within # the execute(), themselves attempt to access the # CriticalSection. This will cause a deadlock. if any(self._is_self_handle(x) for x in captured_resources): raise ValueError( "The function fn attempts to directly access the " "CriticalSection in which it would be running. " "This is illegal and would cause deadlocks.") self._check_multiple_access_to_resources( captured_resources, exclusive_resource_access) r_flat = [_identity(x) for x in nest.flatten(r)] with ops.control_dependencies(r_flat): # The identity must run on the same machine as self._handle with ops.colocate_with(self._handle): # Do not use array_ops.identity as there are special # optimizations within TensorFlow which seem to elide it # even when optimizations are disabled(!). ensure_lock_exists = gen_resource_variable_ops.consume_mutex_lock( lock) # Make sure that if any element of r is accessed, all of # them are executed together. r = nest.pack_sequence_as( r, control_flow_ops.tuple(nest.flatten(r))) with ops.control_dependencies([ensure_lock_exists]): outputs = nest.map_structure(_identity, r) if not context.executing_eagerly(): signature = _ExecutionSignature( op=lock.op, handle=self._handle, resources=list(captured_resources), exclusive_resource_access=exclusive_resource_access) ops.add_to_collections(CRITICAL_SECTION_EXECUTIONS, signature) return outputs
def training_graph(self, input_data, input_labels, random_seed, data_spec, input_weights=None): """Constructs a TF graph for training a random tree. Args: input_data: A tensor or SparseTensor or placeholder for input data. input_labels: A tensor or placeholder for labels associated with input_data. random_seed: The random number generator seed to use for this tree. 0 means use the current time as the seed. data_spec: A list of tf.dtype values specifying the original types of each column. input_weights: A float tensor or placeholder holding per-input weights, or None if all inputs are to be weighted equally. Returns: The last op in the random tree training graph. """ epoch = math_ops.to_int32(get_epoch_variable()) if input_weights is None: input_weights = [] sparse_indices = [] sparse_values = [] sparse_shape = [] if isinstance(input_data, sparse_tensor.SparseTensor): sparse_indices = input_data.indices sparse_values = input_data.values sparse_shape = input_data.dense_shape input_data = [] # Count extremely random stats. (node_sums, node_squares, splits_indices, splits_sums, splits_squares, totals_indices, totals_sums, totals_squares, input_leaves) = (self.training_ops.count_extremely_random_stats( input_data, sparse_indices, sparse_values, sparse_shape, data_spec, input_labels, input_weights, self.variables.tree, self.variables.tree_thresholds, self.variables.node_to_accumulator_map, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, self.variables.start_epoch, epoch, num_classes=self.params.num_output_columns, regression=self.params.regression)) node_update_ops = [] node_update_ops.append( state_ops.assign_add(self.variables.node_sums, node_sums)) splits_update_ops = [] splits_update_ops.append( self.training_ops.scatter_add_ndim( self.variables.candidate_split_sums, splits_indices, splits_sums)) splits_update_ops.append( self.training_ops.scatter_add_ndim(self.variables.accumulator_sums, totals_indices, totals_sums)) if self.params.regression: node_update_ops.append( state_ops.assign_add(self.variables.node_squares, node_squares)) splits_update_ops.append( self.training_ops.scatter_add_ndim( self.variables.candidate_split_squares, splits_indices, splits_squares)) splits_update_ops.append( self.training_ops.scatter_add_ndim( self.variables.accumulator_squares, totals_indices, totals_squares)) # Sample inputs. update_indices, feature_updates, threshold_updates = ( self.training_ops.sample_inputs( input_data, sparse_indices, sparse_values, sparse_shape, input_weights, self.variables.node_to_accumulator_map, input_leaves, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, split_initializations_per_input=( self.params.split_initializations_per_input), split_sampling_random_seed=random_seed)) update_features_op = state_ops.scatter_update( self.variables.candidate_split_features, update_indices, feature_updates) update_thresholds_op = state_ops.scatter_update( self.variables.candidate_split_thresholds, update_indices, threshold_updates) # Calculate finished nodes. with ops.control_dependencies(splits_update_ops): # Passing input_leaves to finished nodes here means that nodes that # have become stale won't be deallocated until an input reaches them, # because we're trying to avoid considering every fertile node for # performance reasons. finished, stale = self.training_ops.finished_nodes( input_leaves, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, self.variables.start_epoch, epoch, num_split_after_samples=self.params.split_after_samples, min_split_samples=self.params.min_split_samples, dominate_method=self.params.dominate_method, dominate_fraction=self.params.dominate_fraction) # Update leaf scores. # TODO(thomaswc): Store the leaf scores in a TopN and only update the # scores of the leaves that were touched by this batch of input. children = array_ops.squeeze(array_ops.slice(self.variables.tree, [0, 0], [-1, 1]), squeeze_dims=[1]) is_leaf = math_ops.equal(constants.LEAF_NODE, children) leaves = math_ops.to_int32( array_ops.squeeze(array_ops.where(is_leaf), squeeze_dims=[1])) non_fertile_leaves = array_ops.boolean_mask( leaves, math_ops.less( array_ops.gather(self.variables.node_to_accumulator_map, leaves), 0)) # TODO(gilberth): It should be possible to limit the number of non # fertile leaves we calculate scores for, especially since we can only take # at most array_ops.shape(finished)[0] of them. with ops.control_dependencies(node_update_ops): sums = array_ops.gather(self.variables.node_sums, non_fertile_leaves) if self.params.regression: squares = array_ops.gather(self.variables.node_squares, non_fertile_leaves) non_fertile_leaf_scores = self._variance(sums, squares) else: non_fertile_leaf_scores = self._weighted_gini(sums) # Calculate best splits. with ops.control_dependencies(splits_update_ops): split_indices = self.training_ops.best_splits( finished, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, regression=self.params.regression) # Grow tree. with ops.control_dependencies( [update_features_op, update_thresholds_op]): (tree_update_indices, tree_children_updates, tree_threshold_updates, new_eot) = (self.training_ops.grow_tree( self.variables.end_of_tree, self.variables.node_to_accumulator_map, finished, split_indices, self.variables.candidate_split_features, self.variables.candidate_split_thresholds)) tree_update_op = state_ops.scatter_update(self.variables.tree, tree_update_indices, tree_children_updates) thresholds_update_op = state_ops.scatter_update( self.variables.tree_thresholds, tree_update_indices, tree_threshold_updates) # TODO(thomaswc): Only update the epoch on the new leaves. new_epoch_updates = epoch * array_ops.ones_like( tree_threshold_updates, dtype=dtypes.int32) epoch_update_op = state_ops.scatter_update( self.variables.start_epoch, tree_update_indices, new_epoch_updates) # Update fertile slots. with ops.control_dependencies([tree_update_op]): (n2a_map_updates, a2n_map_updates, accumulators_cleared, accumulators_allocated) = (self.training_ops.update_fertile_slots( finished, non_fertile_leaves, non_fertile_leaf_scores, self.variables.end_of_tree, self.variables.accumulator_sums, self.variables.node_to_accumulator_map, stale, self.variables.node_sums, regression=self.params.regression)) # Ensure end_of_tree doesn't get updated until UpdateFertileSlots has # used it to calculate new leaves. gated_new_eot, = control_flow_ops.tuple( [new_eot], control_inputs=[n2a_map_updates]) eot_update_op = state_ops.assign(self.variables.end_of_tree, gated_new_eot) updates = [] updates.append(eot_update_op) updates.append(tree_update_op) updates.append(thresholds_update_op) updates.append(epoch_update_op) updates.append( state_ops.scatter_update(self.variables.node_to_accumulator_map, n2a_map_updates[0], n2a_map_updates[1])) updates.append( state_ops.scatter_update(self.variables.accumulator_to_node_map, a2n_map_updates[0], a2n_map_updates[1])) cleared_and_allocated_accumulators = array_ops.concat( 0, [accumulators_cleared, accumulators_allocated]) # Calculate values to put into scatter update for candidate counts. # Candidate split counts are always reset back to 0 for both cleared # and allocated accumulators. This means some accumulators might be doubly # reset to 0 if the were released and not allocated, then later allocated. split_values = array_ops.tile( array_ops.expand_dims( array_ops.expand_dims( array_ops.zeros_like(cleared_and_allocated_accumulators, dtype=dtypes.float32), 1), 2), [ 1, self.params.num_splits_to_consider, self.params.num_output_columns ]) updates.append( state_ops.scatter_update(self.variables.candidate_split_sums, cleared_and_allocated_accumulators, split_values)) if self.params.regression: updates.append( state_ops.scatter_update( self.variables.candidate_split_squares, cleared_and_allocated_accumulators, split_values)) # Calculate values to put into scatter update for total counts. total_cleared = array_ops.tile( array_ops.expand_dims( math_ops.neg( array_ops.ones_like(accumulators_cleared, dtype=dtypes.float32)), 1), [1, self.params.num_output_columns]) total_reset = array_ops.tile( array_ops.expand_dims( array_ops.zeros_like(accumulators_allocated, dtype=dtypes.float32), 1), [1, self.params.num_output_columns]) accumulator_updates = array_ops.concat(0, [total_cleared, total_reset]) updates.append( state_ops.scatter_update(self.variables.accumulator_sums, cleared_and_allocated_accumulators, accumulator_updates)) if self.params.regression: updates.append( state_ops.scatter_update(self.variables.accumulator_squares, cleared_and_allocated_accumulators, accumulator_updates)) # Calculate values to put into scatter update for candidate splits. split_features_updates = array_ops.tile( array_ops.expand_dims( math_ops.neg( array_ops.ones_like(cleared_and_allocated_accumulators)), 1), [1, self.params.num_splits_to_consider]) updates.append( state_ops.scatter_update(self.variables.candidate_split_features, cleared_and_allocated_accumulators, split_features_updates)) updates += self.finish_iteration() return control_flow_ops.group(*updates)
def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients): """Implementation of gradients().""" if context.executing_eagerly(): raise RuntimeError("tf.gradients not supported when eager execution " "is enabled. Use tf.contrib.eager.GradientTape " "instead.") ys = _AsList(ys) xs = _AsList(xs) stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.name_scope( name, "gradients", list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope: ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = [ x.handle if resource_variable_ops.is_resource_variable(x) else x for x in xs ] xs = ops.internal_convert_n_to_tensor_or_indexed_slices( xs, name="x", as_ref=True) grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. if len(ys) > 1: ys = [array_ops.identity(y) if y.consumers() else y for y in ys] to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] stop_gradient_ops = [t.op for t in stop_gradients] pending_count, loop_state = _PendingCount( ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. # pylint: disable=protected-access ready = (pending_count[op._id] == 0) if ready and op._id not in to_ops_set: to_ops_set.add(op._id) queue.append(op) # pylint: enable=protected-access if loop_state: loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set) for y in loop_exits: if _IsTrainable(y): _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() with _maybe_colocate_with(op, colocate_gradients_with_ops): if loop_state: loop_state.EnterGradWhileContext(op, before=True) out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method) if loop_state: loop_state.ExitGradWhileContext(op, before=True) grad_fn = None # pylint: disable=protected-access func_call = None is_func_call = ops.get_default_graph()._is_function(op.type) has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads) if has_out_grads and (op._id not in stop_ops): if is_func_call: func_call = ops.get_default_graph()._get_function(op.type) grad_fn = func_call.python_grad_func # pylint: enable=protected-access else: # A grad_fn must be defined, either as a function or as None # for ops that do not have gradients. try: grad_fn = ops.get_gradient_function(op) except LookupError: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if loop_state: loop_state.EnterGradWhileContext(op, before=False) if (grad_fn or is_func_call) and has_out_grads: # NOTE: If _AggregatedGrads didn't compute a value for the i'th # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if (not isinstance(out_grad, ops.Tensor) and not out_grad) and ( (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])): # Only trainable outputs or outputs for a function call that # will use SymbolicGradient get a zero gradient. Gradient # functions should ignore the gradient for other outputs. # TODO(apassos) gradients of resource handles might be an # issue here because of zeros. if loop_state: out_grads[i] = loop_state.ZerosLike(op, i) else: out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with ops.get_default_graph()._original_op(op): # pylint: enable=protected-access if grad_fn: # If grad_fn was found, do not use SymbolicGradient even for # functions. in_grads = _MaybeCompile(grad_scope, op, func_call, lambda: grad_fn(op, *out_grads)) else: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. in_grads = _MaybeCompile(grad_scope, op, func_call, lambda: _SymGrad(op, out_grads)) in_grads = _AsList(in_grads) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len([x for x in in_grads if x is not None]) > 1: with ops.device(None): with ops.colocate_with(None, ignore_existing=True): in_grads = control_flow_ops.tuple(in_grads) _LogOpGradients(op, out_grads, in_grads) else: # If no grad_fn is defined or none of out_grads is available, # just propagate a list of None backwards. in_grads = [None] * len(op.inputs) for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)): if in_grad is not None: if (isinstance(in_grad, ops.Tensor) and t_in.dtype != dtypes.resource): try: in_grad.set_shape(t_in.get_shape()) except ValueError: raise ValueError( "Incompatible shapes between op input and calculated " "input gradient. Forward operation: %s. Input index: %d. " "Original input shape: %s. " "Calculated input gradient shape: %s" % (op.name, i, t_in.shape, in_grad.shape)) _SetGrad(grads, t_in, in_grad) if loop_state: loop_state.ExitGradWhileContext(op, before=False) # Update pending count for the inputs of op and enqueue ready ops. _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state) if loop_state: loop_state.PostProcessing() return [_GetGrad(grads, x) for x in xs]
def training_graph(self, input_data, input_labels, random_seed, data_spec, sparse_features=None, input_weights=None): """Constructs a TF graph for training a random tree. Args: input_data: A tensor or placeholder for input data. input_labels: A tensor or placeholder for labels associated with input_data. random_seed: The random number generator seed to use for this tree. 0 means use the current time as the seed. data_spec: A data_ops.TensorForestDataSpec object specifying the original feature/columns of the data. sparse_features: A tf.SparseTensor for sparse input data. input_weights: A float tensor or placeholder holding per-input weights, or None if all inputs are to be weighted equally. Returns: The last op in the random tree training graph. """ epoch = math_ops.to_int32(get_epoch_variable()) serialized_input_spec = data_spec.SerializeToString() if input_weights is None: input_weights = [] if input_data is None: input_data = [] sparse_indices = [] sparse_values = [] sparse_shape = [] if sparse_features is not None: sparse_indices = sparse_features.indices sparse_values = sparse_features.values sparse_shape = sparse_features.dense_shape # Count extremely random stats. (node_sums, node_squares, splits_indices, splits_sums, splits_squares, totals_indices, totals_sums, totals_squares, input_leaves) = (tensor_forest_ops.count_extremely_random_stats( input_data, sparse_indices, sparse_values, sparse_shape, input_labels, input_weights, self.variables.tree, self.variables.tree_thresholds, self.variables.node_to_accumulator_map, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, self.variables.start_epoch, epoch, input_spec=serialized_input_spec, num_classes=self.params.num_output_columns, regression=self.params.regression)) node_update_ops = [] node_update_ops.append( state_ops.assign_add(self.variables.node_sums, node_sums)) splits_update_ops = [] splits_update_ops.append( tensor_forest_ops.scatter_add_ndim(self.variables.candidate_split_sums, splits_indices, splits_sums)) splits_update_ops.append( tensor_forest_ops.scatter_add_ndim(self.variables.accumulator_sums, totals_indices, totals_sums)) if self.params.regression: node_update_ops.append(state_ops.assign_add(self.variables.node_squares, node_squares)) splits_update_ops.append( tensor_forest_ops.scatter_add_ndim( self.variables.candidate_split_squares, splits_indices, splits_squares)) splits_update_ops.append( tensor_forest_ops.scatter_add_ndim(self.variables.accumulator_squares, totals_indices, totals_squares)) # Sample inputs. update_indices, feature_updates, threshold_updates = ( tensor_forest_ops.sample_inputs( input_data, sparse_indices, sparse_values, sparse_shape, input_weights, self.variables.node_to_accumulator_map, input_leaves, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, input_spec=serialized_input_spec, split_initializations_per_input=( self.params.split_initializations_per_input), split_sampling_random_seed=random_seed)) update_features_op = state_ops.scatter_update( self.variables.candidate_split_features, update_indices, feature_updates) update_thresholds_op = state_ops.scatter_update( self.variables.candidate_split_thresholds, update_indices, threshold_updates) # Calculate finished nodes. with ops.control_dependencies(splits_update_ops): # Passing input_leaves to finished nodes here means that nodes that # have become stale won't be deallocated until an input reaches them, # because we're trying to avoid considering every fertile node for # performance reasons. finished, stale = tensor_forest_ops.finished_nodes( input_leaves, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, self.variables.start_epoch, epoch, num_split_after_samples=self.params.split_after_samples, min_split_samples=self.params.min_split_samples, dominate_method=self.params.dominate_method, dominate_fraction=self.params.dominate_fraction) # Update leaf scores. # TODO(thomaswc): Store the leaf scores in a TopN and only update the # scores of the leaves that were touched by this batch of input. children = array_ops.squeeze( array_ops.slice(self.variables.tree, [0, 0], [-1, 1]), squeeze_dims=[1]) is_leaf = math_ops.equal(constants.LEAF_NODE, children) leaves = math_ops.to_int32( array_ops.squeeze( array_ops.where(is_leaf), squeeze_dims=[1])) non_fertile_leaves = array_ops.boolean_mask( leaves, math_ops.less(array_ops.gather( self.variables.node_to_accumulator_map, leaves), 0)) # TODO(gilberth): It should be possible to limit the number of non # fertile leaves we calculate scores for, especially since we can only take # at most array_ops.shape(finished)[0] of them. with ops.control_dependencies(node_update_ops): sums = array_ops.gather(self.variables.node_sums, non_fertile_leaves) if self.params.regression: squares = array_ops.gather(self.variables.node_squares, non_fertile_leaves) non_fertile_leaf_scores = self._variance(sums, squares) else: non_fertile_leaf_scores = self._weighted_gini(sums) # Calculate best splits. with ops.control_dependencies(splits_update_ops): split_indices = tensor_forest_ops.best_splits( finished, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, regression=self.params.regression) # Grow tree. with ops.control_dependencies([update_features_op, update_thresholds_op]): (tree_update_indices, tree_children_updates, tree_threshold_updates, new_eot) = (tensor_forest_ops.grow_tree( self.variables.end_of_tree, self.variables.node_to_accumulator_map, finished, split_indices, self.variables.candidate_split_features, self.variables.candidate_split_thresholds)) tree_update_op = state_ops.scatter_update( self.variables.tree, tree_update_indices, tree_children_updates) thresholds_update_op = state_ops.scatter_update( self.variables.tree_thresholds, tree_update_indices, tree_threshold_updates) # TODO(thomaswc): Only update the epoch on the new leaves. new_epoch_updates = epoch * array_ops.ones_like(tree_threshold_updates, dtype=dtypes.int32) epoch_update_op = state_ops.scatter_update( self.variables.start_epoch, tree_update_indices, new_epoch_updates) # Update fertile slots. with ops.control_dependencies([tree_update_op]): (n2a_map_updates, a2n_map_updates, accumulators_cleared, accumulators_allocated) = (tensor_forest_ops.update_fertile_slots( finished, non_fertile_leaves, non_fertile_leaf_scores, self.variables.end_of_tree, self.variables.accumulator_sums, self.variables.node_to_accumulator_map, stale, self.variables.node_sums, regression=self.params.regression)) # Ensure end_of_tree doesn't get updated until UpdateFertileSlots has # used it to calculate new leaves. gated_new_eot, = control_flow_ops.tuple( [new_eot], control_inputs=[n2a_map_updates]) eot_update_op = state_ops.assign(self.variables.end_of_tree, gated_new_eot) updates = [] updates.append(eot_update_op) updates.append(tree_update_op) updates.append(thresholds_update_op) updates.append(epoch_update_op) updates.append( state_ops.scatter_update(self.variables.node_to_accumulator_map, n2a_map_updates[0], n2a_map_updates[1])) updates.append( state_ops.scatter_update(self.variables.accumulator_to_node_map, a2n_map_updates[0], a2n_map_updates[1])) cleared_and_allocated_accumulators = array_ops.concat( [accumulators_cleared, accumulators_allocated], 0) # Calculate values to put into scatter update for candidate counts. # Candidate split counts are always reset back to 0 for both cleared # and allocated accumulators. This means some accumulators might be doubly # reset to 0 if the were released and not allocated, then later allocated. split_values = array_ops.tile( array_ops.expand_dims(array_ops.expand_dims( array_ops.zeros_like(cleared_and_allocated_accumulators, dtype=dtypes.float32), 1), 2), [1, self.params.num_splits_to_consider, self.params.num_output_columns]) updates.append(state_ops.scatter_update( self.variables.candidate_split_sums, cleared_and_allocated_accumulators, split_values)) if self.params.regression: updates.append(state_ops.scatter_update( self.variables.candidate_split_squares, cleared_and_allocated_accumulators, split_values)) # Calculate values to put into scatter update for total counts. total_cleared = array_ops.tile( array_ops.expand_dims( math_ops.negative(array_ops.ones_like(accumulators_cleared, dtype=dtypes.float32)), 1), [1, self.params.num_output_columns]) total_reset = array_ops.tile( array_ops.expand_dims( array_ops.zeros_like(accumulators_allocated, dtype=dtypes.float32), 1), [1, self.params.num_output_columns]) accumulator_updates = array_ops.concat([total_cleared, total_reset], 0) updates.append(state_ops.scatter_update( self.variables.accumulator_sums, cleared_and_allocated_accumulators, accumulator_updates)) if self.params.regression: updates.append(state_ops.scatter_update( self.variables.accumulator_squares, cleared_and_allocated_accumulators, accumulator_updates)) # Calculate values to put into scatter update for candidate splits. split_features_updates = array_ops.tile( array_ops.expand_dims( math_ops.negative(array_ops.ones_like( cleared_and_allocated_accumulators)), 1), [1, self.params.num_splits_to_consider]) updates.append(state_ops.scatter_update( self.variables.candidate_split_features, cleared_and_allocated_accumulators, split_features_updates)) updates += self.finish_iteration() return control_flow_ops.group(*updates)
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): """Compute gradients of `loss` for the variables in `var_list`. This is the first part of `minimize()`. It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable". Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if there is no gradient for the given variable. Args: loss: A Tensor containing the value to minimize or a callable taking no arguments which returns the value to minimize. When eager execution is enabled it must be a callable. var_list: Optional list or tuple of `tf.Variable` to update to minimize `loss`. Defaults to the list of variables collected in the graph under the key `GraphKeys.TRAINABLE_VARIABLES`. gate_gradients: How to gate the computation of gradients. Can be `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. Returns: A list of (gradient, variable) pairs. Variable is always present, but gradient can be `None`. Raises: TypeError: If `var_list` contains anything else than `Variable` objects. ValueError: If some arguments are invalid. RuntimeError: If called with eager execution enabled and `loss` is not callable. @compatibility(eager) When eager execution is enabled, `gate_gradients`, `aggregation_method`, and `colocate_gradients_with_ops` are ignored. @end_compatibility """ if callable(loss): with backprop.GradientTape() as tape: if var_list is not None: tape.watch(var_list) loss_value = loss() if var_list is None: var_list = tape.watched_variables() # TODO(jhseu): Figure out why GradientTape's gradients don't require loss # to be executed. with ops.control_dependencies([loss_value]): grads = tape.gradient(loss_value, var_list, grad_loss) return list(zip(grads, var_list)) # Non-callable/Tensor loss case if context.executing_eagerly(): raise RuntimeError( "`loss` passed to Optimizer.compute_gradients should " "be a function when eager execution is enabled.") if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH]: raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if grad_loss is not None: self._assert_valid_dtypes([grad_loss]) if var_list is None: var_list = ( variables.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) else: var_list = nest.flatten(var_list) # pylint: disable=protected-access var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS) # pylint: enable=protected-access processors = [_get_processor(v) for v in var_list] if not var_list: raise ValueError("No variables to optimize.") var_refs = [p.target() for p in processors] grads = gradients.gradients( loss, var_refs, grad_ys=grad_loss, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes( [v for g, v in grads_and_vars if g is not None and v.dtype != dtypes.resource]) return grads_and_vars
def body_wrapper(*inputs): """Wrapper around `body` that handles infeed queues and control deps.""" inputs = list(inputs) # Discards the dummy output added for arity-0 loops. if input_arity == 0: inputs = [] # Runs `body` with the dequeue_ops appended. if infeed_queue: number_of_shards = tpu_function.get_tpu_context().number_of_shards if number_of_shards is None: raise ValueError("Can't build training loop with infeed when there is " "no tpu_shard_context. Are you building a loop or " "graph directly rather than from inside tpu.rewrite, " "tpu.batch_parallel, tpu.shard, or tpu.replicate?") infeed_queue.set_number_of_shards(number_of_shards) dequeue_ops = [d for d in infeed_queue.generate_dequeue_op()] else: dequeue_ops = [] outputs = body(*(inputs + dequeue_ops)) # If the computation only returned one value, make it a tuple. if not isinstance(outputs, (list, tuple)): outputs = (outputs,) outputs = [ o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o) for o in outputs ] # Separates the returned Operations and Tensors. output_operations = [o for o in outputs if isinstance(o, ops.Operation)] output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)] if outputs != output_tensors + output_operations: raise ValueError( "TPU training loop body must return zero or more Tensor values " "followed by zero or more Operations.") output_types = [op.dtype for op in output_tensors] if input_types != output_types: raise TypeError( "Mismatch between input types and output types for training loop " "body: {} vs {}".format(input_types, output_types)) # Add the dequeue operations to output_operations to ensure they are run # by the loop, even if the programmer's loop body does not use them. output_operations += dequeue_ops # Add a dummy output, if needed. if not output_tensors: output_tensors = array_ops.constant(0) if output_operations: # TODO(phawkins): in principle this is too restrictive since it serializes # the training loop steps. In practice it does not matter since this loop # will be compiled by XLA. output_tensors = control_flow_ops.tuple(output_tensors, control_inputs=output_operations) if tensor_tracer.TensorTracer.is_enabled(): num_replicas = tpu_function.get_tpu_context().number_of_shards if num_replicas is None: num_replicas = 1 tt = tensor_tracer.TensorTracer() output_tensors = tt.trace_tpu(ops.get_default_graph(), output_tensors, None, num_replicas) return output_tensors
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): """Compute gradients of `loss` for the variables in `var_list`. This is the first part of `minimize()`. It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable". Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if there is no gradient for the given variable. Args: loss: A Tensor containing the value to minimize. var_list: Optional list or tuple of `tf.Variable` to update to minimize `loss`. Defaults to the list of variables collected in the graph under the key `GraphKeys.TRAINABLE_VARIABLES`. gate_gradients: How to gate the computation of gradients. Can be `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class `AggregationMethod`. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. Returns: A list of (gradient, variable) pairs. Variable is always present, but gradient can be `None`. Raises: TypeError: If `var_list` contains anything else than `Variable` objects. ValueError: If some arguments are invalid. RuntimeError: If called with eager execution enabled and if `grad_loss` is not `None` or `loss` is not callable. @compatibility(eager) When eager execution is enabled, `loss` should be a Python function that takes elements of `var_list` as arguments and computes the value to be minimized. If `var_list` is None, `loss` should take no arguments. Gradient computation is done with respect to the elements of `var_list` if not None, else with respect to any trainable variables created during the execution of the `loss` function. `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and `grad_loss` are ignored when eager execution is enabled. @end_compatibility """ if context.in_eager_mode(): if grad_loss is not None: raise RuntimeError( "`grad_loss` argument to Optimizer.compute_gradients " "not supported when eager execution is enabled.") if not callable(loss): raise RuntimeError( "`loss` passed to Optimizer.compute_gradients should " "be a function when eager execution is enabled.") # TODO (agarwal): consider passing parameters to the `loss` function. id:2636 gh:2637 if var_list is None: return backprop.implicit_grad(loss)() else: var_list = nest.flatten(var_list) grads = backprop.gradients_function(loss)(*var_list) grads_and_vars = list(zip(grads, var_list)) return grads_and_vars if gate_gradients not in [ Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH ]: raise ValueError( "gate_gradients must be one of: Optimizer.GATE_NONE, " "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients) self._assert_valid_dtypes([loss]) if grad_loss is not None: self._assert_valid_dtypes([grad_loss]) if var_list is None: var_list = ( variables.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) else: var_list = nest.flatten(var_list) # pylint: disable=protected-access var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS) # pylint: enable=protected-access processors = [_get_processor(v) for v in var_list] if not var_list: raise ValueError("No variables to optimize.") var_refs = [p.target() for p in processors] grads = gradients.gradients( loss, var_refs, grad_ys=grad_loss, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method, colocate_gradients_with_ops=colocate_gradients_with_ops) if gate_gradients == Optimizer.GATE_GRAPH: grads = control_flow_ops.tuple(grads) grads_and_vars = list(zip(grads, var_list)) self._assert_valid_dtypes([ v for g, v in grads_and_vars if g is not None and v.dtype != dtypes.resource ]) return grads_and_vars
def _GradientsHelper(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None, stop_gradients=None, unconnected_gradients=UnconnectedGradients.NONE, src_graph=None): """Implementation of gradients().""" if context.executing_eagerly(): raise RuntimeError("tf.gradients is not supported when eager execution " "is enabled. Use tf.GradientTape instead.") if src_graph is None: src_graph = ops.get_default_graph() try: unconnected_gradients = UnconnectedGradients(unconnected_gradients) except ValueError: raise ValueError( "Unknown value for unconnected_gradients: %r" % unconnected_gradients) # If src_graph is a _FuncGraph (i.e. a function body), gather it and all # ancestor graphs. This is necessary for correctly handling captured values. func_graphs = [] curr_graph = src_graph while _IsFunction(curr_graph): func_graphs.append(curr_graph) if isinstance(curr_graph, FuncGraph): curr_graph = curr_graph.outer_graph else: assert isinstance(curr_graph, framework_function._FuncGraph) # pylint: disable=protected-access curr_graph = curr_graph._outer_graph # pylint: disable=protected-access ys = _AsList(ys) xs = _AsList(xs) stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.name_scope( name, "gradients", list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope: # Get a uid for this call to gradients that can be used to help # cluster ops for compilation. gradient_uid = ops.get_default_graph().unique_name("uid") ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = [ x.handle if resource_variable_ops.is_resource_variable(x) else x for x in xs ] xs = ops.internal_convert_n_to_tensor_or_indexed_slices( xs, name="x", as_ref=True) grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops, gradient_uid) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] stop_gradient_ops = [t.op for t in stop_gradients] reachable_to_ops, pending_count, loop_state = _PendingCount( to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. ready = (pending_count[op] == 0) if ready and op not in to_ops_set and op in reachable_to_ops: to_ops_set.add(op) queue.append(op) if loop_state: loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set) for y in loop_exits: if IsTrainable(y): _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs) while queue: # generate gradient subgraph for op. op = queue.popleft() with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops): if loop_state: loop_state.EnterGradWhileContext(op, before=True) out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state, aggregation_method) if loop_state: loop_state.ExitGradWhileContext(op, before=True) grad_fn = None func_call = None is_partitioned_call = _IsPartitionedCall(op) # pylint: disable=protected-access is_func_call = ( src_graph._is_function(op.type) or is_partitioned_call) # pylint: enable=protected-access has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads) if has_out_grads and (op not in stop_ops): try: grad_fn = ops.get_gradient_function(op) except LookupError: if is_func_call: if is_partitioned_call: func_call = src_graph._get_function( # pylint: disable=protected-access compat.as_bytes(op.get_attr("f").name)) else: func_call = src_graph._get_function(op.type) # pylint: disable=protected-access # Note that __defun is not set if the graph is # imported. If it's set, we prefer to access the original # defun. func_call = getattr(op, "__defun", func_call) grad_fn = func_call.python_grad_func else: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if loop_state: loop_state.EnterGradWhileContext(op, before=False) # NOTE(skyewm): We don't support computing gradients wrt a loop variable # unless it's within the context of a single iteration (i.e. the # gradient is wrt to the loop parameter in the body function, not wrt or # through the initial value). This means if we're in a while loop # context, we should never see a switch node from this context. # pylint: disable=protected-access if (control_flow_util.IsSwitch(op) and op._control_flow_context is not None and op._control_flow_context.IsWhileContext() and op._control_flow_context == ops.get_default_graph()._get_control_flow_context()): _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs) # pylint: enable=protected-access if (grad_fn or is_func_call) and has_out_grads: # NOTE: If _AggregatedGrads didn't compute a value for the i'th # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if (not isinstance(out_grad, ops.Tensor) and not out_grad) and ( (not grad_fn and is_func_call) or IsTrainable(op.outputs[i])): # Only trainable outputs or outputs for a function call that # will use SymbolicGradient get a zero gradient. Gradient # functions should ignore the gradient for other outputs. # TODO(apassos) gradients of resource handles might be an # issue here because of zeros. if loop_state: out_grads[i] = loop_state.ZerosLike(op, i) else: out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with src_graph._original_op(op): # pylint: enable=protected-access if grad_fn: # If grad_fn was found, do not use SymbolicGradient even for # functions. in_grads = _MaybeCompile(grad_scope, op, func_call, lambda: grad_fn(op, *out_grads)) else: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. in_grads = _MaybeCompile(grad_scope, op, func_call, lambda: _SymGrad(op, out_grads)) in_grads = _AsList(in_grads) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len([x for x in in_grads if x is not None]) > 1: with ops.device(None): with ops._colocate_with_for_gradient( # pylint: disable=protected-access None, gradient_uid, ignore_existing=True): in_grads = control_flow_ops.tuple(in_grads) _LogOpGradients(op, out_grads, in_grads) else: # If no grad_fn is defined or none of out_grads is available, # just propagate a list of None backwards. in_grads = [None] * len(_NonEagerInputs(op, xs)) for i, (t_in, in_grad) in enumerate(zip(_NonEagerInputs(op, xs), in_grads)): if in_grad is not None: if (isinstance(in_grad, ops.Tensor) and t_in.dtype != dtypes.resource): try: in_grad.set_shape(t_in.get_shape()) except ValueError: raise ValueError( "Incompatible shapes between op input and calculated " "input gradient. Forward operation: %s. Input index: %d. " "Original input shape: %s. " "Calculated input gradient shape: %s" % (op.name, i, t_in.shape, in_grad.shape)) _SetGrad(grads, t_in, in_grad) if loop_state: loop_state.ExitGradWhileContext(op, before=False) # Update pending count for the inputs of op and enqueue ready ops. _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state, xs) if loop_state: loop_state.PostProcessing() return [_GetGrad(grads, x, unconnected_gradients) for x in xs]