def _create_var(name: str, value_expr: TfExpression) -> TfExpression:
    """Internal helper for creating autosummary accumulators."""
    assert not _finalized
    name_id = name.replace("/", "_")
    v = tf.cast(value_expr, _dtype)

    if v.shape.is_fully_defined():
        size = np.prod(tfutil.shape_to_list(v.shape))
        size_expr = tf.constant(size, dtype=_dtype)
    else:
        size = None
        size_expr = tf.reduce_prod(tf.cast(tf.shape(v), _dtype))

    if size == 1:
        if v.shape.ndims != 0:
            v = tf.reshape(v, [])
        v = [size_expr, v, tf.square(v)]
    else:
        v = [size_expr, tf.reduce_sum(v), tf.reduce_sum(tf.square(v))]
    v = tf.cond(tf.is_finite(v[1]), lambda: tf.stack(v), lambda: tf.zeros(3, dtype=_dtype))

    with tfutil.absolute_name_scope("Autosummary/" + name_id), tf.control_dependencies(None):
        var = tf.Variable(tf.zeros(3, dtype=_dtype), trainable=False)  # [sum(1), sum(x), sum(x**2)]
        update_op = tf.cond(tf.is_variable_initialized(var), lambda: tf.assign_add(var, v), lambda: tf.assign(var, v))

    if name in _vars:
        _vars[name].append(var)
    else:
        _vars[name] = [var]
    return update_op

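# Sketch (not part of the library): the three accumulated values map to the statistics
# reported later in finalize_autosummaries(). The helper name below is illustrative only.
def _example_moments_to_stats(moments):
    count, total, total_sq = moments[0], moments[1], moments[2]  # sum(1), sum(x), sum(x**2)
    mean = total / count
    std = np.sqrt(total_sq / count - mean ** 2)
    return mean, std
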
def get_loss_scaling_var(self, device: str) -> Union[tf.Variable, None]:
    """Get or create variable representing log2 of the current dynamic loss scaling factor."""
    if not self.use_loss_scaling:
        return None

    if device not in self._dev_ls_var:
        with tfutil.absolute_name_scope(self.scope + "/LossScalingVars"), tf.control_dependencies(None):
            self._dev_ls_var[device] = tf.Variable(np.float32(self.loss_scaling_init), name="loss_scaling_var")

    return self._dev_ls_var[device]

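# Usage sketch (assumption, not part of this module): the variable stores log2 of the
# dynamic loss scaling factor, so the multiplier applied to a loss would be 2 ** ls_var.
# `opt` and `loss` are hypothetical placeholders for an Optimizer instance and a loss tensor.
def _example_loss_scaling(opt, loss):
    ls_var = opt.get_loss_scaling_var(loss.device)
    if ls_var is None:  # loss scaling disabled
        return loss
    return loss * tf.exp(ls_var * np.float32(np.log(2.0)))  # equivalent to loss * 2**ls_var
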
def setup_as_moving_average_of(self, src_net: "Network", beta: TfExpressionEx = 0.99, beta_nontrainable: TfExpressionEx = 0.0) -> tf.Operation:
    """Construct a TensorFlow op that updates the variables of this network
    to be slightly closer to those of the given network."""
    with tfutil.absolute_name_scope(self.scope + "/_MovingAvg"):
        ops = []
        for name, var in self.vars.items():
            if name in src_net.vars:
                cur_beta = beta if name in self.trainables else beta_nontrainable
                new_value = tfutil.lerp(src_net.vars[name], var, cur_beta)
                ops.append(var.assign(new_value))
        return tf.group(*ops)

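# Usage sketch (assumption): keep an exponential moving average copy `Gs` of a training
# network `G`. The op is built once and executed after each optimizer step.
# `G`, `Gs`, and `num_steps` are hypothetical.
def _example_moving_average(G, Gs, num_steps):
    Gs_update_op = Gs.setup_as_moving_average_of(G, beta=0.999)
    for _ in range(num_steps):
        # ... run one training step here ...
        tfutil.run(Gs_update_op)
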
def autosummary(name: str, value: TfExpressionEx, passthru: TfExpressionEx = None) -> TfExpressionEx:
    """Create a new autosummary.

    Args:
        name:     Name to use in TensorBoard
        value:    TensorFlow expression or python value to track
        passthru: Optionally return this TF node without modifications but tack an
                  autosummary update side-effect to this node.

    Example use of the passthru mechanism:

        n = autosummary('l2loss', loss, passthru=n)

    This is a shorthand for the following code:

        with tf.control_dependencies([autosummary('l2loss', loss)]):
            n = tf.identity(n)
    """
    tfutil.assert_tf_initialized()
    name_id = name.replace("/", "_")

    if tfutil.is_tf_expression(value):
        with tf.name_scope("summary_" + name_id), tf.device(value.device):
            update_op = _create_var(name, value)
            with tf.control_dependencies([update_op]):
                return tf.identity(value if passthru is None else passthru)

    else:  # python scalar or numpy array
        if name not in _immediate:
            with tfutil.absolute_name_scope("Autosummary/" + name_id), tf.device(None), tf.control_dependencies(None):
                update_value = tf.placeholder(_dtype)
                update_op = _create_var(name, update_value)
                _immediate[name] = update_op, update_value

        update_op, update_value = _immediate[name]
        tfutil.run(update_op, {update_value: value})
        return value if passthru is None else passthru

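# Usage sketch (assumption): the TF-expression path attaches the summary update as a
# side effect of an existing tensor via `passthru`, while plain Python values are fed
# through a cached placeholder immediately. `loss`, `images`, and `lr` are hypothetical.
def _example_autosummary_usage(loss, images, lr):
    images = autosummary("Loss/total", loss, passthru=images)  # graph-mode tracking
    autosummary("Progress/learning_rate", lr)                  # immediate Python scalar
    return images
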
def run(self,
        *in_arrays: Tuple[Union[np.ndarray, None], ...],
        input_transform: dict = None,
        output_transform: dict = None,
        return_as_list: bool = False,
        print_progress: bool = False,
        minibatch_size: int = None,
        num_gpus: int = 1,
        assume_frozen: bool = False,
        **dynamic_kwargs) -> Union[np.ndarray, Tuple[np.ndarray, ...], List[np.ndarray]]:
    """Run this network for the given NumPy array(s), and return the output(s) as NumPy array(s).

    Args:
        input_transform:    A dict specifying a custom transformation to be applied to the input tensor(s) before evaluating the network.
                            The dict must contain a 'func' field that points to a top-level function. The function is called with the input
                            TensorFlow expression(s) as positional arguments. Any remaining fields of the dict will be passed in as kwargs.
        output_transform:   A dict specifying a custom transformation to be applied to the output tensor(s) after evaluating the network.
                            The dict must contain a 'func' field that points to a top-level function. The function is called with the output
                            TensorFlow expression(s) as positional arguments. Any remaining fields of the dict will be passed in as kwargs.
        return_as_list:     True = return a list of NumPy arrays, False = return a single NumPy array, or a tuple if there are multiple outputs.
        print_progress:     Print progress to the console? Useful for very large input arrays.
        minibatch_size:     Maximum minibatch size to use, None = disable batching.
        num_gpus:           Number of GPUs to use.
        assume_frozen:      Improve multi-GPU performance by assuming that the trainable parameters will remain unchanged between calls.
        dynamic_kwargs:     Additional keyword arguments to be passed into the network build function.
    """
    assert len(in_arrays) == self.num_inputs
    assert not all(arr is None for arr in in_arrays)
    assert input_transform is None or util.is_top_level_function(input_transform["func"])
    assert output_transform is None or util.is_top_level_function(output_transform["func"])

    output_transform, dynamic_kwargs = _handle_legacy_output_transforms(output_transform, dynamic_kwargs)
    num_items = in_arrays[0].shape[0]
    if minibatch_size is None:
        minibatch_size = num_items

    # Construct unique hash key from all arguments that affect the TensorFlow graph.
    key = dict(input_transform=input_transform, output_transform=output_transform, num_gpus=num_gpus, assume_frozen=assume_frozen, dynamic_kwargs=dynamic_kwargs)

    def unwind_key(obj):
        if isinstance(obj, dict):
            return [(key, unwind_key(value)) for key, value in sorted(obj.items())]
        if callable(obj):
            return util.get_top_level_function_name(obj)
        return obj

    key = repr(unwind_key(key))

    # Build graph.
    if key not in self._run_cache:
        with tfutil.absolute_name_scope(self.scope + "/_Run"), tf.control_dependencies(None):
            with tf.device("/cpu:0"):
                in_expr = [tf.placeholder(tf.float32, name=name) for name in self.input_names]
                in_split = list(zip(*[tf.split(x, num_gpus) for x in in_expr]))

            out_split = []
            for gpu in range(num_gpus):
                with tf.device("/gpu:%d" % gpu):
                    net_gpu = self.clone() if assume_frozen else self
                    in_gpu = in_split[gpu]

                    if input_transform is not None:
                        in_kwargs = dict(input_transform)
                        in_gpu = in_kwargs.pop("func")(*in_gpu, **in_kwargs)
                        in_gpu = [in_gpu] if tfutil.is_tf_expression(in_gpu) else list(in_gpu)

                    assert len(in_gpu) == self.num_inputs
                    out_gpu = net_gpu.get_output_for(*in_gpu, return_as_list=True, **dynamic_kwargs)

                    if output_transform is not None:
                        out_kwargs = dict(output_transform)
                        out_gpu = out_kwargs.pop("func")(*out_gpu, **out_kwargs)
                        out_gpu = [out_gpu] if tfutil.is_tf_expression(out_gpu) else list(out_gpu)

                    assert len(out_gpu) == self.num_outputs
                    out_split.append(out_gpu)

            with tf.device("/cpu:0"):
                out_expr = [tf.concat(outputs, axis=0) for outputs in zip(*out_split)]
                self._run_cache[key] = in_expr, out_expr

    # Run minibatches.
    in_expr, out_expr = self._run_cache[key]
    out_arrays = [np.empty([num_items] + tfutil.shape_to_list(expr.shape)[1:], expr.dtype.name) for expr in out_expr]

    for mb_begin in range(0, num_items, minibatch_size):
        if print_progress:
            print("\r%d / %d" % (mb_begin, num_items), end="")

        mb_end = min(mb_begin + minibatch_size, num_items)
        mb_num = mb_end - mb_begin
        mb_in = [src[mb_begin:mb_end] if src is not None else np.zeros([mb_num] + shape[1:]) for src, shape in zip(in_arrays, self.input_shapes)]
        mb_out = tf.get_default_session().run(out_expr, dict(zip(in_expr, mb_in)))

        for dst, src in zip(out_arrays, mb_out):
            dst[mb_begin:mb_end] = src

    # Done.
    if print_progress:
        print("\r%d / %d" % (num_items, num_items))

    if not return_as_list:
        out_arrays = out_arrays[0] if len(out_arrays) == 1 else tuple(out_arrays)
    return out_arrays

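# Usage sketch (assumption): evaluate the network on NumPy inputs with minibatching.
# `net`, `latents`, and `labels` are hypothetical; the build function is assumed to
# take two positional inputs.
def _example_network_run(net, latents, labels):
    images = net.run(latents, labels, minibatch_size=8, print_progress=True)
    return images
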
def _init_graph(self) -> None:
    # Collect inputs.
    self.input_names = []

    for param in inspect.signature(self._build_func).parameters.values():
        if param.kind == param.POSITIONAL_OR_KEYWORD and param.default is param.empty:
            self.input_names.append(param.name)

    self.num_inputs = len(self.input_names)
    assert self.num_inputs >= 1

    # Choose name and scope.
    if self.name is None:
        self.name = self._build_func_name
    assert re.match("^[A-Za-z0-9_.\\-]*$", self.name)
    with tf.name_scope(None):
        self.scope = tf.get_default_graph().unique_name(self.name, mark_as_used=True)

    # Finalize build func kwargs.
    build_kwargs = dict(self.static_kwargs)
    build_kwargs["is_template_graph"] = True
    build_kwargs["components"] = self.components

    # Build template graph.
    with tfutil.absolute_variable_scope(self.scope, reuse=tf.AUTO_REUSE), tfutil.absolute_name_scope(self.scope):  # ignore surrounding scopes
        assert tf.get_variable_scope().name == self.scope
        assert tf.get_default_graph().get_name_scope() == self.scope
        with tf.control_dependencies(None):  # ignore surrounding control dependencies
            self.input_templates = [tf.placeholder(tf.float32, name=name) for name in self.input_names]
            out_expr = self._build_func(*self.input_templates, **build_kwargs)

    # Collect outputs.
    assert tfutil.is_tf_expression(out_expr) or isinstance(out_expr, tuple)
    self.output_templates = [out_expr] if tfutil.is_tf_expression(out_expr) else list(out_expr)
    self.num_outputs = len(self.output_templates)
    assert self.num_outputs >= 1
    assert all(tfutil.is_tf_expression(t) for t in self.output_templates)

    # Perform sanity checks.
    if any(t.shape.ndims is None for t in self.input_templates):
        raise ValueError("Network input shapes not defined. Please call x.set_shape() for each input.")
    if any(t.shape.ndims is None for t in self.output_templates):
        raise ValueError("Network output shapes not defined. Please call x.set_shape() where applicable.")
    if any(not isinstance(comp, Network) for comp in self.components.values()):
        raise ValueError("Components of a Network must be Networks themselves.")
    if len(self.components) != len(set(comp.name for comp in self.components.values())):
        raise ValueError("Components of a Network must have unique names.")

    # List inputs and outputs.
    self.input_shapes = [tfutil.shape_to_list(t.shape) for t in self.input_templates]
    self.output_shapes = [tfutil.shape_to_list(t.shape) for t in self.output_templates]
    self.input_shape = self.input_shapes[0]
    self.output_shape = self.output_shapes[0]
    self.output_names = [t.name.split("/")[-1].split(":")[0] for t in self.output_templates]

    # List variables.
    self.own_vars = OrderedDict((var.name[len(self.scope) + 1:].split(":")[0], var) for var in tf.global_variables(self.scope + "/"))
    self.vars = OrderedDict(self.own_vars)
    self.vars.update((comp.name + "/" + name, var) for comp in self.components.values() for name, var in comp.vars.items())
    self.trainables = OrderedDict((name, var) for name, var in self.vars.items() if var.trainable)
    self.var_global_to_local = OrderedDict((var.name.split(":")[0], name) for name, var in self.vars.items())

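# Sketch (assumption): _init_graph() treats every positional-or-keyword parameter without
# a default value as a network input; parameters with defaults (including the injected
# `is_template_graph` and `components` kwargs) are not. The build function below is
# hypothetical, with shapes chosen arbitrarily.
def _example_build_func(latents_in, labels_in, is_template_graph=False, components=None, **_kwargs):
    latents_in.set_shape([None, 128])  # input shapes must be defined (see sanity checks above)
    labels_in.set_shape([None, 10])
    x = tf.concat([latents_in, labels_in], axis=1)  # the two inputs collected by _init_graph()
    return tf.identity(x, name="output")
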
def finalize_autosummaries() -> None:
    """Create the necessary ops to include autosummaries in TensorBoard report.
    Note: This should be done only once per graph.
    """
    global _finalized
    tfutil.assert_tf_initialized()

    if _finalized:
        return None

    _finalized = True
    tfutil.init_uninitialized_vars([var for vars_list in _vars.values() for var in vars_list])

    # Create summary ops.
    with tf.device(None), tf.control_dependencies(None):
        for name, vars_list in _vars.items():
            name_id = name.replace("/", "_")
            with tfutil.absolute_name_scope("Autosummary/" + name_id):
                moments = tf.add_n(vars_list)
                moments /= moments[0]
                with tf.control_dependencies([moments]):  # read before resetting
                    reset_ops = [tf.assign(var, tf.zeros(3, dtype=_dtype)) for var in vars_list]
                    with tf.name_scope(None), tf.control_dependencies(reset_ops):  # reset before reporting
                        mean = moments[1]
                        std = tf.sqrt(moments[2] - tf.square(moments[1]))
                        tf.summary.scalar(name, mean)
                        tf.summary.scalar("xCustomScalars/" + name + "/margin_lo", mean - std)
                        tf.summary.scalar("xCustomScalars/" + name + "/margin_hi", mean + std)

    # Group by category and chart name.
    cat_dict = OrderedDict()
    for series_name in sorted(_vars.keys()):
        p = series_name.split("/")
        cat = p[0] if len(p) >= 2 else ""
        chart = "/".join(p[1:-1]) if len(p) >= 3 else p[-1]
        if cat not in cat_dict:
            cat_dict[cat] = OrderedDict()
        if chart not in cat_dict[cat]:
            cat_dict[cat][chart] = []
        cat_dict[cat][chart].append(series_name)

    # Setup custom_scalar layout.
    categories = []
    for cat_name, chart_dict in cat_dict.items():
        charts = []
        for chart_name, series_names in chart_dict.items():
            series = []
            for series_name in series_names:
                series.append(layout_pb2.MarginChartContent.Series(
                    value=series_name,
                    lower="xCustomScalars/" + series_name + "/margin_lo",
                    upper="xCustomScalars/" + series_name + "/margin_hi"))
            margin = layout_pb2.MarginChartContent(series=series)
            charts.append(layout_pb2.Chart(title=chart_name, margin=margin))
        categories.append(layout_pb2.Category(title=cat_name, chart=charts))
    layout = summary_lib.custom_scalar_pb(layout_pb2.Layout(category=categories))
    return layout

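# Usage sketch (assumption): the returned custom-scalar layout is typically written once
# to an event file so TensorBoard renders the margin charts. `log_dir` is hypothetical.
def _example_write_layout(log_dir):
    layout = finalize_autosummaries()
    if layout is not None:
        writer = tf.summary.FileWriter(log_dir)
        writer.add_summary(layout)
        writer.flush()
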
def apply_updates(self) -> tf.Operation:
    """Construct training op to update the registered variables based on their gradients."""
    tfutil.assert_tf_initialized()
    assert not self._updates_applied
    self._updates_applied = True
    devices = list(self._dev_grads.keys())
    total_grads = sum(len(grads) for grads in self._dev_grads.values())
    assert len(devices) >= 1 and total_grads >= 1
    ops = []

    with tfutil.absolute_name_scope(self.scope):
        # Cast gradients to FP32 and calculate partial sum within each device.
        dev_grads = OrderedDict()  # device => [(grad, var), ...]

        for dev_idx, dev in enumerate(devices):
            with tf.name_scope("ProcessGrads%d" % dev_idx), tf.device(dev):
                sums = []

                for gv in zip(*self._dev_grads[dev]):
                    assert all(v is gv[0][1] for g, v in gv)
                    g = [tf.cast(g, tf.float32) for g, v in gv]
                    g = g[0] if len(g) == 1 else tf.add_n(g)
                    sums.append((g, gv[0][1]))

                dev_grads[dev] = sums

        # Sum gradients across devices.
        if len(devices) > 1:
            with tf.name_scope("SumAcrossGPUs"), tf.device(None):
                for var_idx, grad_shape in enumerate(self._grad_shapes):
                    g = [dev_grads[dev][var_idx][0] for dev in devices]

                    if np.prod(grad_shape):  # nccl does not support zero-sized tensors
                        g = nccl_ops.all_sum(g)

                    for dev, gg in zip(devices, g):
                        dev_grads[dev][var_idx] = (gg, dev_grads[dev][var_idx][1])

        # Apply updates separately on each device.
        for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
            with tf.name_scope("ApplyGrads%d" % dev_idx), tf.device(dev):
                # Scale gradients as needed.
                if self.use_loss_scaling or total_grads > 1:
                    with tf.name_scope("Scale"):
                        coef = tf.constant(np.float32(1.0 / total_grads), name="coef")
                        coef = self.undo_loss_scaling(coef)
                        grads = [(g * coef, v) for g, v in grads]

                # Check for overflows.
                with tf.name_scope("CheckOverflow"):
                    grad_ok = tf.reduce_all(tf.stack([tf.reduce_all(tf.is_finite(g)) for g, v in grads]))

                # Update weights and adjust loss scaling.
                with tf.name_scope("UpdateWeights"):
                    # pylint: disable=cell-var-from-loop
                    opt = self._dev_opt[dev]
                    ls_var = self.get_loss_scaling_var(dev)

                    if not self.use_loss_scaling:
                        ops.append(tf.cond(grad_ok, lambda: opt.apply_gradients(grads), tf.no_op))
                    else:
                        ops.append(tf.cond(grad_ok,
                                           lambda: tf.group(tf.assign_add(ls_var, self.loss_scaling_inc), opt.apply_gradients(grads)),
                                           lambda: tf.group(tf.assign_sub(ls_var, self.loss_scaling_dec))))

                # Report statistics on the last device.
                if dev == devices[-1]:
                    with tf.name_scope("Statistics"):
                        ops.append(autosummary.autosummary(self.id + "/learning_rate", self.learning_rate))
                        ops.append(autosummary.autosummary(self.id + "/overflow_frequency", tf.where(grad_ok, 0, 1)))

                        if self.use_loss_scaling:
                            ops.append(autosummary.autosummary(self.id + "/loss_scaling_log2", ls_var))

        # Initialize variables and group everything into a single op.
        self.reset_optimizer_state()
        tfutil.init_uninitialized_vars(list(self._dev_ls_var.values()))
        return tf.group(*ops, name="TrainingOp")

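# Usage sketch (assumption): gradients would normally be registered per device via the
# Optimizer's companion register_gradients() method before apply_updates() is called, and
# the returned op run once per training iteration. `opt`, `loss`, and `net` are hypothetical.
def _example_training_op(opt, loss, net):
    opt.register_gradients(loss, net.trainables)  # assumed companion method
    train_op = opt.apply_updates()                # grouped "TrainingOp" from above
    return train_op
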