def __init__(self, params):
  """Constructs the layer and creates its 'body' child under p.name scope."""
  super(BranchLayer, self).__init__(params)
  hparams = self.params
  assert hparams.name
  with tf.variable_scope(hparams.name):
    self.CreateChild('body', hparams.body)
def __init__(self, params):
  """Constructs the step and creates the stacked 'sub' children."""
  super(StackStep, self).__init__(params)
  with tf.variable_scope(params.name):
    self.sub_steps = []
    self.CreateChildren('sub', params.sub)
def Apply(self, lr, var_grad):
  """For each optimizer, apply the gradient to the variable.

  Args:
    lr: A scalar. The base learning rate.
    var_grad: A `.NestedMap` of (var, grad) pairs.

  Returns:
    The variable update op.

  Raises:
    Exception: When the regex overlaps with or does not cover all variables.
  """
  # Override inherited GetOptimizer even though learning rate is unused.
  # The 0 argument is a dummy learning rate; each sub-optimizer carries its
  # own rate via the optimizer/lr maps.
  tf_optimizer_map = self.GetOptimizer(0)
  # One (grad, var) bucket per configured regex.
  var_grad_map = {regex: [] for regex in self._optimizer_map}

  # Route each variable to the bucket(s) whose regex matches its name.
  # Variables matched by no regex fall back to 'default_optimizer'; a
  # variable matched more than once is an error (ambiguous routing).
  for (v, g) in var_grad.Flatten():
    regex_match = 0
    for regex in self._optimizer_map:
      if re.match(regex, v.name):
        var_grad_map[regex].append((g, v))
        regex_match += 1
    if regex_match == 0:
      var_grad_map['default_optimizer'].append((g, v))
    if regex_match > 1:
      raise Exception('Variable {} is matched {} times by regex {}'.format(
          v.name, regex_match, list(self._optimizer_map.keys())))

  def _Apply():
    """Use the matched optimizer to apply the gradients."""
    train_ops = []
    non_default_regex = [
        regex for regex in self._optimizer_map if regex != 'default_optimizer'
    ]
    for regex in self._optimizer_map:
      if var_grad_map[regex]:
        opt = tf_optimizer_map[regex]
        train_ops.append(opt.apply_gradients(var_grad_map[regex]))
        # The lambdas below are evaluated immediately by FilterKeyVal, so the
        # loop-variable capture flagged by pylint is safe here.
        # pylint: disable=cell-var-from-loop, g-long-lambda
        if regex == 'default_optimizer':
          # Default bucket summarizes variables NOT claimed by any other regex.
          filtered_var_grad = var_grad.FilterKeyVal(lambda k, v: any(
              [re.match(i, v.var.name) for i in non_default_regex]))
        else:
          filtered_var_grad = var_grad.FilterKeyVal(
              lambda k, v: (re.match(regex, v.var.name)))
        # pylint: enable=cell-var-from-loop, g-long-lambda
        self._optimizer_map[regex].AddSummary(self._lr_map[regex], opt,
                                              filtered_var_grad)
    return tf.group(*train_ops, name='composite_optimizer_train_op')

  if not py_utils.use_resource_variables():
    var_update_op = _Apply()
  else:
    # Many optimizers, e.g., Adam, Adagrad, etc., create
    # variables. We need to ensure name scope and variable scope are
    # cleared. Otherwise, tpu.batch_parallel does not work.
    var_reuse = False
    if py_utils.GetOpportunisticVariableReuse():
      var_reuse = tf.AUTO_REUSE
    with tf.name_scope(None):
      with tf.variable_scope(
          tf.VariableScope(use_resource=True, reuse=var_reuse)):
        var_update_op = _Apply()
  return var_update_op
def __init__(self, params):
  """Constructs an MTEncoderV1.

  Builds, under the layer's variable scope: an optional clipping-cap
  schedule, the token embedding, one bidirectional LSTM layer (L0),
  `num_lstm_layers - 1` unidirectional LSTM layers, and a dropout layer.
  Packed inputs are rejected up front.
  """
  super(MTEncoderV1, self).__init__(params)
  p = self.params
  assert not p.packed_input, ('Packed inputs are not yet supported for '
                              'MTEncoderV1.')
  with tf.variable_scope(p.name):
    if p.cc_schedule is not None:
      self.CreateChild('cc_schedule', p.cc_schedule)
    self.CreateChild('emb', p.emb)

    rnn_layers_params = []
    # L0 is a bi-directional lstm.
    # L0's forward lstm cell. Falls back to the generic lstm_tpl when no
    # bidi-specific template is provided.
    if p.lstm_tpl_bidi is None:
      params = p.lstm_tpl.Copy()
    else:
      params = p.lstm_tpl_bidi.Copy()
    params.name = 'L0_rnn_fwd'
    params.num_input_nodes = p.emb.embedding_dim
    params.num_output_nodes = p.lstm_cell_size
    forward_lstm = params
    # L0's backward lstm cell: same config as forward, different name.
    params = params.Copy()
    params.name = 'L0_rnn_bak'
    backward_lstm = params
    # L0 layer.
    params = model_helper.CreateBidirectionalRNNParams(
        self.params, forward_lstm, backward_lstm)
    params.name = 'L0'
    rnn_layers_params.append(params)

    # The latter layers are all uni-directional lstm.
    # L1's input is the bidi L0 output: forward+backward concatenation.
    input_size = 2 * p.lstm_cell_size
    for i in range(1, p.num_lstm_layers):
      # Forward lstm cell.
      if p.lstm_tpl_uni is None:
        cell = p.lstm_tpl.Copy()
      else:
        cell = p.lstm_tpl_uni.Copy()
      cell.name = 'L%d_rnn' % i
      cell.num_input_nodes = input_size
      cell.num_output_nodes = p.lstm_cell_size
      # Forward lstm layer.
      params = model_helper.CreateUnidirectionalRNNParams(self.params, cell)
      params.name = 'L%d' % i
      rnn_layers_params.append(params)
      # Subsequent layers consume a single direction's output.
      input_size = p.lstm_cell_size
    self.CreateChildren('rnn', rnn_layers_params)

    # Fixed offset decorrelates this layer's dropout seed from other layers
    # sharing the same base random_seed.
    dropout_p = layers.DropoutLayer.Params().Set(
        name='dropout_layer',
        keep_prob=1.0 - p.dropout_prob,
        random_seed=p.random_seed + 84828474 if p.random_seed else None)
    self.CreateChild('dropout', dropout_p)
def __init__(self, params):
  """Constructs the step and creates the wrapped 'layer' child."""
  super(StatelessLayerStep, self).__init__(params)
  with tf.variable_scope(params.name):
    self.CreateChild('layer', params.layer)
def __init__(self, params):
  """Constructs an AsrEncoder.

  Builds, under the layer's variable scope: an optional SpecAugment
  network, a stack of conv layers, optional bidirectional ConvLSTM blocks
  (each followed by a projection CNN), a stack of bidirectional LSTM
  layers with optional inter-layer projections, optional highway-skip
  layers, and an optional frame-stacking layer.
  """
  super(AsrEncoder, self).__init__(params)
  p = self.params
  name = p.name
  with tf.variable_scope(name):
    # Use specAugment or not.
    if p.use_specaugment:
      self.CreateChild('specaugment', p.specaugment_network.Copy())
    # First create the conv layers.
    assert p.num_cnn_layers == len(p.conv_filter_shapes)
    assert p.num_cnn_layers == len(p.conv_filter_strides)
    params_conv_layers = []
    for i in range(p.num_cnn_layers):
      conv_p = p.cnn_tpl.Copy()
      conv_p.name = 'conv_L%d' % (i)
      conv_p.filter_shape = p.conv_filter_shapes[i]
      conv_p.filter_stride = p.conv_filter_strides[i]
      conv_p.is_eval = p.is_eval
      params_conv_layers.append(conv_p)
    self.CreateChildren('conv', params_conv_layers)

    # Propagate the input shape through the conv stack to size later layers.
    conv_output_shape = p.input_shape
    for i in range(p.num_cnn_layers):
      conv_output_shape = self.conv[i].OutShape(conv_output_shape)
    assert len(conv_output_shape) == 4  # batch, height, width, channel.

    params_conv_lstm_rnn = []
    params_conv_lstm_cnn = []
    for i in range(p.num_conv_lstm_layers):
      # NOTE(yonghui): We assume that output from ConvLSTMBlock has the same
      # shape as its input.
      _, _, width, in_channel = conv_output_shape
      f_conv_lstm_p = p.conv_lstm_tpl.Copy()
      f_conv_lstm_p.name = 'f_conv_lstm_%d' % (i)
      f_conv_lstm_p.inputs_shape = [None, 1, width, in_channel]
      f_conv_lstm_p.cell_shape = [None, 1, width, in_channel]
      # Backward direction mirrors the forward cell config.
      b_conv_lstm_p = f_conv_lstm_p.Copy()
      b_conv_lstm_p.name = 'b_conv_lstm_%d' % (i)
      conv_lstm_rnn_p = self.CreateConvLstmLayerParams()
      conv_lstm_rnn_p.name = 'conv_lstm_rnn'
      conv_lstm_rnn_p.fwd = f_conv_lstm_p
      conv_lstm_rnn_p.bak = b_conv_lstm_p
      params_conv_lstm_rnn.append(conv_lstm_rnn_p)
      # CNN after the bidi ConvLSTM: takes fwd+bak concatenated channels and
      # projects back down to in_channel.
      cnn_p = p.after_conv_lstm_cnn_tpl.Copy()
      cnn_p.name = 'conv_lstm_cnn_%d' % (i)
      cnn_p.filter_shape[2] = 2 * in_channel
      cnn_p.filter_shape[3] = in_channel
      params_conv_lstm_cnn.append(cnn_p)
    # TODO(yonghui): Refactor ConvLSTMBlock into a layer.
    self.CreateChildren('conv_lstm_rnn', params_conv_lstm_rnn)
    self.CreateChildren('conv_lstm_cnn', params_conv_lstm_cnn)
    (self._first_lstm_input_dim,
     self._first_lstm_input_dim_pad) = self.FirstLstmLayerInputDimAndPadding(
         conv_output_shape, pad_to_multiple=16)

    # Now create all the rnn layers and projection layers.
    # TODO(yonghui): take care of device placement.
    params_rnn_layers = []
    params_proj_layers = []
    params_highway_skip_layers = []
    output_dim = self._first_lstm_input_dim
    for i in range(p.num_lstm_layers):
      input_dim = output_dim
      forward_p = p.lstm_tpl.Copy()
      forward_p.name = 'fwd_rnn_L%d' % (i)
      forward_p.num_input_nodes = input_dim
      forward_p.num_output_nodes = p.lstm_cell_size
      backward_p = forward_p.Copy()
      backward_p.name = 'bak_rnn_L%d' % (i)
      rnn_p = self.CreateBidirectionalRNNParams(forward_p, backward_p)
      rnn_p.name = 'brnn_L%d' % (i)
      params_rnn_layers.append(rnn_p)
      # Bidirectional output: fwd + bak concatenated.
      output_dim = 2 * p.lstm_cell_size

      # Projections are inserted between layers only (not after the last).
      if p.project_lstm_output and (i < p.num_lstm_layers - 1):
        proj_p = p.proj_tpl.Copy()
        proj_p.input_dim = 2 * p.lstm_cell_size
        proj_p.output_dim = 2 * p.lstm_cell_size
        proj_p.name = 'proj_L%d' % (i)
        proj_p.is_eval = p.is_eval
        params_proj_layers.append(proj_p)

      # add the skip layers
      residual_index = i - p.residual_start + 1
      if p.residual_start > 0 and residual_index >= 0 and p.highway_skip:
        highway_skip = p.highway_skip_tpl.Copy()
        highway_skip.name = 'enc_hwskip_%d' % len(params_highway_skip_layers)
        highway_skip.input_dim = 2 * p.lstm_cell_size
        params_highway_skip_layers.append(highway_skip)

      # Adds the stacking layer.
      if p.layer_index_before_stacking == i:
        stacking_layer = p.stacking_layer_tpl.Copy()
        stacking_layer.name = 'stacking_%d' % (i)
        self.CreateChild('stacking', stacking_layer)
        # Stacking concatenates (left + current + right) frames, widening the
        # feature dim fed to the next LSTM layer.
        stacking_window_len = (
            p.stacking_layer_tpl.left_context + 1 +
            p.stacking_layer_tpl.right_context)
        output_dim *= stacking_window_len

    self.CreateChildren('rnn', params_rnn_layers)
    self.CreateChildren('proj', params_proj_layers)
    self.CreateChildren('highway_skip', params_highway_skip_layers)
def __init__(self, params):
  """Constructs the task.

  Sets up bookkeeping attributes, the (possibly task-local) global step
  variable, eval summary sample-count coordination, the input generator
  child, and the learner children.
  """
  assert issubclass(params.cls, BaseTask)
  # Ensure global_step exists before calling super.
  py_utils.GetOrCreateGlobalStepVar()
  super().__init__(params)
  p = self.params

  self._encoder = None
  self._online_encoder = None
  self._decoder = None
  self._loss = None
  self._num_predictions = None
  self._train_op = None
  self._post_train_ops = []
  self._eval_metrics = {}
  self._per_example = {}

  # Create the gradient mask,
  self._per_input_gradient_mask = None

  if p.task_global_step:
    # Task-local step variable, created in the top-level variable scope
    # (name scope cleared) so its name is '<task_name>_global_step'.
    with tf.name_scope(None), tf.variable_scope(
        py_utils.GetGlobalVariableScope()):
      var_name = p.name + '_global_step'
      # Create the variable immediately.
      self._CreateVariableInternal(
          var_name,
          base_layer.CreateVariableMeta(
              var_params=py_utils.WeightParams([],
                                               py_utils.WeightInit.Constant(0),
                                               tf.int64),
              theta_fn=None,
              kwargs=dict(
                  trainable=False,
                  collections=[tf.GraphKeys.GLOBAL_VARIABLES])))
      summary_utils.scalar(var_name, self._private_vars[var_name])
      self._global_step_var = self._private_vars[var_name]
  else:
    self._global_step_var = py_utils.GetOrCreateGlobalStepVar()

  if p.input:
    # TODO(zhifengc): Consider a simpler way to ensure the input
    # generator stops after one epoch.
    if self.do_eval and p.eval:
      seq_inp = issubclass(p.input.cls,
                           base_input_generator.BaseInputGeneratorFromFiles)
      if p.input.num_samples > 0:
        if (p.eval.samples_per_summary == 0) or (
            p.input.num_samples < p.eval.samples_per_summary):
          p.eval.samples_per_summary = p.input.num_samples
          # If we know the dataset size and we want to evaluate the full
          # set, we need to coordinate the input generator to flush out
          # all samples so the evaler and decoder compute metrics on the
          # whole set for each summary step.
          if seq_inp:
            p.input.flush_every_n = p.input.num_samples
        if p.eval.decoder_samples_per_summary is not None and (
            p.eval.decoder_samples_per_summary > p.input.num_samples):
          p.eval.decoder_samples_per_summary = p.input.num_samples
      # Input-level overrides take precedence over the eval params.
      if p.input.eval_samples_per_summary is not None:
        p.eval.samples_per_summary = p.input.eval_samples_per_summary
      if p.input.decoder_samples_per_summary is not None:
        p.eval.decoder_samples_per_summary = (
            p.input.decoder_samples_per_summary)
      if p.input.num_samples == 0 and not p.input.resettable:
        # Dataset size is unknown. Computes eval summary based on num_samples.
        # We require static dataset size for non-resettable inputs.
        assert p.eval.samples_per_summary > 0
      if seq_inp and p.input.num_batcher_threads > 1:
        tf.logging.warning('input.num_batcher_threads > 1 inside eval mode. '
                           'The input generator may not iterate over exactly '
                           'one epoch per run')
    tf.logging.info('input_params: %s', p.input)
    input_params = self.cluster.PlaceInput(p.input)

    # For TPU training, we create the input generator in a
    # different scope and AddChild it in later.
    if 'skip_create_child' not in p.input:
      self.CreateChild('input', input_params)

  tp = p.train
  # p.train can be None if this task is the teacher/student task in a
  # DistillationTask.
  if tp:
    self._SetLearnerFromLegacyParams(tp)
    if tp.learner is not None:
      if isinstance(tp.learner, (list, tuple)):
        self.CreateChildren('learners', tp.learner)
      else:
        self.CreateChildren('learners', [tp.learner])
  self._UpdateVnConfig()
def __init__(self, params):
  """Constructs the detector: 3D utils helper plus the output decoder child."""
  super(PointDetectorBase, self).__init__(params)
  self._utils_3d = detection_3d_lib.Utils3D()
  cfg = self.params
  with tf.variable_scope(cfg.name):
    self.CreateChild('output_decoder', cfg.output_decoder)
def __init__(self, params):
  """Constructs the step and creates its RNN 'cell' child."""
  super(RnnStep, self).__init__(params)
  with tf.variable_scope(params.name):
    self.CreateChild('cell', params.cell)
def _CreateChildrenVariables(self):
  """Creates child variables, instantiating softmax vars under 'shared_emb'."""
  if self.params.shared_emb:
    # AUTO_REUSE lets the 'shared_emb' scope return existing variables if
    # they were already created elsewhere.
    with tf.variable_scope('shared_emb', reuse=tf.AUTO_REUSE):
      self.softmax.InstantiateVariables()
  super()._CreateChildrenVariables()
def __init__(self, params):
  """Constructs the task and creates its 'encoder' child."""
  super(TestTask, self).__init__(params)
  cfg = self.params
  with tf.variable_scope(cfg.name):
    self.CreateChild('encoder', cfg.encoder)
def _resource_apply_dense(self, grad, var):
  """Applies one Adafactor update to a dense variable.

  Maintains factored (row/col) or unfactored second-moment accumulators,
  optional first-moment (beta1) momentum, optional update clipping, and —
  when `self._cond_is_finite` is set — guards every state write behind a
  finiteness check so a non-finite gradient leaves state untouched.

  Returns a grouped update op, or [] when grad is None.
  """
  if grad is None:
    tf.logging.warning('Gradient is None for variable %s' % var.name)
    return []

  grad_dtype = var.dtype  # TODO(lepikhin): add to params
  grad = tf.cast(grad, grad_dtype)
  # Non-empty factored_dims => use row/col factored second moments (vr, vc);
  # otherwise a full-shape accumulator v.
  factored_dims = self._factored_dims(var.shape.as_list())
  if factored_dims:
    vr = self.get_slot(var, 'vr')
    vc = self.get_slot(var, 'vc')
  else:
    v = self.get_slot(var, 'v')
  if self._beta1:
    m = self.get_slot(var, 'm')

  cond = tf.constant(True)

  def _Upd(c, x):
    # Accumulate "x is entirely finite" into the running condition; no-op
    # unless finiteness checking is enabled.
    if not self._cond_is_finite:
      return c
    c = tf.math.logical_and(c, tf.reduce_all(tf.math.is_finite(x)))
    c = tf.math.logical_and(
        c, tf.reduce_all(tf.math.logical_not(tf.math.is_inf(x))))
    return c

  def _Wrap(fn, x, y):
    # Apply fn(x, y) only when the finiteness condition holds; otherwise
    # leave x unchanged. Unconditional when checking is disabled.
    if not self._cond_is_finite:
      return fn(x, y)
    return tf.cond(cond, lambda: fn(x, y), lambda: x)

  with tf.variable_scope(var.name[:-2] + '/Adafactor'):
    # epsilon1 keeps the squared-grad statistics strictly positive.
    grad_squared = tf.math.square(grad) + tf.cast(self._epsilon1, grad_dtype)
    cond = _Upd(cond, grad_squared)
    decay_rate = tf.cast(self._decay_rate, var.dtype)
    old_val = tf.identity(var)  # TODO(lepikhin): introduce gradient dtype
    if self._multiply_by_parameter_scale:
      update_scale = self._parameter_scale(old_val) * tf.cast(
          self._learning_rate, grad_dtype)
    else:
      update_scale = self._learning_rate
    mixing_rate = tf.cast(1.0 - decay_rate, grad_dtype)
    update_scale = tf.cast(update_scale, grad_dtype)
    updates = []
    if factored_dims:
      d0, d1 = factored_dims
      vr_axis, vc_axis = d0, d1
      grad_squared_row_mean = tf.reduce_mean(grad_squared, axis=vr_axis)
      grad_squared_col_mean = tf.reduce_mean(grad_squared, axis=vc_axis)
      # new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
      new_vr = vr * decay_rate + grad_squared_row_mean * mixing_rate
      # new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
      new_vc = vc * decay_rate + grad_squared_col_mean * mixing_rate
      cond = _Upd(cond, new_vr)
      cond = _Upd(cond, new_vc)
      vr_update = _Wrap(tf.assign, vr, new_vr)
      vc_update = _Wrap(tf.assign, vc, new_vc)
      updates.extend([vr_update, vc_update])
      # Normalize the row factor by its mean so the factored product
      # approximates the full second-moment matrix.
      long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
      r_factor = tf.math.rsqrt(new_vr / long_term_mean)
      c_factor = tf.math.rsqrt(new_vc)
      x = grad * tf.expand_dims(r_factor, vr_axis) * tf.expand_dims(
          c_factor, vc_axis)
    else:
      new_v = v * decay_rate + grad_squared * mixing_rate
      cond = _Upd(cond, new_v)
      v_update = _Wrap(tf.assign, v, new_v)
      updates.append(v_update)
      x = grad * tf.math.rsqrt(new_v)
    if self._clipping_threshold is not None:
      # Scale the update down when its RMS exceeds the threshold.
      clipping_denom = tf.maximum(
          tf.constant(1.0, grad_dtype),
          _ReduceRms(x) / tf.constant(self._clipping_threshold, grad_dtype))
      x /= clipping_denom
    subtrahend = x * update_scale
    if self._beta1:
      # Momentum: exponential moving average of the scaled updates.
      new_m = (
          m * tf.constant(self._beta1, dtype=grad_dtype) +
          subtrahend * tf.constant(1.0 - self._beta1, dtype=grad_dtype))
      subtrahend = new_m
      cond = _Upd(cond, new_m)
      updates.append(_Wrap(tf.assign, m, new_m))
    # It is critical to use assign_sub instead of tf.assign(var - subtrahend)
    # for the case of bfloat16 activations, so as to avoid repeatedly
    # rounding the slice value, which results in poor quality.
    cond = _Upd(cond, subtrahend)
    var_update = _Wrap(tf.assign_sub, var, subtrahend)
    updates.append(var_update)
    return tf.group(*updates)
def try_apply_dense(self, grad, var):
  """Dry-runs the Adafactor update for diagnostics, without mutating state.

  Mirrors the math of `_resource_apply_dense` but replaces every slot/var
  assignment with bookkeeping: each intermediate tensor is recorded in
  `stats` and a per-tensor all-finite check is appended to
  `is_finite_checks`. No variables are updated (the assign calls are
  intentionally commented out).

  Returns:
    (is_finite_checks, stats): a list of boolean scalars and a dict of the
    named intermediate tensors.
  """
  assert grad is not None

  cond = tf.constant(True)
  is_finite_checks = []
  stats = {}

  grad_dtype = var.dtype  # TODO(lepikhin): add to params
  grad = tf.cast(grad, grad_dtype)
  factored_dims = self._factored_dims(var.shape.as_list())
  if factored_dims:
    vr = self.get_slot(var, 'vr')
    vc = self.get_slot(var, 'vc')
  else:
    v = self.get_slot(var, 'v')
  if self._beta1:
    m = self.get_slot(var, 'm')

  def _Upd(c, k, x):
    # Record tensor x under key k and queue its finiteness check; the
    # condition itself is passed through unchanged here.
    stats[k] = x
    is_finite_checks.append(tf.reduce_all(tf.math.is_finite(x)))
    return c

  with tf.variable_scope(var.name[:-2] + '/Adafactor'):
    grad_squared = tf.math.square(grad) + tf.cast(self._epsilon1, grad_dtype)
    cond = _Upd(cond, 'grad_squared', grad_squared)  # 0 (factored)
    decay_rate = tf.cast(self._decay_rate, var.dtype)
    old_val = tf.identity(var)  # TODO(lepikhin): introduce gradient dtype
    # This diagnostic path currently only supports parameter-scale updates.
    assert self._multiply_by_parameter_scale
    if self._multiply_by_parameter_scale:
      parameter_scale = self._parameter_scale(old_val)
      cond = _Upd(cond, 'parameter_scale', parameter_scale)  # 1 (factored)
      update_scale = self._parameter_scale(old_val) * tf.cast(
          self._learning_rate, grad_dtype)
    else:
      update_scale = self._learning_rate
    mixing_rate = tf.cast(1.0 - decay_rate, grad_dtype)
    update_scale = tf.cast(update_scale, grad_dtype)
    if factored_dims:
      d0, d1 = factored_dims
      vr_axis, vc_axis = d0, d1
      grad_squared_row_mean = tf.reduce_mean(grad_squared, axis=vr_axis)
      grad_squared_col_mean = tf.reduce_mean(grad_squared, axis=vc_axis)
      # new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
      new_vr = vr * decay_rate + grad_squared_row_mean * mixing_rate
      # new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
      new_vc = vc * decay_rate + grad_squared_col_mean * mixing_rate
      cond = _Upd(cond, 'new_vr', new_vr)  # 2 (factored)
      cond = _Upd(cond, 'new_vc', new_vc)  # 3 (factored)
      # Assignments deliberately disabled in this dry-run variant:
      # vr_update = _Wrap(tf.assign, vr, new_vr)
      # vc_update = _Wrap(tf.assign, vc, new_vc)
      # updates.extend([vr_update, vc_update])
      long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
      r_factor = tf.math.rsqrt(new_vr / long_term_mean)
      c_factor = tf.math.rsqrt(new_vc)
      mult = tf.expand_dims(r_factor, vr_axis) * tf.expand_dims(
          c_factor, vc_axis)
      cond = _Upd(cond, 'mult', mult)  # 4 (factored)
      x = grad * mult
    else:
      new_v = v * decay_rate + grad_squared * mixing_rate
      cond = _Upd(cond, 'new_v', new_v)
      # v_update = _Wrap(tf.assign, v, new_v)
      # updates.append(v_update)
      x = grad * tf.math.rsqrt(new_v)
    # This diagnostic path currently requires clipping to be configured.
    assert self._clipping_threshold is not None
    if self._clipping_threshold is not None:
      clipping_denom = tf.maximum(
          tf.constant(1.0, grad_dtype),
          _ReduceRms(x) / tf.constant(self._clipping_threshold, grad_dtype))
      x /= clipping_denom
    cond = _Upd(cond, 'x', x)
    subtrahend = x * update_scale
    if self._beta1:
      new_m = (
          m * tf.constant(self._beta1, dtype=grad_dtype) +
          subtrahend * tf.constant(1.0 - self._beta1, dtype=grad_dtype))
      subtrahend = new_m
      cond = _Upd(cond, 'new_m', new_m)
      # updates.append(_Wrap(tf.assign, m, new_m))
    # It is critical to use assign_sub instead of tf.assign(var - subtrahend)
    # for the case of bfloat16 activations, so as to avoid repeatedly
    # rounding the slice value, which results in poor quality.
    cond = _Upd(cond, 'subtrahend', subtrahend)  # 5 (factored)
    # var_update = _Wrap(tf.assign_sub, var, subtrahend)
    # updates.append(var_update)
    return is_finite_checks, stats
def __init__(self, params):
  """Constructs the layer and creates its 'featurizer' child."""
  super(PointsToGridFeaturizer, self).__init__(params)
  cfg = self.params
  with tf.variable_scope(cfg.name):
    self.CreateChild('featurizer', cfg.featurizer)
def __init__(self, params):
  """Constructs the layer and creates the 'sub' child it parallelizes."""
  super(BatchParallelLayer, self).__init__(params)
  hparams = self.params
  assert hparams.name
  with tf.variable_scope(hparams.name):
    self.CreateChild('sub', hparams.sub)
def __init__(self, params):
  """Constructs the step and creates its parallel 'sub' children."""
  super(ParallelStep, self).__init__(params)
  with tf.variable_scope(params.name):
    self.CreateChildren('sub', params.sub)
def _CreateChildrenVariables(self):
  """Instantiates the repeated body's variables under this layer's scope."""
  p = self.params
  with tf.variable_scope(p.name):
    # Variables created inside get their shapes prefixed by p.repeat
    # (see py_utils.VariableShapePrefixContext).
    with py_utils.VariableShapePrefixContext(p.repeat):
      self.body.InstantiateVariables()
  super()._CreateChildrenVariables()
def __init__(self, params):
  """Constructs an MTEncoderBiRNN.

  Builds, under the layer's variable scope: an optional clipping-cap
  schedule, the token embedding, a stack of bidirectional LSTM layers, an
  optional final projection to `encoder_out_dim`, a dropout layer, and an
  optional transparent (layer-merging) combiner.
  """
  super(MTEncoderBiRNN, self).__init__(params)
  p = self.params
  with tf.variable_scope(p.name):
    if p.cc_schedule is None:
      self.cc_schedule = None
    else:
      self.CreateChild('cc_schedule', p.cc_schedule)
    self.CreateChild('emb', p.emb)

    rnn_layers_params = []
    for i in range(p.num_lstm_layers):
      params = p.lstm_tpl.Copy()
      params.name = 'L%d_rnn_fwd' % i
      # First layer consumes embeddings; deeper layers consume the previous
      # bidi layer's fwd+bak concatenated output.
      if i == 0:
        params.num_input_nodes = p.emb.embedding_dim
      else:
        params.num_input_nodes = 2 * p.lstm_cell_size
      params.num_output_nodes = p.lstm_cell_size
      params.reset_cell_state = p.packed_input
      forward_lstm = params
      # Backward cell mirrors the forward config.
      params = params.Copy()
      params.name = 'L%d_rnn_bak' % i
      params.reset_cell_state = p.packed_input
      backward_lstm = params
      params = model_helper.CreateBidirectionalRNNParams(
          self.params, forward_lstm, backward_lstm)
      params.packed_input = p.packed_input
      params.name = 'L%d' % i
      rnn_layers_params.append(params)
    self.CreateChildren('rnn', rnn_layers_params)

    if p.lstm_cell_size * 2 != p.encoder_out_dim:
      # Project the encoder output to the desired dim.
      proj_p = p.proj_tpl.Copy().Set(
          name='proj',
          batch_norm=False,
          input_dim=p.lstm_cell_size * 2,
          output_dim=p.encoder_out_dim)
      # With a clipping-cap schedule, use a bias-free TANH projection;
      # otherwise a plain affine projection.
      if p.cc_schedule is not None:
        proj_p.has_bias = False
        proj_p.activation = 'TANH'
      else:
        proj_p.has_bias = True
        proj_p.activation = 'NONE'
      self.CreateChild('final_proj', proj_p)

    # Fixed offset decorrelates this layer's dropout seed from other layers
    # sharing the same base random_seed.
    dropout_p = layers.DropoutLayer.Params().Set(
        name='dropout_layer',
        keep_prob=1.0 - p.dropout_prob,
        random_seed=p.random_seed + 827366448 if p.random_seed else None)
    self.CreateChild('dropout', dropout_p)

    if p.is_transparent:
      transparent_params = p.transparent_merger_tpl.Copy()
      transparent_params.name = 'transparent'
      transparent_params.num_sources = p.num_lstm_layers
      self.CreateChild('transparent_merger', transparent_params)
def __init__(self, params):
  """Constructs the model and creates its 'insertion' child."""
  super(InsertionModel, self).__init__(params)
  with tf.variable_scope(self.params.name):
    self.CreateChild('insertion', self.params.insertion)
def _CreateLayerVariables(self):
  """Captures the 'q' variable scope for variables created lazily later."""
  with tf.variable_scope('q'):
    # Store the scope object itself so later creation code can re-enter it.
    self._qvars_scope = tf.get_variable_scope()