def __init__(self, params):
  """Creates embedding, dropout, transformer-stack and final-LN children.

  Args:
    params: Layer params; `output_data_format` must be 'TBC' or 'BTC'.
  """
  super(TransformerBatchMajorEncoder, self).__init__(params)
  p = self.params
  assert p.output_data_format in ('TBC', 'BTC')
  if p.shared_emb:
    # Shared softmax/embedding lives in a scope reused by the decoder.
    with tf.variable_scope('shared_emb', reuse=tf.AUTO_REUSE):
      self.CreateChild('softmax', p.shared_emb)
  with tf.variable_scope(p.name):
    p.token_emb.dtype = p.dtype
    if not p.shared_emb:
      self.CreateChild('token_emb', p.token_emb)
    self.CreateChild('position_emb', p.position_emb)
    input_dropout_p = p.input_dropout_tpl.Copy()
    input_dropout_p.keep_prob = 1.0 - p.input_dropout_prob
    self.CreateChild('input_dropout', input_dropout_p)
    if p.transformer_stack:
      self.CreateChild('transformer_stack', p.transformer_stack)
    if p.final_layer_norm:
      final_ln_p = layers.LayerNorm.Params().Set(
          name='final_ln',
          input_dim=p.model_dim,
          use_fused_layernorm=p.use_fused_layernorm,
          fprop_dtype=p.input_dropout_tpl.fprop_dtype)
      self.CreateChild('final_ln', final_ln_p)
def __init__(self, params):
  """Creates the depthwise filter 'w' and optional 'g' / 'b' variables."""
  super(DepthwiseConv2DLayer, self).__init__(params)
  p = self.params
  assert p.name
  filter_pc = py_utils.WeightParams(
      shape=p.filter_shape,
      init=p.params_init,
      dtype=p.dtype,
      collections=[self.__class__.__name__ + '_vars'])
  with tf.variable_scope(p.name):
    self.CreateVariable('w', filter_pc)
    if p.weight_norm:
      # Weight-norm scale: one entry per (in_channels, channel_multiplier).
      scale_pc = py_utils.WeightParams(
          shape=[p.filter_shape[2], p.filter_shape[3]],
          init=py_utils.WeightInit.Constant(0.0),
          dtype=p.dtype,
          collections=[self.__class__.__name__ + '_vars'])
      self.CreateVariable('g', scale_pc)
    if p.bias:
      # NOTE(jiahuiyu): bias is subject to LP regularization in this version.
      bias_pc = py_utils.WeightParams(
          shape=[self.output_channels],
          init=py_utils.WeightInit.Constant(0.0),
          dtype=p.dtype,
          collections=[self.__class__.__name__ + '_vars'])
      self.CreateVariable('b', bias_pc)
def __init__(self, params):
  """Creates the query-generator and attention sub-steps."""
  super(AttentionBlockStep, self).__init__(params)
  p = self.params
  with tf.variable_scope(p.name):
    self.CreateChild('query_generator', p.query_generator)
    self.CreateChild('attention', p.attention)
def __init__(self, params):
  """Creates the stacked reversible sub-layers.

  Args:
    params: Layer params; requires `name` and a non-empty `sub_layer_params`.
  """
  super(StackedRevNetLayer, self).__init__(params)
  # Consistency fix: read from self.params (the layer's own copy, as every
  # other layer in this file does) rather than the raw `params` argument.
  p = self.params
  assert p.name
  assert p.sub_layer_params
  with tf.variable_scope(p.name):
    self.CreateChildren('sub_layers', p.sub_layer_params)
def __init__(self, params):
  """Creates source (and optionally target) embedding / dropout children.

  Args:
    params: Layer params; requires `name`.
  """
  super(GPipeTransformerEmbeddingLayer, self).__init__(params)
  p = self.params
  # Fix: validate the name *before* it is used for the variable scope.
  # The original asserted it only after all children had been created,
  # by which point an empty name would already have produced bad scopes.
  assert p.name
  with tf.variable_scope(p.name):
    p.token_emb.name = 'src_token_emb'
    p.position_emb.name = 'src_position_emb'
    self.CreateChild('src_token_emb', p.token_emb)
    self.CreateChild('src_pos_emb', p.position_emb)
    if p.enc_task_emb:
      self.CreateChild('src_task_emb', p.enc_task_emb)
    p.dropout_tpl.keep_prob = (1.0 - p.input_dropout_prob)
    p.dropout_tpl.name = 'src_dropout'
    self.CreateChild('src_dropout', p.dropout_tpl)
    if p.add_tgt_embedding_layer:
      # Target-side children are copies of the source templates.
      params = p.token_emb.Copy()
      if p.target_vocab_size:
        params.vocab_size = p.target_vocab_size
      params.name = 'tgt_token_emb'
      self.CreateChild('tgt_token_emb', params)
      params = p.position_emb.Copy()
      params.name = 'tgt_position_emb'
      self.CreateChild('tgt_pos_emb', params)
      if p.dec_task_emb:
        self.CreateChild('tgt_task_emb', p.dec_task_emb)
      params = p.dropout_tpl.Copy()
      params.keep_prob = (1.0 - p.input_dropout_prob)
      params.name = 'tgt_dropout'
      self.CreateChild('tgt_dropout', params)
def __init__(self, params):
  """Creates one child step per `p.sub` entry and records evaluation order."""
  super(GraphStep, self).__init__(params)
  p = self.params
  assert p.name
  with tf.variable_scope(p.name):
    self._seq = []
    for i, (signature, external_signature, sub_params) in enumerate(p.sub):
      assert signature
      sig = builder_layers.GraphSignature(signature)
      # Each sub-step takes exactly one input and must declare outputs.
      assert len(sig.inputs) == 1
      assert sig.outputs
      external_sig = None
      if external_signature:
        # External signature: one input endpoint and no outputs.
        external_sig = builder_layers.GraphSignature(external_signature)
        assert len(external_sig.inputs) == 1
        assert not external_sig.outputs
      name = sub_params.name
      if not name:
        # Default name: first output endpoint plus the step index.
        name = '%s_%02d' % (sig.outputs[0], i)
        sub_params.name = name
      self.CreateChild(name, sub_params)
      # NOTE(review): GraphStep._seq is presumably a namedtuple of
      # (name, signature, external_signature, step) declared on the class
      # — not visible here; confirm against the class definition.
      self._seq.append(
          GraphStep._seq(name, sig, external_sig, self.children[name]))
    self.output_signature = builder_layers.GraphSignature(p.output_signature)
def __init__(self, params):
  """Creates the repeated body under a variable-shape-prefix context."""
  super(RepeatLayer, self).__init__(params)
  p = self.params
  assert p.name
  assert p.repeat > 0
  # The prefix context prepends p.repeat to the shape of every variable
  # created by the body.
  with tf.variable_scope(p.name), \
       py_utils.VariableShapePrefixContext(p.repeat):
    self.CreateChild('body', p.body)
def _CreateQStateVar(self, t_name, suffix, params):
  """Creates (once) a non-trainable QState variable `t_name`_`suffix`."""
  name = '%s_%s' % (t_name, suffix)
  assert name not in self._qvars, 'QState var already exists: %s' % (name,)
  # Prefix with the saved q-scope name, but create from the global scope.
  var_name = self._qvars_scope.name + '/' + name
  with tf.variable_scope(py_utils.GetGlobalVariableScope()):
    _, created = py_utils.CreateVariable(var_name, params, trainable=False)
  self._qvars[name] = created
  return created
def __init__(self, params):
  """Creates the reversible f and g sub-blocks.

  Args:
    params: Layer params; requires `name`, `f_params` and `g_params`.
  """
  super(RevNetLayer, self).__init__(params)
  # Consistency fix: read from self.params (the layer's own copy, as every
  # other layer in this file does) rather than the raw `params` argument.
  p = self.params
  assert p.name
  assert p.f_params
  assert p.g_params
  with tf.variable_scope(p.name):
    self.CreateChild('f_block', p.f_params)
    self.CreateChild('g_block', p.g_params)
def __init__(self, params):
  """Creates all parallel children, recording (name, layer) in order."""
  super(ParallelLayer, self).__init__(params)
  p = self.params
  assert p.name
  self._seq = []
  with tf.variable_scope(p.name):
    for sub_p in p.sub:
      self.CreateChild(sub_p.name, sub_p)
      self._seq.append((sub_p.name, self.children[sub_p.name]))
def __init__(self, params):
  """Creates embeddings, an optional dim-matching projection, and the stack.

  Args:
    params: Layer params; token and positional embedding dims must match.
  """
  super(TransformerEncoder, self).__init__(params)
  p = self.params
  if p.shared_emb:
    with tf.variable_scope('shared_emb', reuse=tf.AUTO_REUSE):
      # Naming this 'softmax' to match the name of the same component in the
      # decoder. Variable names need to be the same in order to be reused.
      self.CreateChild('softmax', p.shared_emb)
  with tf.variable_scope(p.name):
    assert p.token_emb.embedding_dim == p.position_emb.embedding_dim
    p.transformer_stack.Set(
        model_dim=p.model_dim, packed_input=p.packed_input)
    if p.model_dim != p.token_emb.embedding_dim:
      # Bug fix: the original message had two '%s' placeholders but passed
      # no arguments, so the mismatching dims were never logged. Pass them
      # as lazy logging args.
      tf.logging.warning(
          'token_emb.embedding_dim != model_dim (%s vs. %s), '
          'creating a projection!', p.token_emb.embedding_dim, p.model_dim)
      proj_p = layers.ProjectionLayer.Params().Copy()
      proj_p.name = 'emb_proj'
      proj_p.input_dim = p.token_emb.embedding_dim
      proj_p.output_dim = p.model_dim
      proj_p.batch_norm = True
      self.CreateChild('emb_proj', proj_p)
    # Token embeddings.
    if not p.shared_emb:
      p.token_emb.dtype = p.dtype
      self.CreateChild('token_emb', p.token_emb)
    # Positional embeddings.
    self.CreateChild('position_emb', p.position_emb)
    # Task embeddings.
    if p.task_emb:
      assert p.task_emb.embedding_dim == p.token_emb.embedding_dim
      self.CreateChild('task_emb', p.task_emb)
    dropout_tpl = layers.DropoutLayer.Params()
    dropout_tpl.keep_prob = (1.0 - p.input_dropout_prob)
    self.CreateChild('input_dropout', dropout_tpl)
    # The stack reuses the encoder's name — presumably for variable-name
    # compatibility with existing checkpoints; verify before renaming.
    p.transformer_stack.name = p.name
    self.CreateChild('transformer_stack', p.transformer_stack)
def __init__(self, params):
  """Initializes bookkeeping and reserves a scope for lazy q-variables."""
  super(PassiveAsymQDomain, self).__init__(params)
  p = self.params
  self._qvars = py_utils.NestedMap()  # var_name -> tf.Variable.
  self._t_names = set()  # Known tensor names (from CreateTensor).
  # Capture the '<name>/q' scope now; variables are created in it lazily.
  with tf.variable_scope(p.name + '/q'):
    self._qvars_scope = tf.get_variable_scope()
def __init__(self, params):
  """Builds the (optional) encoder and the decoder on their devices."""
  super(MTBaseModel, self).__init__(params)
  p = self.params
  with tf.variable_scope(p.name):
    # The encoder is optional (e.g. decoder-only configurations).
    with self._EncoderDevice():
      if p.encoder:
        self.CreateChild('enc', p.encoder)
    # The decoder is always created.
    with self._DecoderDevice():
      self.CreateChild('dec', p.decoder)
def __init__(self, params):
  """Creates BN variables: beta, gamma, and moving mean/variance."""
  super(BatchNormLayer, self).__init__(params)
  p = self.params
  assert p.name
  # Shared zero-init spec for both beta and gamma.
  pc = py_utils.WeightParams(
      shape=[p.dim],
      init=py_utils.WeightInit.Constant(0.0),
      dtype=p.dtype,
      collections=[self.__class__.__name__ + '_vars'])
  with tf.variable_scope(p.name):
    # No learned beta when moving averages are used during training.
    if not p.use_moving_avg_in_training:
      self.CreateVariable('beta', pc)
    if p.gamma_zero_init:
      # zero initialization to BN gamma
      self.CreateVariable('gamma', pc)
    else:
      # Note, The real gamma to use is 1 + gamma.
      self.CreateVariable('gamma', pc, lambda x: 1.0 + x)
    # Two statistics.
    moving_collections = ['moving_vars', self.__class__.__name__ + '_vars']
    if p.add_stats_to_moving_average_variables:
      moving_collections += [tf.GraphKeys.MOVING_AVERAGE_VARIABLES]
    elif p.add_stats_to_moving_average_variables is None:
      # TODO(rpang): force all models to set this param explicitly.
      tf.logging.warning(
          'BatchNormLayer.add_stats_to_moving_average_variables should be '
          'set to True for new models, and to False explicitly for '
          'checkpoint compatibility.')
    # Add to the MOVING_AVERAGE_VARIABLES collection so that they are returned
    # by tf.moving_average_variables() and included in EMA variables if
    # ema_decay is enabled.
    mva = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(0.0),
        dtype=p.dtype,
        collections=moving_collections)
    self.CreateVariable(
        'moving_mean',
        mva,
        trainable=False,
        aggregation=tf.VariableAggregation.MEAN)
    # Moving variance starts at 1 (identity scale).
    mvv = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(1.0),
        dtype=p.dtype,
        collections=moving_collections)
    self.CreateVariable(
        'moving_variance',
        mvv,
        trainable=False,
        aggregation=tf.VariableAggregation.MEAN)
  self._epsilon = 0.001
  self._decay = p.decay
def __init__(self, params):
  """Creates the zero-initialized bias vector 'b' of size p.dims."""
  super(BiasLayer, self).__init__(params)
  p = self.params
  bias_pc = py_utils.WeightParams(
      shape=[p.dims],
      init=py_utils.WeightInit.Constant(0.0),
      dtype=p.dtype,
      collections=[self.__class__.__name__ + '_vars'])
  with tf.variable_scope(p.name):
    self.CreateVariable('b', bias_pc)
def __init__(self, params):
  """Creates the weight matrix 'w' of shape [input_dims, output_dims]."""
  super(LinearLayer, self).__init__(params)
  p = self.params
  weight_pc = py_utils.WeightParams(
      shape=[p.input_dims, p.output_dims],
      init=p.params_init,
      dtype=p.dtype,
      collections=[self.__class__.__name__ + '_vars'])
  with tf.variable_scope(p.name):
    self.CreateVariable('w', weight_pc)
def __init__(self, params):
  """Creates the LR schedule, optimizer, and optional grad-norm tracker."""
  super(Learner, self).__init__(params)
  p = self.params
  self._eval_metrics = {}
  self._var_grads = None
  if p.grad_norm_tracker:
    # Use parent's name for backwards compatibility.
    with tf.variable_scope(self.parent.params.name):
      self.CreateChild('grad_norm_tracker', p.grad_norm_tracker)
  self.CreateChild('lr_schedule', p.lr_schedule)
  self.CreateChild('optimizer', p.optimizer)
def _Acc(vg):
  """Returns a VarGrad whose gradient is folded into an accumulator var."""
  var, grad = vg
  with tf.variable_scope(var.op.name):
    _, accum = py_utils.CreateVariable(
        'grad_accumulator',
        py_utils.WeightParams(var.get_shape(),
                              py_utils.WeightInit.Constant(0.0),
                              self.params.dtype),
        trainable=False)
    accum = tf.assign_add(accum, grad)
  return py_utils.VarGrad(var, accum)
def __init__(self, params):
  """Creates scalar regression vars 'm' and 'b' plus metric accumulators."""
  super(IdentityRegressionTask, self).__init__(params)
  with tf.variable_scope('IdentityRegressionTask'):
    # Both scalars share the same uniform-init spec.
    for var_name in ('m', 'b'):
      self.CreateVariable(
          var_name,
          py_utils.WeightParams(shape=[],
                                init=py_utils.WeightInit.Uniform()))
  self.global_steps = []
  self.metrics = []
  self.result_per_example_tensors = []
def __init__(self, params):
  """Builds the transformer layers, optional output LN, and mergers."""
  super(TransformerStack, self).__init__(params)
  p = self.params
  with tf.variable_scope(p.name):
    # Add transformer layers.
    transformer_layer_params = []
    denom = 1
    if isinstance(p.transformer_tpl, list):
      # With a list of templates, the layer count must be a multiple of
      # the template-cycle length.
      denom = len(p.transformer_tpl)
      assert p.num_transformer_layers % len(p.transformer_tpl) == 0
    for i in range(p.num_transformer_layers // denom):
      if isinstance(p.transformer_tpl, list):
        # Emit one copy of each template per cycle.
        for q in p.transformer_tpl:
          params = q.Copy()
          transformer_layer_params.append(params)
      else:
        params = p.transformer_tpl.Copy()
        transformer_layer_params.append(params)
    # Common settings applied to every layer after expansion.
    for i, params in enumerate(transformer_layer_params):
      params.name = 'trans_%d' % (i)
      params.source_dim = p.model_dim
      params.packed_input = p.packed_input
      params.has_aux_atten = p.has_aux_attention
      params.mask_self_atten = p.mask_self_atten
    self.CreateChildren('trans', transformer_layer_params)
    # Initialize TransformerStack output layer norm
    if p.ln_output:
      params = p.ln_tpl.Copy()
      # Keeping historic 'enc_out_ln' name for checkpoint compatibility.
      params.name = 'enc_out_ln'
      params.input_dim = p.model_dim
      self.CreateChild('layer_norm_out', params)
    if p.is_transparent:
      transparent_params = []
      if not p.num_transparent_outputs:
        raise ValueError('num_transparent_outputs should be greater than 0.')
      for i in range(p.num_transparent_outputs):
        transparent_param = p.transparent_merger_tpl.Copy()
        transparent_param.name = 'transparent_%d' % i
        # Sources: the input plus every transformer layer's output.
        transparent_param.num_sources = 1 + len(transformer_layer_params)
        transparent_params.append(transparent_param)
      self.CreateChildren('transparent_merger', transparent_params)
def __init__(self, params):
  """Creates the graph's sub-layers, recording (name, signature, layer)."""
  super(GraphLayer, self).__init__(params)
  p = self.params
  assert p.name
  assert p.input_endpoints
  self._seq = []
  with tf.variable_scope(p.name):
    for idx, (signature, sub) in enumerate(p.sub):
      assert signature
      sig = GraphSignature(signature)
      assert sig.outputs, '{}'.format(signature)
      child_name = sub.name
      if not child_name:
        # Default name: first output endpoint plus the sub-layer index.
        child_name = '%s_%02d' % (sig.outputs[0], idx)
        sub.name = child_name
      self.CreateChild(child_name, sub)
      self._seq.append((child_name, sig, self.children[child_name]))
def __init__(self, params):
  """Creates the learned per-source weights and the merger dropout child."""
  super(DeterministicWeightsLayer, self).__init__(params)
  p = self.params
  if not p.name:
    raise ValueError('Layer must have a specified name!')
  assert p.num_sources > 0, ('Must specify num_sources > 0.')
  # One learnable weight per source, zero-initialized.
  weight_pc = py_utils.WeightParams(
      shape=[p.num_sources],
      init=py_utils.WeightInit.Constant(0.0),
      dtype=p.dtype,
      collections=[self.__class__.__name__ + '_vars'])
  with tf.variable_scope(p.name):
    self.CreateVariable('sum_weight', weight_pc)
    p.dropout_tpl.name = 'dropout'
    self.CreateChild('weighted_merger_dropout', p.dropout_tpl)
def __init__(self, params):
  """Creates children for `sub`, or `repeat` copies of the whole sequence."""
  super(SequentialLayer, self).__init__(params)
  p = self.params
  assert p.name
  with tf.variable_scope(p.name):
    if p.repeat <= 1:
      self._seq = []
      for sub_p in p.sub:
        self.CreateChild(sub_p.name, sub_p)
        self._seq.append((sub_p.name, self.children[sub_p.name]))
    else:
      # We create 'repeat' number of sub layers. Each sub layer is a
      # sequential layer specified by 'sub'. Naming each repetition
      # '000', '001', ... keeps every copy's scope unique.
      reps = [
          p.Copy().Set(name='%03d' % i, repeat=1) for i in range(p.repeat)
      ]
      self.CreateChildren('rep', reps)
def __init__(self, params):
  """Creates BN variables and accumulators for micro-batch statistics."""
  super(BatchNormLayerNoPadding, self).__init__(params)
  p = self.params
  assert p.name, 'Name of BatchNormLayerNoPadding is not set.'
  p.fprop_dtype = None
  # Skip L-P regularization for these variables.
  collections = [
      self.__class__.__name__ + '_vars', py_utils.SKIP_LP_REGULARIZATION
  ]
  pc = py_utils.WeightParams(
      shape=[p.dim],
      init=py_utils.WeightInit.Constant(0.0),
      dtype=p.dtype,
      collections=collections)
  with tf.variable_scope(p.name):
    self.CreateVariable('beta', pc)
    # Note, The real gamma to use is 1 + gamma.
    self.CreateVariable('gamma', pc, lambda x: 1.0 + x)
    moving_collections = [
        'moving_vars', tf.GraphKeys.MOVING_AVERAGE_VARIABLES,
        self.__class__.__name__ + '_vars'
    ]
    mva = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(0.0),
        dtype=p.dtype,
        collections=moving_collections)
    # Two statistics computed from sufficient stats.
    self.CreateVariable('moving_mean', mva, trainable=False)
    # Moving variance starts at 1 (identity scale).
    mvv = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(1.0),
        dtype=p.dtype,
        collections=moving_collections)
    self.CreateVariable('moving_variance', mvv, trainable=False)
  # Accumulate bn sufficient stats over micro-batches.
  dim = self.vars.beta.shape[0]
  self.RegisterAccumulator('counts', AddingAccumulator([], p.dtype))
  self.RegisterAccumulator('mean_ss', AddingAccumulator([dim], p.dtype))
  self.RegisterAccumulator('variance_ss', AddingAccumulator([dim], p.dtype))
def __init__(self, params):
  """Creates embedding, stacked uni-RNN layers, dropout, optional merger."""
  super(MTEncoderUniRNN, self).__init__(params)
  p = self.params
  assert not p.packed_input, ('Packed inputs are not yet supported for '
                              'MTEncoderUniRNN.')
  with tf.variable_scope(p.name):
    if p.cc_schedule is None:
      self.cc_schedule = None
    else:
      self.CreateChild('cc_schedule', p.cc_schedule)
    self.CreateChild('emb', p.emb)
    rnn_layers_params = []
    num_input_nodes = p.emb.embedding_dim
    for i in range(p.num_lstm_layers):
      cell = p.lstm_tpl.Copy()
      cell.name = 'L%d_rnn' % i
      cell.num_input_nodes = num_input_nodes
      cell.num_output_nodes = p.lstm_cell_size
      params = model_helper.CreateUnidirectionalRNNParams(self.params, cell)
      params.name = 'L%d' % i
      rnn_layers_params.append(params)
      # Next layer's input width is this layer's output width.
      num_input_nodes = cell.num_output_nodes
    self.CreateChildren('rnn', rnn_layers_params)
    dropout_p = layers.DropoutLayer.Params().Set(
        name='dropout_layer',
        keep_prob=1.0 - p.dropout_prob,
        # NOTE(review): the fixed offset presumably decorrelates this
        # dropout stream from others sharing p.random_seed — confirm.
        random_seed=p.random_seed + 827366448 if p.random_seed else None)
    self.CreateChild('dropout', dropout_p)
    if p.is_transparent:
      # Merger combining the outputs of all LSTM layers.
      transparent_params = p.transparent_merger_tpl.Copy()
      transparent_params.name = 'transparent'
      transparent_params.num_sources = p.num_lstm_layers
      self.CreateChild('transparent_merger', transparent_params)
def __init__(self, params):
  """Creates a child QDomain for every configured quantization domain."""
  super(QuantizableLayer, self).__init__(params)
  p = self.params
  self._tracked_tensors = {}  # tracked t_name -> (QDomain)
  self._qstate = None  # t_name -> Tensor
  self._qdomains = {}  # Dict of qdname -> QDomain or None
  # Instantiate quantization domains.
  with tf.variable_scope(p.name + '/q'):
    for qdname in dir(p.qdomain):
      qdparams = p.qdomain.Get(qdname)
      if qdparams is None:
        continue
      assert issubclass(qdparams.cls, QDomain), (
          'Expected quantized domain %s to extend QDomain' % qdname)
      child_name = 'qdomain_' + qdname
      self.CreateChild(child_name, qdparams)
      self._qdomains[qdname] = self.children[child_name]
  self._AddQuantizationFunctions()
def __init__(self, params):
  """Creates the expert-mixing weight 'w' and the repeated body child."""
  super(SoftCondLayer, self).__init__(params)
  p = self.params
  assert p.name
  assert p.num_experts
  assert p.cond_dim
  with tf.variable_scope(p.name):
    # Create Variables for task weight mapping.
    self.CreateVariable(
        'w',
        py_utils.WeightParams(
            shape=[p.cond_dim, p.num_experts],
            init=p.params_init,  # TODO(huangyp): try zero init instead.
            dtype=p.dtype,
            collections=[self.__class__.__name__ + '_vars']))
    # Prepends p.num_experts to the tensor shape of every variable created
    # by p.body.
    with py_utils.VariableShapePrefixContext(p.num_experts):
      self.CreateChild('body', p.body)
def __init__(self, params):
  """Creates schedule state variables, metric history, and best-step op."""
  super(DevBasedSchedule, self).__init__(params)
  p = self.params
  with tf.variable_scope(p.name):
    # Current LR multiplier, starts at 1.
    factor_pc = py_utils.WeightParams(
        shape=[],
        init=py_utils.WeightInit.Constant(1.0),
        collections=['DevBasedSchedule_vars'],
        dtype=tf.float32)
    _, self._cur_factor, = py_utils.CreateVariable(
        'cur_factor', factor_pc, trainable=False)
    # Reference step, starts at 0.
    step_pc = py_utils.WeightParams(
        shape=[],
        init=py_utils.WeightInit.Constant(0),
        collections=['DevBasedSchedule_vars'],
        dtype=tf.int64)
    _, self._ref_step, = py_utils.CreateVariable(
        'ref_step', step_pc, trainable=False)
    self._metric_history = early_stop.MetricHistory(p.metric_history)
    self._best_step = ops.best_step(self._metric_history.hist_file,
                                    p.tolerance)
def Apply(self, lr, var_grad):
  """Applies the gradient to the variable.

  Args:
    lr: A scalar. The base learning rate.
    var_grad: A `.NestedMap` of (var, grad) pairs.

  Returns:
    The variable update op.
  """
  optimizer = self.GetOptimizer(lr)

  def _Apply():
    # Optionally cast bf16 gradients back to float32 before applying.
    if self.params.use_bf16_gradients_ar:
      grads_and_vars = [
          (tf.cast(g, tf.float32), v) for (v, g) in var_grad.Flatten()
      ]
    else:
      grads_and_vars = [(g, v) for (v, g) in var_grad.Flatten()]
    return optimizer.apply_gradients(grads_and_vars, name='meta_backprop')

  if not py_utils.use_resource_variables():
    var_update_op = _Apply()
  else:
    # Many optimizers, e.g., Adam, Adagrad, etc., create
    # variables. We need to ensure name scope and variable scope are
    # cleared. Otherwise, tpu.batch_parallel does not work.
    with tf.name_scope(None):
      with tf.variable_scope(
          tf.VariableScope(use_resource=True,
                           reuse=self.VarReuseForSlotVars())):
        var_update_op = _Apply()
  self.AddSummary(lr, optimizer, var_grad)
  return var_update_op
def __init__(self, params):
  """Partitions embedding/encoder/decoder/softmax layers into pipeline cells."""
  p = params.Copy()
  num_layers = p.num_encoder_layers + p.num_decoder_layers
  if isinstance(p.splits, (list, tuple)):
    # Explicit split boundaries: must be non-decreasing and end at the
    # total layer count.
    assert p.splits[-1] == num_layers
    for i, j in zip(p.splits[:-1], p.splits[1:]):
      assert i <= j, 'Splits must be in increasing order.'
  else:
    # An integer means "this many equal-sized splits": derive boundaries.
    num_splits = p.splits
    layers_per_split = (num_layers - 1) // num_splits + 1
    p.splits = []
    for i in range(num_splits):
      p.splits.append((i + 1) * layers_per_split)
    p.splits[-1] = num_layers
  with tf.variable_scope(p.name):
    transformers = []
    if p.is_transparent:
      p.transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
      p.transparent_merger_tpl.dropout_tpl.keep_prob = (
          1 - p.transparent_merger_dropout_prob)
    # Encoder Embedding layer.
    if len(p.splits) > 1 or p.num_micro_batches > 1:
      # Deterministic dropout is needed when the graph is re-executed
      # across splits / micro-batches.
      p.emb_tpl.dropout_tpl = layers.DeterministicDropoutLayer.Params()
    p.emb_tpl.packed_input = p.packed_input
    p.emb_tpl.is_transparent = p.is_transparent
    p.emb_tpl.add_tgt_embedding_layer = (p.num_decoder_layers > 0)
    p.emb_tpl.name = 'emb'
    p.emb_tpl.batch_dim = p.batch_dim
    transformers.append(p.emb_tpl)
    if p.softmax_tpl:
      p.softmax_tpl.name = 'softmax'
      p.softmax_tpl.inputs_from_decoder = p.num_decoder_layers > 0
    # Encoder layers.
    for i in range(p.num_encoder_layers):
      params = p.encoder_tpl.Copy()
      params.name = 'encoder_%d' % (i)
      if p.is_transparent:
        params.is_transparent = p.is_transparent
        params.final_enc_layer = (i == (p.num_encoder_layers - 1))
      if p.normalize_encoder and (i == (p.num_encoder_layers - 1)):
        params.normalize_output = p.normalize_encoder
        params.final_enc_layer = (i == (p.num_encoder_layers - 1))
      if p.packed_input:
        params.packed_input = p.packed_input
      # Use DeterministicDropoutLayer when used in temp graphs.
      if len(p.splits) > 1 or p.num_micro_batches > 1:
        params = params.cls.SetupDeterministicDropout(params)
      assert not params.has_aux_atten
      if p.is_transparent and i == 0:
        params.transparent_merger_tpl = p.transparent_merger_tpl.Copy()
      transformers.append(params)
    # Decoder layers.
    for i in range(p.num_decoder_layers):
      params = p.decoder_tpl.Copy()
      params.name = 'decoder_%d' % (i)
      params.mask_self_atten = True
      if p.packed_input:
        params.packed_input = p.packed_input
      if len(p.splits) > 1 or p.num_micro_batches > 1:
        params = params.cls.SetupDeterministicDropout(params)
      assert params.has_aux_atten
      transformers.append(params)
    cells = []
    cell_start = 0
    # To account for embedding layers in the pipeline.
    offset = 1
    for split, cell_end in enumerate(p.splits):
      # Layer 0 (embeddings) is always in split 0.
      sub = transformers[cell_start:(cell_end + offset)]
      # The softmax layer rides along in the last cell.
      if split == len(p.splits) - 1 and p.softmax_tpl:
        sub.append(p.softmax_tpl)
      cell = FeatureExtractionLayer.Params().Set(
          name='cell_{}'.format(split), sub=sub)
      cells.append(cell)
      cell_start = cell_end + offset
    p.cell_tpl = cells
  super(GPipeTransformerStack, self).__init__(p)
  if p.label_smoothing:
    self.CreateChild('smoother', p.label_smoothing)