def FProp(self, theta, input_batch):
  p = self.params
  with tf.name_scope(p.name):
    inputs = py_utils.with_dependencies([
        py_utils.assert_shape_match(tf.shape(input_batch.ids), [-1, -1]),
        py_utils.assert_shape_match(
            tf.shape(input_batch.ids), tf.shape(input_batch.paddings))
    ], tf.transpose(input_batch.ids))
    paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2)
    if p.packed_input:
      src_segment_id = tf.expand_dims(
          tf.transpose(input_batch.segment_ids), 2)
    else:
      src_segment_id = None
    xs = self.emb.EmbLookup(theta.emb, inputs)
    xs = self.ApplyClipping(theta, xs)
    summary_utils.histogram('input_emb', xs)
    xs = self.dropout.FProp(theta.dropout, xs)
    ps = paddings
    # Now the rnn layers.
    outputs_list = []
    for i in range(0, p.num_lstm_layers):
      layer = self.rnn[i]
      ys = layer.FProp(theta.rnn[i], xs, ps, segment_id=src_segment_id)
      ys = self.dropout.FProp(theta.dropout, ys)
      if i >= p.residual_start:
        xs += ys  # Residual skip
        xs = self.ApplyClipping(theta, xs)
      else:
        xs = ys
      outputs_list.append(xs)
      summary_utils.histogram('layer_out_%s' % i, xs)

    if p.is_transparent:
      xs = self.transparent_merger.FProp(theta.transparent_merger,
                                         outputs_list)

    if p.lstm_cell_size * 2 != p.encoder_out_dim:
      # Project to the right depth.
      xs = self.final_proj.FProp(theta.final_proj, xs, ps)
      summary_utils.histogram('final_proj_out', xs)

    if src_segment_id is not None:
      src_segment_id = tf.squeeze(src_segment_id, [2])
    return py_utils.NestedMap(
        encoded=xs, padding=tf.squeeze(ps, [2]), segment_id=src_segment_id)

def _update_mask(self, weights, threshold):
  """Updates the mask for a given weight tensor.

  This function first computes the cdf of the weight tensor, and estimates
  the threshold value such that 'desired_sparsity' fraction of weights have
  magnitude less than the threshold.

  Args:
    weights: The weight tensor that needs to be masked.
    threshold: The current threshold value. The function will compute a new
      threshold and return the exponential moving average using the current
      value of threshold.

  Returns:
    new_threshold: The new value of the threshold based on weights, and
      sparsity at the current global_step.
    new_mask: A tensor of the same size and shape as weights containing 0 or
      1 to indicate which of the values in weights falls below the threshold.

  Raises:
    ValueError: if sparsity is not defined.
  """
  if self._sparsity is None:
    raise ValueError('Sparsity variable undefined')

  sparsity = self._get_sparsity(weights.op.name)
  with tf.name_scope(weights.op.name + '_pruning_ops'):
    abs_weights = tf.abs(weights)
    k = tf.cast(
        tf.round(
            tf.cast(tf.size(abs_weights), tf.float32) * (1 - sparsity)),
        tf.int32)
    # Sort the entire array.
    values, _ = tf.nn.top_k(
        tf.reshape(abs_weights, [-1]), k=tf.size(abs_weights))
    # Grab the (k-1)th value, i.e. the k-th largest magnitude.
    current_threshold = tf.gather(values, k - 1)
    smoothed_threshold = tf.add_n([
        tf.multiply(current_threshold, 1 - self._spec.threshold_decay),
        tf.multiply(threshold, self._spec.threshold_decay)
    ])

    new_mask = tf.cast(
        tf.greater_equal(abs_weights, smoothed_threshold), tf.float32)
  return smoothed_threshold, new_mask

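# Illustrative sketch (not part of the library): the core of _update_mask is a
# magnitude threshold picked so that roughly `sparsity` of the weights fall
# below it, then smoothed with an exponential moving average. A standalone
# demonstration with plain TensorFlow (assumes eager execution and
# sparsity < 1 so that k >= 1):
import tensorflow as tf

weights = tf.constant([[0.1, -0.8], [0.5, -0.05], [0.9, 0.3]])
sparsity = 0.5  # Fraction of weights to prune.
threshold_decay = 0.9
old_threshold = tf.constant(0.0)

abs_weights = tf.abs(weights)
k = tf.cast(
    tf.round(tf.cast(tf.size(abs_weights), tf.float32) * (1 - sparsity)),
    tf.int32)
# Sort all magnitudes; the k-th largest (index k-1) is the cutoff below which
# ~sparsity of the weights fall.
values, _ = tf.nn.top_k(tf.reshape(abs_weights, [-1]), k=tf.size(abs_weights))
current_threshold = tf.gather(values, k - 1)
smoothed_threshold = (current_threshold * (1 - threshold_decay) +
                      old_threshold * threshold_decay)
new_mask = tf.cast(tf.greater_equal(abs_weights, smoothed_threshold),
                   tf.float32)  # 1.0 for kept weights, 0.0 for pruned ones.
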
def _FPropLm(self, theta, state0, ids, paddings, misc=None):
  """LM FProp. Works for single step or entire seq.

  Args:
    theta: A NestedMap object containing weights for the layer and its
      children.
    state0: A NestedMap of states (specific to the layer).
    ids: Target ids, of shape [batch_size] for single step unrolling or
      [seq_len, batch_size] for the entire sequence.
    paddings: Target paddings, of the same shape as 'ids'.
    misc: NestedMap of miscellaneous items, which might be needed during
      training.

  Returns:
    (lm_output, state1):

    - lm_output: A NestedMap containing lm output. If 'ids' is 1-D, then
      lm_output should have shape [batch_size, dim]; if it is 2-D then the
      shape should be [seq_len, batch_size, dim].
    - state1: A NestedMap of updated states.
  """
  state1 = state0.DeepCopy()
  if isinstance(ids.shape, tf.TensorShape):
    is_single_step = (ids.shape.rank == 1)
  else:
    is_single_step = len(ids.shape) == 1
  if is_single_step:
    seq_len = 1
  else:
    seq_len = tf.shape(ids)[0]

  self._ModifyLmBeforeFProp(theta, state0, ids, paddings, misc)

  with tf.name_scope('lm'):
    ids = tf.reshape(ids, [seq_len, -1], name='reshape_ids')
    paddings = tf.reshape(paddings, [seq_len, -1], name='reshape_paddings')
    lm_output, state1.lm_states = self.lm.FProp(theta.lm, ids, paddings,
                                                state0.lm_states)

  if is_single_step:
    # lm outputs have dimension [time, batch, dim]. Since this is only one
    # step, remove time dimension.
    lm_output = lm_output.Transform(lambda v: tf.squeeze(v, axis=0))

  return lm_output, state1

def Inference(self):
  """Computes y = w^T x + b. Returns y and x, as outputs and inputs."""
  # Add a dummy file def to the collection.
  filename = tf.convert_to_tensor(
      'dummy.txt', tf.dtypes.string, name='asset_filepath')
  tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.ASSET_FILEPATHS,
                                 filename)

  with tf.name_scope('inference'):
    x = tf.placeholder(dtype=tf.float32, name='input')
    r = tf.random.stateless_uniform([3],
                                    seed=py_utils.GenerateStepSeedPair(
                                        self.params, self.theta.global_step))
    y = tf.reduce_sum((self.vars.w + r) * x) + self.vars.b
    return {'default': ({'output': y}, {'input': x})}

def factorized_pool(input_tensor,
                    window_shape,
                    pooling_type,
                    strides,
                    padding,
                    name=None):
  """Performs m x n pooling through a combination of 1xm and 1xn pooling.

  Args:
    input_tensor: Input tensor. Must be rank 2.
    window_shape: Pooling window shape.
    pooling_type: Either 'MAX' or 'AVG'.
    strides: The stride of the pooling window.
    padding: 'SAME' or 'VALID'.
    name: Name of the op.

  Returns:
    A rank 2 tensor containing the pooled output.

  Raises:
    ValueError: if the input tensor is not rank 2.
  """
  if input_tensor.get_shape().ndims != 2:
    raise ValueError('factorized_pool() accepts tensors of rank 2 only')

  [height, width] = input_tensor.get_shape()
  with tf.name_scope(name, 'factorized_pool'):
    input_tensor_aligned = tf.reshape(
        input_tensor, [1, 1, height, width],
        name=input_tensor.op.name + '_aligned')

    height_pooling = tf.nn.pool(
        input_tensor_aligned,
        window_shape=[1, window_shape[0]],
        pooling_type=pooling_type,
        strides=[1, strides[0]],
        padding=padding)
    swap_height_width = tf.transpose(height_pooling, perm=[0, 1, 3, 2])

    width_pooling = tf.nn.pool(
        swap_height_width,
        window_shape=[1, window_shape[1]],
        pooling_type=pooling_type,
        strides=[1, strides[1]],
        padding=padding)

  return tf.squeeze(
      tf.transpose(width_pooling, perm=[0, 1, 3, 2]), axis=[0, 1])

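# Illustrative sketch (not part of the library): the factorization above relies
# on pooling being separable, e.g. for MAX the max over an m x n block equals a
# max along one axis followed by a max along the other. Standalone check with
# plain TF (NHWC layout, eager execution assumed):
import tensorflow as tf

x = tf.random.uniform([1, 4, 6, 1])
direct = tf.nn.pool(
    x, window_shape=[2, 2], pooling_type='MAX', strides=[2, 2],
    padding='VALID')
step1 = tf.nn.pool(
    x, window_shape=[2, 1], pooling_type='MAX', strides=[2, 1],
    padding='VALID')
step2 = tf.nn.pool(
    step1, window_shape=[1, 2], pooling_type='MAX', strides=[1, 2],
    padding='VALID')
tf.debugging.assert_near(direct, step2)  # Factorized == direct here.
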
@contextlib.contextmanager
def _SelfVariableScope(self, params=None, enter_name_scope=True):
  """Internal. Used to ensure the same variable & name scopes are used."""
  if not hasattr(self, '_self_variable_scope'):
    params = params or self.params
    self._parent_variable_scope = tf.get_variable_scope()
    with tf.variable_scope(py_utils.SanitizeScopeKey(params.name)) as scope:
      self._self_variable_scope = scope
  with contextlib.ExitStack() as stack:
    stack.enter_context(
        tf.variable_scope(
            self._self_variable_scope, auxiliary_name_scope=False))
    if enter_name_scope:
      stack.enter_context(
          tf.name_scope(self._self_variable_scope.original_name_scope))
    yield stack

def Step(recurrent_theta, state0, inputs):
  """Computes one decoder step."""
  del inputs
  with tf.name_scope('single_sampler_step'):
    # Compute logits and states.
    bs_result, bs_state1 = pre_step_callback(
        decoder_theta,
        recurrent_theta.encoder_outputs,
        tf.expand_dims(state0.ids, 1),  # [batch, 1].
        state0.bs_state,
        num_hyps_per_beam=1)
    batch = tf.shape(bs_result.log_probs)[0]
    state1 = py_utils.NestedMap(timestep=state0.timestep + 1)
    state1.logits = bs_result.log_probs

    if p.top_k > 0:
      topk_logits, topk_ids = tf.math.top_k(state1.logits, k=p.top_k)
      sample_logits = tf.nn.log_softmax(
          topk_logits) if p.top_k_renormalize else topk_logits
    else:
      sample_logits = state1.logits

    # Sample ids from logits. [batch].
    ids = tf.reshape(
        tf.random.stateless_categorical(
            sample_logits / p.temperature,
            num_samples=1,
            seed=tf.stack([recurrent_theta.random_seed, state0.timestep]),
            dtype=state0.ids.dtype,
            name='sample_next_id'), [batch])
    state1.ids = tf.gather(
        topk_ids, ids, axis=1, batch_dims=1) if p.top_k > 0 else ids

    if 'is_last_chunk' in bs_result and p.target_eoc_id >= 0:
      state1.ids = tf.where(
          tf.math.logical_and(
              bs_result.is_last_chunk,
              tf.equal(state1.ids, p.target_eoc_id)),
          tf.fill(tf.shape(state1.ids), p.target_eos_id), state1.ids)
    state1.bs_state = post_step_callback(decoder_theta,
                                         recurrent_theta.encoder_outputs,
                                         state1.ids, bs_state1)
  return state1, py_utils.NestedMap()

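# Illustrative sketch (not part of the library): the sampling trick above in
# isolation. top_k prunes the vocab, stateless_categorical samples a slot, and
# gather with batch_dims=1 maps the slot back to a vocab id:
import tensorflow as tf

logits = tf.math.log([[0.1, 0.2, 0.3, 0.4], [0.7, 0.1, 0.1, 0.1]])
top_k, temperature = 2, 1.0
topk_logits, topk_ids = tf.math.top_k(logits, k=top_k)
slot = tf.random.stateless_categorical(
    topk_logits / temperature, num_samples=1, seed=[1, 42])
sampled_ids = tf.gather(topk_ids, tf.squeeze(slot, 1), axis=1, batch_dims=1)
# sampled_ids is [batch]; each entry is one of its row's top-2 vocab ids.
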
def FProp(self,
          theta,
          source_vecs,
          source_paddings,
          target_vecs,
          target_paddings,
          source_segment_id,
          target_segment_id,
          transparent_acc,
          transparent_acc_helper,
          source_task_id=None,
          target_task_id=None):
  with tf.name_scope(self.params.name):
    return _common_gpipe_transformer_encoder_fprop(
        self, GPipeEvolvedTransformerEncoderLayer, theta, source_vecs,
        source_paddings, target_vecs, target_paddings, source_segment_id,
        target_segment_id, None, None, source_task_id, target_task_id)

def TraverseLayer(layer, fn):
  """Traverses the layer tree and invokes fn(node) on each node.

  Args:
    layer: a BaseLayer.
    fn: a function of (layer) -> None, invoked on each layer node.
  """
  if isinstance(layer, (list, tuple)):
    for layer_i in layer:
      TraverseLayer(layer_i, fn)
    return

  with tf.name_scope(layer.params.name):
    fn(layer)
    # Traverse all children in alphabetical order.
    for _, child in sorted(layer.children.items()):
      TraverseLayer(child, fn)

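# Illustrative usage sketch (assumes `model` is an instantiated lingvo
# BaseLayer; the variable name is hypothetical): collect the name of every
# layer in the tree.
layer_names = []
TraverseLayer(model, lambda layer: layer_names.append(layer.params.name))
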
def Inference(self):
  """Builds the inference graph.

  Default subgraph should return:
    predicted_bboxes: A [batch_size, num_boxes, 7] float Tensor.
    classification_scores: A [batch_size, num_boxes, num_classes] float
      Tensor.

  Returns:
    A dictionary whose values are a tuple of fetches and feeds.
  """
  p = self.params
  subgraphs = {}
  with tf.name_scope('inference'):
    input_placeholders = self._Placeholders()
    predictions = self.ComputePredictions(self.theta, input_placeholders)
    bboxes_and_logits = self._BBoxesAndLogits(input_placeholders, predictions)
    predicted_bboxes = bboxes_and_logits.predicted_bboxes
    classification_logits = bboxes_and_logits.classification_logits
    classification_scores = tf.sigmoid(classification_logits)

    _, per_cls_bboxes, per_cls_bbox_scores, per_cls_valid_mask = (
        detection_decoder.DecodeWithNMS(
            predicted_bboxes,
            classification_scores,
            nms_iou_threshold=p.nms_iou_threshold,
            score_threshold=p.nms_score_threshold,
            max_boxes_per_class=p.max_nms_boxes,
            use_oriented_per_class_nms=p.use_oriented_per_class_nms))
    per_cls_bbox_scores *= per_cls_valid_mask

    # TODO(vrv): Fix the inference graph for KITTI, since we need
    # to apply frustum clipping. This requires customizing the
    # inference placeholders for each model.
    fetches = {
        'per_class_predicted_bboxes': per_cls_bboxes,
        'per_class_predicted_bbox_scores': per_cls_bbox_scores,
        'per_class_valid_mask': per_cls_valid_mask
    }
    subgraphs['default'] = fetches, dict(input_placeholders.FlattenItems())
  return subgraphs

def ZeroState(self, theta, prepared_inputs, batch_size):
  """Creates a zero state NestedMap for this step.

  Args:
    theta: variables used by sub-steps.
    prepared_inputs: Output from a call to PrepareExternalInputs.
    batch_size: The number of items in the batch that FProp will process.

  Returns:
    A NestedMap of ZeroState results for each sub-step.
  """
  state0 = py_utils.NestedMap()
  with tf.name_scope(self.params.name):
    for seq in self._seq:
      state0[seq.name] = seq.step.ZeroState(theta[seq.name],
                                            prepared_inputs[seq.name],
                                            batch_size)
  return state0

def StreamStep(self, theta, inputs, paddings, state0):
  """Runs single step.

  Args:
    theta: A NestedMap of layer params.
    inputs: [b, 1, d].
    paddings: A 0/1 valued tensor of shape [b, 1].
    state0: A NestedMap of tensors of the same struct as returned by
      zero_state().

  Returns:
    outputs: A Tensor of shape [b, 1, d].
    padding: the same as input paddings.
    state1: A NestedMap of tensors of the same struct as state0.
  """
  p = self.params
  assert p.is_causal

  state1 = py_utils.NestedMap()
  with tf.name_scope(f'{p.name}/StreamStep'):
    unnormalized_inputs = inputs

    inputs = self.ln.FProp(theta.ln, inputs)
    inputs = self.linear_start.FProp(theta.linear_start, inputs)
    inputs = self._GLU(inputs)

    # TODO(jamesqin): introduce depthwise conv2d with 3d inputs.
    # TODO(jamesqin): optimize DepthwiseConv1D.StreamStep()
    # [b, t, d] --> [b, t, 1, d]
    inputs = tf.expand_dims(inputs, 2)
    # [b, t, 1, d]
    inputs, paddings, conv_state1 = self.depthwise_conv1d.StreamStep(
        theta.depthwise_conv1d, inputs, paddings, state0.conv_state)
    state1.conv_state = conv_state1
    # [b, t, d]
    inputs = self._NormalizeStep(theta, inputs, paddings, state0, state1)

    inputs = self._ApplyActivation(inputs, p.conv_activation)

    inputs = self.linear_end.FProp(theta.linear_end, inputs)
    inputs = self.dropout.FProp(theta.dropout, inputs)

    output = inputs + unnormalized_inputs
  return output, paddings, state1

def FProp(self, theta, input_batch, state0=None):
  p = self.params
  src_segment_id = None
  with tf.name_scope(p.name):
    # Reshape to [t, b]
    inputs = py_utils.with_dependencies([
        py_utils.assert_shape_match(tf.shape(input_batch.ids), [-1, -1]),
        py_utils.assert_shape_match(
            tf.shape(input_batch.ids), tf.shape(input_batch.paddings))
    ], tf.transpose(input_batch.ids))
    paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2)

    # Setup streaming states.
    if not state0:
      state0 = self.zero_state(theta, tf.shape(inputs)[1])
    state1 = py_utils.NestedMap(rnn=[None] * p.num_lstm_layers)

    xs = self.emb.EmbLookup(theta.emb, inputs)
    xs = self.ApplyClipping(theta, xs)
    summary_utils.histogram('input_emb', xs)
    xs = self.dropout.FProp(theta.dropout, xs)
    ps = paddings
    # Now the rnn layers.
    outputs_list = []
    for i in range(0, p.num_lstm_layers):
      layer = self.rnn[i]
      ys, state1.rnn[i] = layer.FProp(
          theta.rnn[i], xs, ps, state0=state0.rnn[i])
      ys = self.dropout.FProp(theta.dropout, ys)
      if i >= p.residual_start:
        xs += ys  # Residual skip
        xs = self.ApplyClipping(theta, xs)
      else:
        xs = ys
      outputs_list.append(xs)
      summary_utils.histogram('layer_out_%s' % i, xs)

    if p.is_transparent:
      xs = self.transparent_merger.FProp(theta.transparent_merger,
                                         outputs_list)

    return py_utils.NestedMap(
        encoded=xs,
        padding=tf.squeeze(ps, [2]),
        segment_id=src_segment_id,
        state=state1)

def FProp(self, theta, source_vecs, source_paddings, target_vecs,
          target_paddings, source_segment_id, target_segment_id, labels,
          label_weights, transparent_acc, transparent_acc_helper):
  p = self.params
  with tf.name_scope(p.name):
    if p.has_aux_atten:  # Decoder FProp
      return _common_gpipe_transformer_decoder_fprop(
          self, GPipeTransformerLayer, theta, source_vecs, source_paddings,
          target_vecs, target_paddings, source_segment_id, target_segment_id,
          labels, label_weights, transparent_acc, transparent_acc_helper)
    else:  # Encoder FProp
      return _common_gpipe_transformer_encoder_fprop(
          self, GPipeTransformerLayer, theta, source_vecs, source_paddings,
          target_vecs, target_paddings, source_segment_id, target_segment_id,
          labels, label_weights, transparent_acc, transparent_acc_helper)

def Inference(self):
  if py_utils.use_tpu():
    raise NotImplementedError('TPU is not supported.')
  with tf.name_scope('inference'):
    feed1 = tf.placeholder(name='feed1_node', dtype=tf.float32, shape=[1])
    fetch1 = tf.identity(feed1, name='fetch1_node')
    return {
        'default': (
            py_utils.NestedMap({
                'fetch1': fetch1,
                'fetch_op': fetch1.op,  # Tests that ops are supported.
            }),
            py_utils.NestedMap({
                'feed1': feed1,
            })),
        'unused': (py_utils.NestedMap({}), py_utils.NestedMap({})),
    }

def FProp(self, theta, inputs, *args):
  p = self.params
  with tf.name_scope(p.name) as scope:
    expert_dist = self._GetExpertDist(theta, inputs, *args)
    if not self.do_eval:
      summary_utils.histogram('soft_cond_{}'.format(scope), expert_dist)

    # Excludes non-variable extra_theta like global_step.
    var_set = set([key for key, _ in self.body.vars.FlattenItems()])
    values = []
    for key, value in theta.body.FlattenItems():
      if key in var_set and value is not None:
        # Weighted average for all variables created in the body layer.
        value = tf.einsum('i,i...->...', expert_dist, value)
      values.append(value)
    weighted_theta = theta.body.Pack(values)
    return self.body.FProp(weighted_theta, inputs, *args)

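# Illustrative sketch (not part of the library): the einsum above is a weighted
# average over a leading "experts" axis, regardless of the variable's remaining
# shape. Standalone demonstration:
import tensorflow as tf

expert_dist = tf.constant([0.25, 0.75])  # Mixing weights, [num_experts].
stacked_w = tf.random.normal([2, 3, 4])  # Per-expert variable, [num_experts, ...].
mixed_w = tf.einsum('i,i...->...', expert_dist, stacked_w)  # Shape [3, 4].
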
def _InferenceSubgraph_Default(self):
  """Constructs graph for offline inference.

  Returns:
    (fetches, feeds) where both fetches and feeds are dictionaries. Each
    dictionary consists of keys corresponding to tensor names, and values
    corresponding to a tensor in the graph which should be input/read from.
  """
  p = self.params
  with tf.name_scope('default'):
    # TODO(laurenzo): Once the migration to integrated frontends is complete,
    # this model should be upgraded to use the MelAsrFrontend in its params
    # vs relying on pre-computed feature generation and the inference special
    # casing.
    wav_bytes = tf.placeholder(dtype=tf.string, name='wav')
    frontend = self.frontend if p.frontend else None
    if not frontend:
      # No custom frontend. Instantiate the default.
      frontend_p = asr_frontend.MelAsrFrontend.Params()
      frontend = frontend_p.Instantiate()

    # Decode the wave bytes and use the explicit frontend.
    unused_sample_rate, audio = audio_lib.DecodeWav(wav_bytes)
    audio *= 32768
    # Remove channel dimension, since we have a single channel.
    audio = tf.squeeze(audio, axis=1)
    # Add batch.
    audio = tf.expand_dims(audio, axis=0)
    input_batch_src = py_utils.NestedMap(
        src_inputs=audio, paddings=tf.zeros_like(audio))
    input_batch_src = frontend.FPropDefaultTheta(input_batch_src)

    encoder_outputs = self.encoder.FPropDefaultTheta(input_batch_src)
    decoder_outputs = self.decoder.BeamSearchDecode(encoder_outputs)
    topk = self._GetTopK(decoder_outputs)

    feeds = {'wav': wav_bytes}
    fetches = {
        'hypotheses': topk.decoded,
        'scores': topk.scores,
        'src_frames': input_batch_src.src_inputs,
        'encoder_frames': encoder_outputs.encoded
    }
    return fetches, feeds

def FProp(self, theta, external_inputs, step_inputs, padding, state0):
  """A single inference step for this step graph.

  Args:
    theta: variables used by sub-steps.
    external_inputs: A NestedMap containing external_inputs that were
      pre-processed by the PrepareExternalInputs method of each sub-step. The
      keys are the names of the sub-steps.
    step_inputs: A NestedMap of [batch, ...] tensors. The structure of this
      depends on the graph implementation.
    padding: A 0/1 float tensor of shape [batch_size]; 1.0 means that this
      batch element is empty in this step.
    state0: A NestedMap of state variables produced by either ZeroState or a
      previous invocation of this FProp step. The keys are the names of the
      sub-steps.

  Returns:
    (output, state1), both of which are NestedMaps.
    output is implementation-dependent and is defined by the output_signature
    parameter. state1 is a NestedMap where the keys are names of sub-steps
    and the values are state outputs from their FProp methods.
  """
  p = self.params
  graph_tensors = builder_layers.GraphTensors()
  graph_tensors.StoreTensor('external_inputs', external_inputs)
  graph_tensors.StoreTensor('step_inputs', step_inputs)
  state1 = py_utils.NestedMap()
  with tf.name_scope(p.name):
    for seq in self._seq:
      tf.logging.vlog(1, 'GraphStep: call %s', seq.name)
      external = None
      if seq.external_signature:
        external = external_inputs[seq.name]
      template = py_utils.NestedMap(inputs=seq.signature.inputs)
      packed = template.Transform(graph_tensors.GetTensor)
      input_args = packed.inputs[0]
      out, seq_state1 = seq.step.FProp(theta[seq.name], external, input_args,
                                       padding, state0[seq.name])
      graph_tensors.StoreTensor(seq.signature.outputs[0], out)
      state1[seq.name] = seq_state1
  template = py_utils.NestedMap(inputs=self.output_signature.inputs)
  output_tensors = template.Transform(graph_tensors.GetTensor).inputs[0]
  return output_tensors, state1

def FProp(self, theta, inputs, paddings=None):
  """Apply batch normalization.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: The inputs tensor. Shaped [..., dim].
    paddings: The paddings tensor. Shaped [..., 1], with the same rank as the
      input tensor.

  Returns:
    Output after applying batch normalization, with the same shape as
    'inputs'.
  """
  p = self.params
  if paddings is None:
    paddings = self._GetDefaultPaddings(inputs)
  with tf.name_scope(p.name):
    norm_mean, norm_variance, beta, gamma = self.ComputeAndUpdateMoments(
        theta, inputs, paddings)
    with tf.control_dependencies([
        py_utils.assert_greater_equal(norm_variance,
                                      tf.zeros_like(norm_variance)),
        py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                    tf.shape(norm_mean)),
        py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                    tf.shape(norm_variance)),
    ]):
      if p.use_fused_batch_norm_for_eval and self.do_eval:
        bn_output, _, _ = nn.fused_batch_norm(
            inputs,
            gamma,
            beta,
            norm_mean,
            norm_variance,
            self._epsilon,
            is_training=False)
      else:
        bn_output = tf.nn.batch_normalization(inputs, norm_mean,
                                              norm_variance, beta, gamma,
                                              self._epsilon)
      if p.set_padded_output_to_zero:
        bn_output *= 1.0 - paddings
  return bn_output

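# Illustrative sketch (not part of the library): tf.nn.batch_normalization
# computes gamma * (x - mean) / sqrt(variance + epsilon) + beta. Standalone
# check against the explicit formula:
import tensorflow as tf

x = tf.random.normal([2, 3])
mean, variance = tf.nn.moments(x, axes=[0])
beta, gamma, epsilon = tf.zeros([3]), tf.ones([3]), 1e-6
bn = tf.nn.batch_normalization(x, mean, variance, beta, gamma, epsilon)
manual = gamma * (x - mean) * tf.math.rsqrt(variance + epsilon) + beta
tf.debugging.assert_near(bn, manual, atol=1e-5)
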
def StreamStep(self, theta, inputs, paddings, state0):
  """Applies a single step of convolution to the inputs.

  Only supports 1d causal convolution. Doesn't support dilation.

  Args:
    theta: A NestedMap of layer params.
    inputs: A Tensor of shape [b, t, 1, c].
    paddings: A 0/1 valued tensor of shape [b, t].
    state0: A NestedMap of tensors of the same struct as returned by
      zero_state().

  Returns:
    outputs: A Tensor of shape [b, t, 1, c * channel_multiplier].
    padding: the same as input paddings.
    state1: A NestedMap of the same struct as input state.
  """
  p = self.params
  assert p.filter_shape[1] == 1, (
      'StreamStep only supports 1d causal convolution.')
  assert p.filter_stride[0] == 1, ('StreamStep doesn\'t support striding')
  assert p.dilation_rate == (1, 1), ('StreamStep doesn\'t support dilation')

  with tf.name_scope(p.name):
    inputs = py_utils.HasShape(inputs, [-1, -1, 1, p.filter_shape[2]])
    paddings = py_utils.HasShape(paddings, py_utils.GetShape(inputs)[:2])
    q = py_utils.GetShape(paddings)[1]

    padded_inputs = py_utils.ApplyPadding(
        py_utils.AppendDims(paddings, 2), inputs)

    concat_inputs = tf.concat([state0.context, padded_inputs], axis=1)
    outputs = tf.nn.depthwise_conv2d(
        concat_inputs,
        self._GetWeight(theta),
        strides=(1, 1, 1, 1),
        dilations=(1, 1),
        data_format='NHWC',
        padding='VALID')
    if p.bias:
      outputs = tf.nn.bias_add(outputs, theta.b)
    new_context = concat_inputs[:, q:]
    return outputs, paddings, py_utils.NestedMap(context=new_context)

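# Illustrative sketch (not part of the library): why carrying the last
# (filter_size - 1) frames as state works. Streaming a causal depthwise conv
# one frame at a time reproduces the full left-padded convolution:
import tensorflow as tf

b, t, c, k = 2, 5, 3, 3  # batch, time steps, channels, filter width.
x = tf.random.normal([b, t, 1, c])  # [b, t, 1, c], as in StreamStep.
w = tf.random.normal([k, 1, c, 1])  # Depthwise filter, channel_multiplier=1.

# Full-sequence causal conv: pad k-1 zeros on the left, then VALID conv.
full = tf.nn.depthwise_conv2d(
    tf.pad(x, [[0, 0], [k - 1, 0], [0, 0], [0, 0]]), w,
    strides=[1, 1, 1, 1], padding='VALID')

# Streaming: start from a zero context of k-1 frames, emit one frame per step.
context = tf.zeros([b, k - 1, 1, c])
outs = []
for i in range(t):
  window = tf.concat([context, x[:, i:i + 1]], axis=1)  # [b, k, 1, c].
  outs.append(
      tf.nn.depthwise_conv2d(window, w, strides=[1, 1, 1, 1],
                             padding='VALID'))
  context = window[:, 1:]  # Keep the most recent k-1 frames.
tf.debugging.assert_near(full, tf.concat(outs, axis=1), atol=1e-5)
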
def _FProp(self, theta, inputs, paddings):
  p = self.params
  with tf.name_scope(p.name):
    inputs = self.fflayer_start.FProp(theta.fflayer_start, inputs, paddings)
    if p.layer_order == 'mhsa_before_conv':
      inputs, paddings = self._SelfAtten(theta, inputs, paddings)
      inputs, paddings = self._LConv(theta, inputs, paddings)
    else:
      assert p.layer_order == 'conv_before_mhsa'
      inputs, paddings = self._LConv(theta, inputs, paddings)
      inputs, paddings = self._SelfAtten(theta, inputs, paddings)
    inputs = self.fflayer_end.FProp(theta.fflayer_end, inputs, paddings)
    inputs = self.final_ln.FProp(theta.final_ln, inputs)
  return inputs, paddings

def FProp(self, theta, source_id, source_paddings, target_id, target_paddings,
          source_segment_id, target_segment_id, labels, label_weights,
          source_pos_id, target_pos_id):
  p = self.params
  with tf.name_scope(p.name):
    source_vecs = self.GetEmbeddings(theta.src_token_emb, self.src_token_emb,
                                     theta.src_pos_emb, self.src_pos_emb,
                                     theta.src_dropout, self.src_dropout,
                                     source_id, source_pos_id)
    target_vecs = None
    if p.add_tgt_embedding_layer:
      target_vecs = self.GetEmbeddings(theta.tgt_token_emb,
                                       self.tgt_token_emb, theta.tgt_pos_emb,
                                       self.tgt_pos_emb, theta.tgt_dropout,
                                       self.tgt_dropout, target_id,
                                       target_pos_id)
    return (source_vecs, source_paddings, target_vecs, target_paddings,
            source_segment_id, target_segment_id, labels, label_weights,
            None, None)

def Value(self):
  p = self.params
  with tf.name_scope(p.name):
    steps = self._best_step
    best_step = steps[0]
    last_step = steps[1]
    ref_step = tf.maximum(self.theta.ref_step, best_step)
    f = self.theta.cur_factor

    # Decay if no improvement within window.
    new_factor = tf.where(last_step - ref_step < p.window, f,
                          tf.maximum(p.min_factor, f * p.decay))
    # Update ref_step if we decayed.
    new_step = tf.where(tf.equal(new_factor, f), ref_step, last_step)
    update_step = tf.assign(self.vars.ref_step, new_step)
    with tf.control_dependencies([update_step]):
      return tf.assign(self.vars.cur_factor, new_factor)

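# Illustrative sketch (not part of the library): the plateau logic above on
# plain scalars. Once the best step is more than `window` steps old, the
# factor decays and the reference step jumps forward:
import tensorflow as tf

window, decay, min_factor = 100, 0.5, 0.01
best_step, last_step = tf.constant(200.0), tf.constant(350.0)
ref_step = tf.maximum(tf.constant(200.0), best_step)
factor = tf.constant(1.0)
new_factor = tf.where(last_step - ref_step < window, factor,
                      tf.maximum(min_factor, factor * decay))  # -> 0.5
new_ref_step = tf.where(tf.equal(new_factor, factor), ref_step,
                        last_step)  # -> 350.0, since we decayed.
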
def _unrolled_fprop(self, theta, *args):
  p = self.params
  fprop_inputs = args
  with tf.name_scope(p.name):
    for layer_idx in range(p.repeat):
      if p.per_layer_vars:
        layer_theta = theta['body_iter_%05d' % layer_idx]
      else:

        def _Slice(t, idx=layer_idx):
          return t[idx]

        layer_theta = tf.nest.map_structure(_Slice, theta.body)
      fprop_outputs = self._body.FProp(layer_theta, *fprop_inputs)
      fprop_outputs = _ToTuple(fprop_outputs)
      assert len(fprop_outputs) == len(fprop_inputs)
      fprop_inputs = fprop_outputs
  return fprop_outputs[0] if len(fprop_outputs) == 1 else fprop_outputs

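# Illustrative sketch (not part of the library): when the variables of all
# repeated layers are stacked on a leading axis, tf.nest.map_structure with a
# per-index slice recovers the theta of a single layer:
import tensorflow as tf

stacked_theta = {'w': tf.ones([4, 8, 8]), 'b': tf.zeros([4, 8])}  # repeat=4.
layer2_theta = tf.nest.map_structure(lambda t: t[2], stacked_theta)
# layer2_theta['w'] has shape [8, 8]; layer2_theta['b'] has shape [8].
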
def _check_paddings(self, paddings):
  with tf.name_scope('check_paddings'):
    unpacked_paddings = tf.unstack(paddings)
    non_decr = []
    for t in unpacked_paddings:
      non_d = tf.is_non_decreasing(t)
      non_decr.append(non_d)
    all_non_decr = tf.stack(non_decr)

    paddings = py_utils.with_dependencies([
        tf.assert_equal(
            tf.reduce_any(tf.equal(paddings, 0.0)),
            True,
            message='must have at least one zero value.'),
        tf.assert_equal(all_non_decr, True, message='must be non-decreasing')
    ], paddings)
    return paddings

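# Illustrative sketch (not part of the library): a valid padding row is all
# zeros followed by all ones, i.e. non-decreasing. Standalone check with
# plain TF:
import tensorflow as tf

good = tf.constant([0.0, 0.0, 1.0, 1.0])
bad = tf.constant([0.0, 1.0, 0.0, 1.0])
tf.math.is_non_decreasing(good)  # -> True
tf.math.is_non_decreasing(bad)  # -> False
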
def Inference(self):
  with tf.name_scope('inference'):
    feed1 = tf.placeholder(name='feed1_node', dtype=tf.float32, shape=[1])
    fetch1 = tf.identity(feed1, name='fetch1_node')
    feed2 = tf.placeholder(name='feed2_node', dtype=tf.float32, shape=[2])
    fetch2 = tf.identity(feed2, name='fetch2_node')
    inference_graph = inference_graph_pb2.InferenceGraph()
    subgraph = inference_graph.subgraphs['default']
    subgraph.feeds['feed1'] = feed1.name
    subgraph.fetches['fetch1'] = fetch1.name
    subgraph = inference_graph.subgraphs['subgraph2']
    subgraph.feeds['feed1'] = feed2.name
    subgraph.fetches['fetch1'] = fetch2.name
    return inference_graph

def Apply(self, lr, var_grad):
  """Applies the gradient to the variable.

  Args:
    lr: A scalar or callable that returns the base learning rate.
    var_grad: A `.NestedMap` of (var, grad) pairs.

  Returns:
    The variable update op.

  Raises:
    RuntimeError: When `lr` is not a callable in Eager mode.
  """
  # In Graph mode, always re-create the optimizer to remain consistent with
  # the old logic for the Graph trainer.
  # TODO(jiaweix): Recreating optimizers in Graph mode seems unnecessary.
  if self._optimizer is None or not py_utils.IsEagerMode():
    self._optimizer = self.GetOptimizer(lr)

  def _Apply():
    return self._optimizer.apply_gradients(
        [(g, v) for (v, g) in var_grad.Flatten()], name='meta_backprop')

  clear_variable_scope = self.params.clear_variable_scope
  if clear_variable_scope is None:
    clear_variable_scope = not py_utils.IsEagerMode()
  if clear_variable_scope:
    # Many optimizers, e.g., Adam, Adagrad, etc., create
    # variables. We need to ensure name scope and variable scope are
    # cleared. Otherwise, tpu.batch_parallel does not work.
    with tf.name_scope(None):
      with tf.variable_scope(
          tf.VariableScope(use_resource=True,
                           reuse=self.VarReuseForSlotVars())):
        var_update_op = _Apply()
  else:
    var_update_op = _Apply()

  if self.params.add_summary_in_apply:
    lr_value = GetLrValue(lr)
    self.AddSummary(lr_value, self._optimizer, var_grad)
  return var_update_op

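# Illustrative sketch (not part of the library): var_grad stores (var, grad)
# pairs while apply_gradients expects (grad, var), hence the flip above.
# Standalone demonstration with a plain TF1-style optimizer (v1 optimizers
# also apply eagerly):
import tensorflow as tf

v = tf.Variable(1.0)
var_grad = [(v, tf.constant(0.5))]  # (var, grad) ordering.
opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
update_op = opt.apply_gradients([(g, v) for (v, g) in var_grad])
# v is 1.0 - 0.1 * 0.5 = 0.95 once the update has run.
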
def FProp(self, theta, inputs, paddings):
  """Builds FProp graph.

  Args:
    theta: A NestedMap of Tensors, see base class.
    inputs: A Tensor of shape [batch, seqlen, dim0].
    paddings: A Tensor of shape [batch, seqlen].

  Returns:
    output: A Tensor of shape [batch, seqlen, dim0].
    out_paddings: A Tensor of shape [batch, seqlen].
  """
  p = self.params
  with tf.name_scope(p.name):
    unnormalized_inputs = inputs

    inputs = self.ln.FProp(theta.ln, inputs)
    if p.split_act_gated_linear_start:
      act_inputs = self.linear_start_act.FProp(theta.linear_start_act, inputs)
      gated_inputs = self.linear_start_gated.FProp(theta.linear_start_gated,
                                                   inputs)
    else:
      inputs = self.linear_start.FProp(theta.linear_start, inputs)
      gated_inputs, act_inputs = tf.split(inputs, 2, axis=-1)
    inputs = self._GLU(gated_inputs, act_inputs)

    # TODO(jamesqin): introduce depthwise conv2d with 3d inputs.
    # [b, t, d] --> [b, t, 1, d]
    inputs = tf.expand_dims(inputs, 2)
    theta.depthwise_conv1d.w = moe_layers.Split(theta.depthwise_conv1d.w, 2,
                                                p.xla_num_partitions)
    inputs, paddings = self.depthwise_conv1d.FProp(theta.depthwise_conv1d,
                                                   inputs, paddings)
    inputs = self._Normalize(theta, inputs, paddings)

    inputs = self._ApplyActivation(inputs, p.conv_activation)

    inputs = self.linear_end.FProp(theta.linear_end, inputs)
    inputs = self.dropout.FProp(theta.dropout, inputs)

    output = inputs + unnormalized_inputs
  return output, paddings

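# Illustrative sketch (not part of the library): the GLU used above gates one
# half of the projection with the sigmoid of the other half, assuming _GLU is
# sigmoid(gated) * act:
import tensorflow as tf

projected = tf.random.normal([2, 5, 8])  # [b, t, 2 * d].
gated_inputs, act_inputs = tf.split(projected, 2, axis=-1)
glu_out = tf.sigmoid(gated_inputs) * act_inputs  # [b, t, d].
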
def TpuEvalStep(*args):
  """Eval a shard of a batch on a single TPU core.

  Args:
    *args: metrics values from previous steps.

  Returns:
    Summed eval metrics.
  """
  with tf.name_scope('tpu_eval'):
    with py_utils.OpportunisticVariableReuseScope(True):
      self._model.InstantiateVariables()
      self._model.ConstructFPropGraph()
    per_step_eval_metrics = self._eval_metrics.SetMetrics(
        self._task.eval_metrics, args)
    summed_metrics = []
    for x, y in zip(per_step_eval_metrics, args):
      summed_metrics.append(x + y)
    return summed_metrics

def FProp(self, theta, *args):
  p = self.params
  with tf.name_scope(p.name):
    tf.logging.vlog(1, 'layer %s', self.params.name)
    if p.repeat <= 1:
      for (name, ch) in self._seq:
        th = theta[name]
        args = _ToTuple(args)
        tf.logging.vlog(1, 'SequentialLayer: call %s %s %d %s',
                        ch.params.name, ch, len(args), str(args))
        args = ch.FProp(th, *args)
    else:
      for (ch, th) in zip(self.rep, theta.rep):
        args = _ToTuple(args)
        tf.logging.vlog(1, '  call %s %s %d %s', ch.params.name, ch,
                        len(args), str(args))
        args = ch.FProp(th, *args)
  args = _ToTuple(args)
  return args[0] if len(args) == 1 else args