def testDecoderFPropWithMeanSeqLoss(self):
  """FProp with token-normalized per-sequence loss instead of per-token avg."""
  with self.session(use_gpu=False) as sess:
    tf.random.set_seed(8372749040)
    p = self._DecoderParams(vn_config=py_utils.VariationalNoiseParams(
        None, True, False, seed=12345))
    # Normalize each sequence's loss by its own token count, then average
    # over sequences, rather than averaging over all tokens in the batch.
    p.token_normalized_per_seq_loss = True
    p.per_token_avg_loss = False
    metrics, per_sequence_loss = self._getDecoderFPropMetrics(params=p)
    self.evaluate(tf.global_variables_initializer())
    metrics_val, per_sequence_loss_val = sess.run(
        [metrics, per_sequence_loss])
    tf.logging.info('metrics=%s, per_sequence_loss=%s', metrics_val,
                    per_sequence_loss_val)
    # Under sequence-normalized weighting, loss and log_pplx must diverge
    # (they coincide only for plain per-token averaging).
    self.assertNotEqual(metrics_val['loss'][0], metrics_val['log_pplx'][0])
    self.assertAllClose(metrics_val['loss'], (3.484608, 4.0))
    self.assertAllClose(metrics_val['log_pplx'], (3.496482, 15.0))
    # Target batch size is 4. Therefore, we should expect 4 here.
    self.assertEqual(per_sequence_loss_val.shape, (4,))
def Params(cls):
  """Configs for `MTEncoderUniRNN`."""
  p = super(MTEncoderUniRNN, cls).Params()
  p.Define('emb', layers.EmbeddingLayer.Params(), 'Embedding layer params.')
  p.Define('lstm_tpl', rnn_cell.LSTMCellSimple.Params(),
           'Configs template for the RNN layer.')
  p.Define('lstm_cell_size', 512, 'LSTM cell size for the RNN layer.')
  p.Define('num_lstm_layers', 8, 'Number of rnn layers to create')
  p.Define('dropout_prob', 0.0, 'Prob at which we do dropout.')
  p.Define('residual_start', 2,
           'Layer at which we start residual connections.')
  p.Define(
      'unidi_rnn_type', 'func', 'Options: func, native_cudnn. '
      'func: FRNN, native_cudnn: CuDNNLSTM.')
  p.Define('cc_schedule', None, 'Clipping cap schedule.')
  p.Define('is_transparent', False,
           'If set, outputs a merger of layer outputs.')
  p.Define(
      'transparent_merger_tpl',
      layers.WeightedSumLayer.Params().Set(add_weight_summaries=True),
      'Merger op for layer outputs.')

  # Shared defaults applied to the sub-layer templates below.
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Uniform(0.04)

  # Default config for the embedding.
  p.emb.vn = disable_vn
  p.emb.vocab_size = 32000
  p.emb.embedding_dim = 1024
  p.emb.max_num_shards = 16
  p.emb.params_init = default_params_init

  # Default config for the RNN cell template.
  p.lstm_tpl.vn = disable_vn
  p.lstm_tpl.params_init = default_params_init
  return p
def Params(cls):
  """Configs for `MTEncoderV1`."""
  p = super(MTEncoderV1, cls).Params()
  p.Define('emb', layers.EmbeddingLayer.Params(), 'Embedding layer params.')
  p.Define('lstm_tpl', rnn_cell.LSTMCellSimple.Params(),
           'Configs template for the RNN layer.')
  p.Define('lstm_tpl_uni', None,
           'Override configs template for the unidirectional RNN layers.')
  p.Define('lstm_tpl_bidi', None,
           'Override configs template for the bidirectional RNN layer.')
  p.Define('lstm_cell_size', 1024, 'LSTM cell size for the RNN layer.')
  p.Define('num_lstm_layers', 8, 'Number of rnn layers to create')
  p.Define('dropout_prob', 0.0, 'Prob at which we do dropout.')
  p.Define('unidi_rnn_type', 'func', 'Options: func. ' 'func: FRNN.')
  p.Define('bidi_rnn_type', 'func', 'Options: func. '
           'func: BidirectionalFRNN. ')
  p.Define('cc_schedule', None, 'Clipping cap schedule.')
  p.Define(
      'packed_input', False, 'If True, encoder and all layers support '
      'multiple examples in a single sequence.')

  # Shared defaults applied to the sub-layer templates below.
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Uniform(0.04)

  # Default config for the embedding.
  p.emb.vn = disable_vn
  p.emb.vocab_size = 32000
  p.emb.embedding_dim = 1024
  p.emb.max_num_shards = 16
  p.emb.params_init = default_params_init

  # Apply shared defaults to whichever RNN templates are set; the uni/bidi
  # overrides default to None and are skipped unless a caller fills them in.
  for tpl in [p.lstm_tpl, p.lstm_tpl_uni, p.lstm_tpl_bidi]:
    if tpl is not None:
      tpl.vn = disable_vn
      tpl.params_init = default_params_init
  return p
def testDecoderFPropDeterministicAttentionDropout(self):
  """Verify that attention dropout is deterministic given fixed seeds."""
  with self.session(use_gpu=False) as sess:
    tf.set_random_seed(8372749040)
    p = self._DecoderParams(
        py_utils.VariationalNoiseParams(None, True, False, seed=1792))
    p.use_while_loop_based_unrolling = False
    p.attention.atten_dropout_prob = 0.5
    # Deterministic dropout derives its mask from (seed, global_step), so the
    # loss below must reproduce the golden values exactly.
    p.attention.atten_dropout_deterministic = True
    loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
    global_step = py_utils.GetGlobalStep()
    tf.global_variables_initializer().run()
    loss_val, per_sequence_loss_val, global_steps_val = sess.run(
        [loss, per_sequence_loss, global_step])
    print('loss = ', loss_val, 'per sequence loss = ', per_sequence_loss_val)
    self.assertAllClose([3.587372, 15.0], loss_val)
    self.assertAllClose([14.171288, 9.965696, 10.221684, 19.451914],
                        per_sequence_loss_val)
    self.assertEqual(0, global_steps_val)

    # Run another step to test global_step and time_step are incremented
    # correctly.
    sess.run(tf.assign_add(global_step, 1))
    loss_val, per_sequence_loss_val, global_steps_val = sess.run(
        [loss, per_sequence_loss, global_step])
    print('loss = ', loss_val, 'per sequence loss = ', per_sequence_loss_val)
    # Different step -> different dropout mask -> different (but still
    # reproducible) golden values.
    self.assertAllClose([3.626164, 15.0], loss_val)
    self.assertAllClose([14.70993, 10.572938, 10.516836, 18.592758],
                        per_sequence_loss_val)
    self.assertEqual(1, global_steps_val)
def testDecoderFPropWithProjection(self):
  """Create decoder with projection layers, and verify that FProp runs."""
  with self.session(use_gpu=False) as sess:
    tf.set_random_seed(8372749040)
    params = self._DecoderParams(
        vn_config=py_utils.VariationalNoiseParams(
            None, True, False, seed=12345))
    # Give every RNN layer distinct output/hidden widths so that the
    # projection path is exercised.
    base_cell = params.rnn_cell_tpl
    per_layer_cells = []
    for layer_idx in range(params.rnn_layers):
      per_layer_cells.append(base_cell.Copy().Set(
          num_output_nodes=layer_idx + 2, num_hidden_nodes=layer_idx + 5))
    params.rnn_cell_tpl = per_layer_cells
    # -1 signals that per-layer dims come from the templates above.
    params.rnn_cell_dim = -1
    params.rnn_cell_hidden_dim = -1
    loss, per_sequence_loss = self._testDecoderFPropHelper(params=params)
    tf.global_variables_initializer().run()
    loss_val, per_sequence_loss_val = sess.run([loss, per_sequence_loss])
    print('loss = ', loss_val, 'per sequence loss = ', per_sequence_loss_val)
    # Target batch size is 4. Therefore, we should expect 4 here.
    self.assertEqual(per_sequence_loss_val.shape, (4,))
def _testComputePredictionsHelper(self,
                                  use_while_loop_based_unrolling=False,
                                  confidence_module=False):
  """Create decoder and confidence prediction, and verify that FProp runs."""
  with self.session():
    p = _DecoderParams(
        vn_config=py_utils.VariationalNoiseParams(
            None, True, False, seed=12345))
    p.use_while_loop_based_unrolling = use_while_loop_based_unrolling
    if confidence_module:
      # Attach a small two-layer confidence head on top of the decoder.
      p.confidence = lingvo_layers.FeedForwardNet.Params()
      p.confidence.hidden_layer_dims = [8, 1]
      p.confidence.activation = ['RELU', 'NONE']
    dec = p.Instantiate()
    encoder_outputs, targets = _CreateSourceAndTargets(p)
    predictions = dec.ComputePredictions(dec.theta, encoder_outputs, targets)
    self.evaluate(tf.global_variables_initializer())
    predictions_val = self.evaluate(predictions)
    # logits are [batch, time, num_classes]; softmax_input is time-major.
    self.assertAllEqual(predictions_val['logits'].shape, [4, 5, 32])
    self.assertAllEqual(predictions_val['softmax_input'].shape, [5, 4, 12])
    if p.confidence is not None:
      # One confidence score per (sequence, step).
      self.assertAllEqual(predictions_val['confidence_scores'].shape, [4, 5])
def testDecoderFPropDeterministicAttentionDropout(self):
  """Verify that attention dropout is deterministic given fixed seeds."""
  with self.session(use_gpu=False):
    tf.random.set_seed(8372749040)
    p = _DecoderParams(
        py_utils.VariationalNoiseParams(None, True, False, seed=1792))
    p.use_while_loop_based_unrolling = False
    p.attention.atten_dropout_prob = 0.5
    # Deterministic dropout derives its mask from (seed, global_step), so
    # the loss below must reproduce the golden values exactly.
    p.attention.atten_dropout_deterministic = True
    loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
    global_step = py_utils.GetGlobalStep()
    self.evaluate(tf.global_variables_initializer())
    loss_val, per_sequence_loss_val, global_steps_val = self.evaluate(
        [loss, per_sequence_loss, global_step])
    print('loss = ', loss_val, 'per sequence loss = ', per_sequence_loss_val)
    self.assertAllClose([3.332992, 15.0], loss_val)
    self.assertAllClose([13.942583, 9.632538, 9.677502, 16.742266],
                        per_sequence_loss_val)
    self.assertEqual(0, global_steps_val)

    # Run another step to test global_step and time_step are incremented
    # correctly.
    self.evaluate(tf.assign_add(global_step, 1))
    loss_val, per_sequence_loss_val, global_steps_val = self.evaluate(
        [loss, per_sequence_loss, global_step])
    print('loss = ', loss_val, 'per sequence loss = ', per_sequence_loss_val)
    # Different step -> different dropout mask -> different (but still
    # reproducible) golden values.
    self.assertAllClose([3.565631, 15.0], loss_val)
    self.assertAllClose([14.560061, 10.566417, 10.554007, 17.803982],
                        per_sequence_loss_val)
    self.assertEqual(1, global_steps_val)
def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
  """FProps the decoder in float64 and checks gradients numerically."""
  config = tf.ConfigProto(graph_options=tf.GraphOptions(
      optimizer_options=tf.OptimizerOptions(
          do_function_inlining=func_inline)))
  with self.session(graph=tf.Graph(), use_gpu=False, config=config) as sess:
    tf.set_random_seed(8372749040)
    np.random.seed(274854)
    vn_config = py_utils.VariationalNoiseParams(None, False, False)
    p = self._DecoderParams(vn_config)
    # float64 keeps numeric-gradient error small enough to compare against
    # the symbolic gradients.
    p.dtype = tf.float64
    dec = p.cls(p)
    src_seq_len = 5
    src_enc = tf.constant(np.random.uniform(size=(src_seq_len, 2, 8)),
                          tf.float64)
    src_enc_padding = tf.constant(
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
        dtype=tf.float64)
    encoder_outputs = py_utils.NestedMap(encoded=src_enc,
                                         padding=src_enc_padding)
    # Targets are time-major: [time=5, batch=4] after the transpose.
    target_ids = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                     [5, 6, 7, 8], [10, 5, 2, 5]], dtype=tf.int32))
    target_labels = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                     [5, 7, 8, 10], [10, 5, 2, 4]], dtype=tf.int32))
    target_paddings = tf.transpose(
        tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
                     [1, 1, 1, 1]], dtype=tf.float64))
    target_transcripts = tf.constant(
        ['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
    target_weights = 1.0 - target_paddings
    targets = py_utils.NestedMap({
        'ids': target_ids,
        'labels': target_labels,
        'weights': target_weights,
        'paddings': target_paddings,
        'transcripts': target_transcripts,
    })
    metrics = dec.FPropDefaultTheta(encoder_outputs, targets)
    loss = metrics['loss'][0]
    all_vars = tf.all_variables()
    grads = tf.gradients(loss, all_vars)

    def DenseGrad(var, grad):
      # Densify IndexedSlices so gradients can be compared entry-wise
      # against the numeric gradients. Returns None for None grads.
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(var)[0])

    dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]
    tf.global_variables_initializer().run()
    test_utils.CompareToGoldenSingleFloat(self, 3.493656, loss.eval())
    # Second run to make sure the function is deterministic.
    test_utils.CompareToGoldenSingleFloat(self, 3.493656, loss.eval())
    symbolic_grads = [x.eval() for x in dense_grads if x is not None]
    numerical_grads = []
    for v in all_vars:
      numerical_grads.append(
          test_utils.ComputeNumericGradient(sess, loss, v))
    for x, y in zip(symbolic_grads, numerical_grads):
      self.assertAllClose(x, y)
def testDecoderConstruction(self):
  """Test that decoder can be constructed from params."""
  vn_config = py_utils.VariationalNoiseParams(None, True, False)
  params = self._DecoderParams(vn_config=vn_config)
  # Construction itself is the assertion: it must not raise.
  decoder.AsrDecoder(params)
def testDecoderSampleTargetSequences(self):
  """Sampling is reproducible with a fixed seed and differs with a new one."""
  p = self._DecoderParams(
      vn_config=py_utils.VariationalNoiseParams(None, False, False),
      num_classes=8)
  p.target_seq_len = 5
  p.random_seed = 1
  config = tf.ConfigProto(graph_options=tf.GraphOptions(
      optimizer_options=tf.OptimizerOptions(do_function_inlining=False)))
  with self.session(use_gpu=False, config=config) as sess:
    tf.set_random_seed(8372740)
    np.random.seed(35315)
    dec = p.Instantiate()
    source_sequence_length = 5
    batch_size = 4
    source_encodings = tf.constant(
        np.random.normal(
            size=[source_sequence_length, batch_size, p.source_dim]),
        dtype=tf.float32)
    # Padding is [time, batch]; later columns are progressively shorter.
    source_encoding_padding = tf.constant(
        [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 1.0],
         [0.0, 1.0, 1.0, 1.0], [0.0, 1.0, 1.0, 1.0]],
        dtype=tf.float32)
    encoder_outputs = py_utils.NestedMap(
        encoded=source_encodings, padding=source_encoding_padding)
    sampled_sequences = dec.SampleTargetSequences(
        dec.theta, encoder_outputs, random_seed=tf.cast(123, tf.int32))
    self.assertAllEqual([batch_size, p.target_seq_len],
                        sampled_sequences.ids.shape)
    tf.global_variables_initializer().run()
    decoder_output = sess.run(sampled_sequences)
    print('ids=%s' % np.array_repr(decoder_output.ids))
    lens = np.sum(1 - decoder_output.paddings, axis=1)
    print('lens=%s' % lens)
    # pyformat: disable
    # pylint: disable=bad-whitespace,bad-continuation
    expected_ids = [[6, 2, 2, 2, 2],
                    [0, 0, 7, 5, 1],
                    [6, 1, 5, 1, 5],
                    [6, 7, 7, 4, 4]]
    # pylint: enable=bad-whitespace,bad-continuation
    # pyformat: enable
    expected_lens = [2, 5, 5, 5]
    self.assertAllEqual(expected_lens, lens)
    self.assertAllEqual(expected_ids, decoder_output.ids)

    # Sample again with the same random seed.
    decoder_output2 = sess.run(
        dec.SampleTargetSequences(
            dec.theta, encoder_outputs, random_seed=tf.cast(123, tf.int32)))
    # Get the same output.
    self.assertAllEqual(decoder_output.ids, decoder_output2.ids)
    self.assertAllEqual(decoder_output.paddings, decoder_output2.paddings)

    # Sample again with a different random seed.
    decoder_output3 = sess.run(
        dec.SampleTargetSequences(
            dec.theta, encoder_outputs,
            random_seed=tf.cast(123456, tf.int32)))
    # Get different sequences.
    self.assertNotAllClose(expected_ids, decoder_output3.ids)
def SetupXEnDecTransformerParams(p,
                                 name,
                                 vocab_size,
                                 model_dim,
                                 hidden_dim,
                                 num_heads,
                                 num_layers,
                                 learning_rate,
                                 warmup_steps,
                                 *,
                                 residual_dropout_prob=0.1,
                                 input_dropout_prob=0.0,
                                 atten_dropout_prob=0.0,
                                 relu_dropout_prob=0.0,
                                 label_smoothing_uncertainty=0.1,
                                 activation='RELU',
                                 add_unnormalized_residuals=True,
                                 atten_hidden_dim=0,
                                 use_dim_scale=False,
                                 num_shard=1):
  """Common model setup for different transformer models.

  Args:
    p: The initial params.
    name: An identifier for an instance of a transformer model.
    vocab_size: an integer representing the size of the vocabulary, probably
      16000 or 32000.
    model_dim: dimension of the transformer block (column)
    hidden_dim: dimension of Feed-Forward neural network in each layer
    num_heads: number of attention heads to use for the transformer
    num_layers: number of layers in the transformer
    learning_rate: learning rate for Adam. For the base model, we use 1.0; for
      the big model, 3.0
    warmup_steps: warmup steps for TransformerSchedule. For the base model, we
      use 4000; for the big model, 40000
    residual_dropout_prob: dropout prob to the output of each sub-layer before
      it is added to the sub-layer input
    input_dropout_prob: dropout prob to the sums of the token embeddings and
      the position embeddings
    atten_dropout_prob: dropout prob to the attention weights in each
      Transformer attention sub-layer
    relu_dropout_prob: dropout prob to the inner layer output (ReLU
      activation) in each Transformer feed-forward sub-layer
    label_smoothing_uncertainty: if this value is 0, no label smoothing will
      be applied
    activation: Non-linearity for feed-forward layers.
    add_unnormalized_residuals: If set, uses un-normalized residuals in
      TransformerAttentionLayer
    atten_hidden_dim: Explicitly set attention hidden dim.
    use_dim_scale: Whether to enable dim_scale.
    num_shard: The number of shards for embedding matrices.

  Returns:
    A Params object containing the parameters that specify a transformer
    model (Vaswani 2017)
  """
  p.name = name
  # Shared initializers / defaults reused by every sub-layer below.
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Xavier(1.0)
  attention_params_init = py_utils.WeightInit.Xavier(1.0 * (2**-0.5))
  emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

  p.encoder = encoder.TransformerXEncoder.Params()
  p.encoder.token_emb.Set(
      embedding_dim=model_dim,
      max_num_shards=num_shard,
      params_init=emb_params_init,
      vocab_size=vocab_size,
      vn=disable_vn,
      scale_sqrt_depth=True)
  p.encoder.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)
  # Encoder TransformerStack params
  p.encoder.model_dim = model_dim
  p.encoder.transformer_stack.model_dim = model_dim
  p.encoder.transformer_stack.num_transformer_layers = num_layers
  p.encoder.transformer_stack.mask_self_atten = False
  p.encoder.input_dropout_prob = input_dropout_prob
  tr_atten_tpl = p.encoder.transformer_stack.transformer_tpl.tr_atten_tpl
  tr_atten_tpl.Set(
      num_attention_heads=num_heads,
      residual_dropout_prob=residual_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      params_init=attention_params_init,
      add_unnormalized_input=add_unnormalized_residuals,
      atten_hidden_dim=atten_hidden_dim,
      vn=disable_vn)
  tr_atten_tpl.atten_tpl.Set(
      num_attention_heads=num_heads,
      enable_ctx_pre_proj=True,
      enable_ctx_post_proj=True,
      context_dim=model_dim,
      params_init=attention_params_init,
      vn=disable_vn)
  tr_atten_tpl.atten_tpl.inner_atten_params.Set(use_dim_scale=use_dim_scale)
  tr_fflayer_tpl = p.encoder.transformer_stack.transformer_tpl.tr_fflayer_tpl
  tr_fflayer_tpl.Set(
      hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      params_init=default_params_init,
      vn=disable_vn,
      activation=activation)
  tr_fflayer_tpl.fflayer_tpl.projection.Set(params_init=default_params_init)

  p.decoder = decoder.TransformerXDecoder.Params()
  p.decoder.source_dim = model_dim
  p.decoder.model_dim = model_dim
  p.decoder.num_trans_layers = num_layers
  p.decoder.input_dropout_prob = input_dropout_prob
  p.decoder.token_emb.Set(
      vocab_size=vocab_size,
      embedding_dim=model_dim,
      max_num_shards=num_shard,
      params_init=emb_params_init,
      vn=disable_vn,
      scale_sqrt_depth=True)
  p.decoder.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)
  p.decoder.trans_tpl.source_dim = model_dim
  # NOTE: tr_atten_tpl is rebound to the decoder template from here on.
  tr_atten_tpl = p.decoder.trans_tpl.tr_atten_tpl
  tr_atten_tpl.Set(
      source_dim=model_dim,
      num_attention_heads=num_heads,
      residual_dropout_prob=residual_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      params_init=attention_params_init,
      add_unnormalized_input=add_unnormalized_residuals,
      atten_hidden_dim=atten_hidden_dim,
      vn=disable_vn)
  tr_atten_tpl.atten_tpl.Set(
      enable_ctx_pre_proj=True,
      enable_ctx_post_proj=True,
      context_dim=model_dim,
      params_init=attention_params_init,
      enable_per_dim_scale=use_dim_scale,
      vn=disable_vn)
  tr_atten_tpl.atten_tpl.inner_atten_params.Set(use_dim_scale=use_dim_scale)
  p.decoder.trans_tpl.tr_fflayer_tpl.Set(
      input_dim=model_dim,
      hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      params_init=default_params_init,
      vn=disable_vn,
      activation=activation)
  p.decoder.trans_tpl.tr_fflayer_tpl.fflayer_tpl.projection.Set(
      params_init=default_params_init)
  p.decoder.softmax.Set(
      num_classes=vocab_size,
      vn=disable_vn,
      params_init=emb_params_init,
      num_shards=num_shard)
  p.decoder.per_word_avg_loss = True
  p.decoder.label_smoothing = layers.UniformLabelSmoother.Params()
  p.decoder.label_smoothing.num_classes = vocab_size
  p.decoder.label_smoothing.uncertainty = label_smoothing_uncertainty
  p.decoder.per_example_tensors = True

  # Use post-layer-norm in both encoder and decoder sub-layers.
  p.decoder.trans_tpl.tr_atten_tpl.pre_layer_norm = False
  p.decoder.trans_tpl.tr_fflayer_tpl.pre_layer_norm = False
  p.encoder.transformer_stack.transformer_tpl.tr_atten_tpl.pre_layer_norm = False
  p.encoder.transformer_stack.transformer_tpl.tr_fflayer_tpl.pre_layer_norm = False

  p.train.Set(
      learning_rate=learning_rate,
      optimizer=optimizer.Adam.ParamsB(),
      clip_gradient_norm_to_value=0.0,
      grad_norm_to_clip_to_zero=0.0,
      lr_schedule=schedule.TransformerSchedule.Params().Set(
          warmup_steps=warmup_steps, worker_replicas=1, model_dim=model_dim))
  p.eval.samples_per_summary = 12000
  return p
def DefaultVN():
  """Returns the default variational-noise configuration (noise disabled)."""
  vn_params = py_utils.VariationalNoiseParams(None, False, False)
  return vn_params
def testDecoderFPropWithAdapters(self):
  """Create decoder with adapters, and verify that FProp runs."""
  with self.session(use_gpu=False):
    tf.random.set_seed(8372749040)
    params = _DecoderParams(
        num_rnn_layers=2,
        vn_config=py_utils.VariationalNoiseParams(
            None, True, False, seed=12345))
    params.rnn_cell_dim = 3
    # Small adapter bottleneck; task id is looked up from encoder_outputs
    # via the 'domain_ids' field configured below.
    params.adapter_layer_tpl.Set(
        bottleneck_dim=4,
        num_tasks=16,
        projection_params_init=py_utils.WeightInit.Gaussian(0.01))
    params.adapter_task_id_field = 'domain_ids'
    dec = params.Instantiate()
    src_seq_len = 5
    src_enc = tf.random.normal([src_seq_len, 2, 8],
                               seed=982774838,
                               dtype=py_utils.FPropDtype(params))
    src_enc_padding = tf.constant(
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
        dtype=py_utils.FPropDtype(params))
    # One random task id per source sequence, in [0, num_tasks).
    domain_ids = tf.constant(np.random.randint(low=0, high=16, size=[2]))
    encoder_outputs = py_utils.NestedMap(
        encoded=src_enc, padding=src_enc_padding, domain_ids=domain_ids)
    # shape=[4, 5]
    target_ids = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                     [5, 6, 7, 8], [10, 5, 2, 5]], dtype=tf.int32))
    # shape=[4, 5]
    target_labels = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                     [5, 7, 8, 10], [10, 5, 2, 4]], dtype=tf.int32))
    # shape=[4, 5]
    target_paddings = tf.transpose(
        tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
                     [1, 1, 1, 0]], dtype=py_utils.FPropDtype(params)))
    target_transcripts = tf.constant(['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
    target_weights = 1.0 - target_paddings
    # ids/labels/weights/paddings are all in [batch, time] shape.
    targets = py_utils.NestedMap({
        'ids': target_ids,
        'labels': target_labels,
        'weights': target_weights,
        'paddings': target_paddings,
        'transcripts': target_transcripts,
    })
    decoder_outputs = dec.FPropDefaultTheta(encoder_outputs, targets)
    metrics = decoder_outputs.metrics
    per_sequence_loss = decoder_outputs.per_sequence['loss']
    self.assertIn('fraction_of_correct_next_step_preds', metrics)
    self.evaluate(tf.global_variables_initializer())
    metrics_val, per_sequence_loss_val = self.evaluate(
        [metrics, per_sequence_loss])
    tf.logging.info('metrics=%s, per_sequence_loss=%s', metrics_val,
                    per_sequence_loss_val)
    # With per-token averaging, loss equals log perplexity.
    self.assertEqual(metrics_val['loss'], metrics_val['log_pplx'])
    # Target batch size is 4. Therefore, we should expect 4 here.
    self.assertEqual(per_sequence_loss_val.shape, (4,))
def testForwardPassWithStackingAfterFinalLayer(self):
  """Forward pass with frame stacking enabled matches golden outputs."""
  with self.session(use_gpu=False):
    vn_config = py_utils.VariationalNoiseParams(None, False, False)
    p = self._EncoderParams(vn_config)
    # Stack each frame with one left-context frame and subsample by 2,
    # applied after layer index 1.
    p.stacking_layer_tpl.left_context = 1
    p.stacking_layer_tpl.right_context = 0
    p.stacking_layer_tpl.stride = 2
    p.layer_index_before_stacking = 1
    enc_out = self._ForwardPass(p).encoded
    # Reduce over time so the golden values are a [2, 64] batch summary.
    enc_out_sum = tf.reduce_sum(enc_out, 0)
    tf.global_variables_initializer().run()
    # pyformat: disable
    # pylint: disable=bad-whitespace
    expected_enc_out = [
        [ -1.25796525e-02, -2.32883729e-02,  7.40477070e-03, -4.51436592e-03,
          -5.84740378e-03,  2.30195466e-03, -3.08505213e-03,  4.05658083e-03,
          -8.12252797e-03, -1.08030904e-02, -4.17955732e-03, -3.73707339e-03,
           6.97144482e-04,  2.79850606e-03,  8.33133236e-04, -5.75614115e-03,
          -1.10648498e-02, -1.20132393e-03, -1.69872947e-03,  6.97519444e-03,
           2.46211258e-03, -1.28190573e-02, -8.66306946e-05, -6.09322963e-03,
           7.14540575e-03, -5.67986863e-05,  5.17684873e-03,  1.18097477e-02,
           1.74862407e-02,  9.13049746e-03,  7.31027778e-03,  4.83186450e-05,
          -1.38104409e-02, -2.56096497e-02,  1.04327593e-02, -5.15327370e-03,
          -8.69584084e-03,  1.33647269e-03, -1.84873224e-03,  5.81806153e-03,
          -1.17716007e-02, -1.23606063e-02, -2.58761784e-03, -6.46180846e-03,
           4.11718246e-03,  6.22369815e-03,  4.84800315e-04, -8.21352564e-03,
          -1.25989169e-02,  6.75740885e-04, -2.09423108e-03,  4.02465323e-03,
           6.08023722e-03, -1.15798926e-02, -6.19094400e-03, -1.03260633e-02,
           8.31142440e-03,  3.74771934e-03,  7.58658582e-03,  1.32339774e-02,
           2.02648211e-02,  8.03512800e-03,  1.21787926e-02,  4.27130330e-03],
        [ -5.94401825e-03,  4.23503201e-03, -7.39302021e-03,  3.84659087e-03,
           2.92047067e-03, -2.28955783e-03,  7.80778937e-05,  7.74920732e-03,
          -1.29534695e-02, -1.44997425e-02,  3.00848205e-03, -1.33561785e-04,
           7.31927902e-03, -2.24683899e-03, -6.27679843e-03, -5.35295857e-03,
          -5.39031485e-03, -4.90641687e-05,  4.03603073e-03, -1.08133641e-03,
           9.59445070e-03,  9.81783494e-03,  8.77558347e-03, -5.13678743e-03,
           7.19959754e-03,  3.93835502e-03, -6.01979066e-03,  6.13247836e-03,
           1.39782019e-03,  4.60287556e-04,  1.04263611e-02, -9.61792190e-03,
          -1.02399308e-02,  8.54056142e-03, -1.22422148e-02,  6.58972748e-03,
           3.18149826e-03, -2.79453350e-03, -9.98417381e-04,  1.77927073e-02,
          -2.28664111e-02, -2.73113251e-02,  6.44177478e-03, -5.66864444e-04,
           1.58752780e-02,  2.18148530e-03, -1.31809842e-02, -9.98921506e-03,
          -9.63711366e-03,  1.11398206e-03,  4.28507291e-03, -3.02007422e-04,
           1.06751733e-02,  1.15796775e-02,  1.35387452e-02, -1.02765551e-02,
           1.11750513e-02,  4.31185029e-03, -1.04119312e-02,  8.54373723e-03,
           4.97616245e-04, -3.82199232e-03,  2.10159980e-02, -1.68744288e-02]
    ]
    # pylint: enable=bad-whitespace
    # pyformat: enable
    enc_out_sum_val = enc_out_sum.eval()
    print('expected enc_out_sum_val', enc_out_sum_val)
    self.assertAllClose(expected_enc_out, enc_out_sum_val)
def SetupTransformerDecoder(model_dim,
                            vocab_size,
                            num_layers,
                            num_heads,
                            hidden_dim,
                            residual_dropout_prob=0.1,
                            input_dropout_prob=0.0,
                            atten_dropout_prob=0.0,
                            relu_dropout_prob=0.0,
                            label_smoothing_uncertainty=0.1,
                            is_transparent=False,
                            activation='RELU',
                            add_unnormalized_residuals=False,
                            atten_hidden_dim=0):
  """Common setup for transformer model decoder."""
  # Shared initializers / defaults reused by every sub-layer below.
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Xavier(1.0)
  emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

  # Decoder
  decoder_params = decoder.TransformerDecoder.Params()
  decoder_params.source_dim = model_dim
  decoder_params.model_dim = model_dim
  decoder_params.num_trans_layers = num_layers
  decoder_params.input_dropout_prob = input_dropout_prob
  decoder_params.token_emb.Set(
      vocab_size=vocab_size,
      embedding_dim=model_dim,
      max_num_shards=16,
      params_init=emb_params_init,
      vn=disable_vn,
      scale_sqrt_depth=True)
  decoder_params.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)
  decoder_params.trans_tpl.source_dim = model_dim
  decoder_params.trans_tpl.tr_atten_tpl.Set(
      source_dim=model_dim,
      num_attention_heads=num_heads,
      residual_dropout_prob=residual_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      params_init=default_params_init,
      add_unnormalized_input=add_unnormalized_residuals,
      atten_hidden_dim=atten_hidden_dim,
      vn=disable_vn)
  decoder_params.trans_tpl.tr_atten_tpl.atten_tpl.Set(
      enable_ctx_pre_proj=True,
      enable_ctx_post_proj=True,
      context_dim=model_dim,
      vn=disable_vn)
  decoder_params.trans_tpl.tr_fflayer_tpl.Set(
      input_dim=model_dim,
      hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      params_init=default_params_init,
      vn=disable_vn,
      activation=activation)
  decoder_params.softmax.Set(
      num_classes=vocab_size,
      vn=disable_vn,
      params_init=emb_params_init,
      num_shards=16)
  decoder_params.per_word_avg_loss = True
  decoder_params.label_smoothing = layers.UniformLabelSmoother.Params()
  decoder_params.label_smoothing.num_classes = vocab_size
  decoder_params.label_smoothing.uncertainty = label_smoothing_uncertainty
  if is_transparent:
    decoder_params.is_transparent = True
  return decoder_params
def SetupTransformerEncoder(model_dim,
                            vocab_size,
                            num_layers,
                            num_heads,
                            hidden_dim,
                            residual_dropout_prob=0.1,
                            input_dropout_prob=0.0,
                            atten_dropout_prob=0.0,
                            relu_dropout_prob=0.0,
                            is_transparent=False,
                            activation='RELU',
                            add_unnormalized_residuals=False,
                            atten_hidden_dim=0):
  """Common setup for transformer model encoder.

  Args:
    model_dim: specifies dimension of transformer layers, token embeddings,
      and positional embeddings as well context vectors (attention values).
    vocab_size: for token embeddings.
    num_layers: number of transformer layers.
    num_heads: number of attention heads.
    hidden_dim: in transformer feedforward layer.
    residual_dropout_prob: used in transformer feedforward and attention
      layer.
    input_dropout_prob: input dropout.
    atten_dropout_prob: used in attention layer.
    relu_dropout_prob: used in transformer feedforward layer.
    is_transparent: if set, outputs a merger of embeddings and layer outputs.
    activation: Non-linearity for feed-forward layers.
    add_unnormalized_residuals: If set, uses un-normalized residuals in
      TransformerAttentionLayer
    atten_hidden_dim: Explicitly set attention hidden dim.

  Returns:
    Encoder params.
  """
  # Shared initializers / defaults reused by every sub-layer below.
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Xavier(1.0)
  emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

  # Encoder
  encoder_params = encoder.TransformerEncoder.Params()
  encoder_params.token_emb.Set(
      embedding_dim=model_dim,
      max_num_shards=16,
      params_init=emb_params_init,
      vocab_size=vocab_size,
      vn=disable_vn,
      scale_sqrt_depth=True)
  encoder_params.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)

  # Encoder TransformerStack params
  encoder_params.model_dim = model_dim
  encoder_params.transformer_stack.model_dim = model_dim
  encoder_params.transformer_stack.num_transformer_layers = num_layers
  encoder_params.input_dropout_prob = input_dropout_prob
  encoder_params.transformer_stack.transformer_tpl.tr_atten_tpl.Set(
      num_attention_heads=num_heads,
      residual_dropout_prob=residual_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      params_init=default_params_init,
      add_unnormalized_input=add_unnormalized_residuals,
      atten_hidden_dim=atten_hidden_dim,
      vn=disable_vn)
  encoder_params.transformer_stack.transformer_tpl.tr_atten_tpl.atten_tpl.Set(
      num_attention_heads=num_heads,
      enable_ctx_pre_proj=True,
      enable_ctx_post_proj=True,
      context_dim=model_dim,
      vn=disable_vn)
  encoder_params.transformer_stack.transformer_tpl.tr_fflayer_tpl.Set(
      hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      params_init=default_params_init,
      vn=disable_vn,
      activation=activation)
  if is_transparent:
    encoder_params.transformer_stack.is_transparent = True
  return encoder_params
def _testDecoderFPropFloatHelper(self,
                                 func_inline=False,
                                 num_decoder_layers=1,
                                 target_seq_len=5,
                                 residual_start=0):
  """Computes decoder from params and computes loss with random inputs."""
  cluster = cluster_factory.ForTestingWorker(add_summary=True)
  config = tf.ConfigProto(graph_options=tf.GraphOptions(
      optimizer_options=tf.OptimizerOptions(
          do_function_inlining=func_inline)))
  with cluster, self.session(graph=tf.Graph(), use_gpu=False,
                             config=config) as sess:
    tf.set_random_seed(8372749040)
    vn_config = py_utils.VariationalNoiseParams(None, False, False)
    p = self._DecoderParams(vn_config)
    p.rnn_layers = num_decoder_layers
    p.residual_start = residual_start
    p.target_seq_len = target_seq_len
    dec = p.cls(p)
    src_seq_len = 5
    src_enc = tf.random_normal([src_seq_len, 2, 8], seed=9283748)
    src_enc_padding = tf.constant(
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
        dtype=tf.float32)
    encoder_outputs = py_utils.NestedMap(encoded=src_enc,
                                         padding=src_enc_padding)
    # Targets are time-major: [time=5, batch=4] after the transpose.
    target_ids = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                     [5, 6, 7, 8], [10, 5, 2, 5]], dtype=tf.int32))
    target_labels = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                     [5, 7, 8, 10], [10, 5, 2, 4]], dtype=tf.int32))
    target_paddings = tf.transpose(
        tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
                     [1, 1, 1, 1]], dtype=tf.float32))
    target_transcripts = tf.constant(
        ['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
    target_weights = 1.0 - target_paddings
    targets = py_utils.NestedMap({
        'ids': target_ids,
        'labels': target_labels,
        'weights': target_weights,
        'paddings': target_paddings,
        'transcripts': target_transcripts,
    })
    metrics = dec.FPropDefaultTheta(encoder_outputs, targets)
    loss = metrics['loss'][0]
    correct_predicts = metrics['fraction_of_correct_next_step_preds'][0]
    # Also evaluate summaries to make sure summary ops are well-formed.
    summaries = tf.summary.merge(
        tf.get_collection(tf.GraphKeys.SUMMARIES))
    tf.global_variables_initializer().run()
    loss_v, _ = sess.run([loss, correct_predicts])
    summaries.eval()
    return loss_v
def testEncoderConstruction(self):
  """Encoder params must instantiate without raising."""
  params = self._EncoderParams(
      py_utils.VariationalNoiseParams(None, True, False))
  # Construction itself is the assertion: it must not raise.
  encoder.AsrEncoder(params)
def Params(cls):
  """Configs for `MTDecoderV1`."""
  p = super(MTDecoderV1, cls).Params()
  # Shared embedding.
  p.Define('emb', layers.EmbeddingLayer.Params(), 'Embedding layer params.')
  p.Define('source_dim', 1024, 'Dimension of the source encoding.')
  p.Define('attention', attention.AdditiveAttention.Params(),
           'Additive attention params.')
  p.Define('atten_rnn_cell_tpl', rnn_cell.LSTMCellSimple.Params(),
           'Attention RNNCell params template.')
  p.Define('rnn_cell_tpl', rnn_cell.LSTMCellSimple.Params(),
           'RNNCell params template.')
  p.Define('rnn_cell_dim', 1024, 'size of the rnn cells.')
  p.Define('rnn_layers', 8, 'Number of rnn layers.')
  p.Define('residual_start', 2, 'Start residual connections from this layer.')
  p.Define('atten_rnn_cls', rnn_layers.FRNNWithAttention,
           'Which atten rnn cls to use.')
  p.Define('use_prev_atten_ctx', False,
           'If True, all decoder layers use previous attention context as '
           'input. Otherwise, only first decoder layer uses previous '
           'attention context and the rest of the layers use current '
           'attention context.')
  p.Define('dropout_prob', 0.0, 'Prob at which we do dropout.')
  # Default value was mildly tuned. Could be further tuned in the future.
  p.Define('qlogsoftmax_range_min', -10.0, 'Quantization of the output of '
           'log softmax.')
  p.Define(
      'use_zero_atten_state', False, 'To use zero attention state '
      'instead of computing attention with zero query vector.')
  p.Define('cc_schedule', None, 'Clipping cap schedule.')

  # Shared defaults applied to the sub-layer templates below.
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Uniform(0.04)

  # Default config for the embedding.
  p.emb.vn = disable_vn
  p.emb.vocab_size = 32000
  p.emb.embedding_dim = 1024
  p.emb.max_num_shards = 16
  p.emb.params_init = default_params_init

  # Default config for the attention model.
  p.attention.vn = disable_vn
  p.attention.hidden_dim = 1024
  p.attention.params_init = None  # Filled in after dims are known.
  # Default config for the attention rnn cell.
  p.atten_rnn_cell_tpl.vn = disable_vn
  p.atten_rnn_cell_tpl.params_init = default_params_init
  # Default config for the rnn cell.
  p.rnn_cell_tpl.vn = disable_vn
  p.rnn_cell_tpl.params_init = default_params_init
  # Default config for the softmax part.
  p.softmax.vn = disable_vn
  p.softmax.num_classes = 32000  # 32k
  p.softmax.num_shards = 16
  p.softmax.params_init = default_params_init
  # Default config for beam search.
  p.target_seq_len = 300
  p.beam_search.length_normalization = 0.2
  p.beam_search.coverage_penalty = 0.2
  return p