def universal_transformer_tiny():
  hparams = transformer.transformer_tiny()
  hparams = update_hparams_for_universal_transformer(hparams)
  # Anywhere that we get good scores for the tiny model, it's with "sepconv".
  hparams.transformer_ffn_type = "sepconv"
  hparams.num_rec_steps = 8
  return hparams
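# Usage sketch (illustrative, not part of the snippets above): an hparams
# helper such as universal_transformer_tiny() is just a function returning an
# HParams object, so its overrides can be inspected or tweaked further before
# building a model. Assumes `transformer` and
# `update_hparams_for_universal_transformer` are importable as in the snippet.
hparams = universal_transformer_tiny()
assert hparams.transformer_ffn_type == "sepconv"
assert hparams.num_rec_steps == 8
hparams.num_rec_steps = 4  # further override before constructing the model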
def transformer_extra_tiny():
  hparams = transformer.transformer_tiny()
  hparams.num_hidden_layers = 1
  hparams.hidden_size = 32
  hparams.filter_size = 128
  hparams.num_heads = 2
  return hparams
def testSlowVsFast(self):
  model, features = get_model(transformer.transformer_tiny())

  decode_length = 30

  out_logits, _ = model(features)
  out_logits = tf.squeeze(out_logits, axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)
  apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

  with self.test_session():
    tf.global_variables_initializer().run()
    for _ in range(10):
      apply_grad.run()

  model.set_mode(tf.estimator.ModeKeys.PREDICT)

  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    greedy_result = model._slow_greedy_infer(
        features, decode_length)["outputs"]
    greedy_result = tf.squeeze(greedy_result, axis=[2, 3])
    fast_result = model._greedy_infer(features, decode_length)["outputs"]

  with self.test_session():
    greedy_res = greedy_result.eval()
    fast_res = fast_result.eval()

  self.assertEqual(fast_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length))
  self.assertAllClose(greedy_res, fast_res)
def get_model(hparams=None, mode=tf.estimator.ModeKeys.TRAIN,
              has_input=True, model_cls=transformer.Transformer):
  if hparams is None:
    hparams = transformer.transformer_tiny()
  hparams.hidden_size = 8
  hparams.filter_size = 32
  hparams.num_heads = 1
  hparams.layer_prepostprocess_dropout = 0.0

  p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE,
                                                   hparams)
  if not has_input:
    del p_hparams.modality["inputs"]
  hparams.problem_hparams = p_hparams

  inputs = -1 + np.random.random_integers(
      VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
  targets = -1 + np.random.random_integers(
      VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "targets": tf.constant(targets, dtype=tf.int32, name="targets"),
      "target_space_id": tf.constant(1, dtype=tf.int32)
  }
  if has_input:
    features["inputs"] = tf.constant(inputs, dtype=tf.int32, name="inputs")

  return model_cls(hparams, mode, p_hparams), features
def _create_greedy_infer_model(self):
  """Creates model for greedy inference testing.

  Returns:
    model: A t2t model.
    features: A map of string to tensor.
  """
  model, features = get_model(transformer.transformer_tiny())

  out_logits, _ = model(features)
  out_logits = tf.squeeze(out_logits, axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)
  apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

  with self.test_session():
    tf.global_variables_initializer().run()
    for _ in range(10):
      apply_grad.run()

  model.set_mode(tf.estimator.ModeKeys.PREDICT)

  return model, features
def m2m_m_transformer_hparams():
  hparams = transformer_tiny()
  hparams.num_hidden_layers = 4
  hparams.hidden_size = 128
  hparams.filter_size = 512
  hparams.num_heads = 4
  return hparams
def _testTransformer(self, net):
  batch_size = 3
  input_length = 5
  target_length = 7
  vocab_size = 9
  hparams = transformer.transformer_tiny()
  p_hparams = problem_hparams.test_problem_hparams(
      hparams, vocab_size, vocab_size)
  inputs = -1 + np.random.random_integers(
      vocab_size, size=(batch_size, input_length, 1, 1))
  targets = -1 + np.random.random_integers(
      vocab_size, size=(batch_size, target_length, 1, 1))
  with self.test_session() as session:
    features = {
        "inputs": tf.constant(inputs, dtype=tf.int32),
        "targets": tf.constant(targets, dtype=tf.int32),
        "target_space_id": tf.constant(1, dtype=tf.int32),
    }
    model = net(hparams, p_hparams)
    sharded_logits, _, _ = model.model_fn(features, True)
    logits = tf.concat(sharded_logits, 0)
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape, (batch_size, target_length, 1, 1, vocab_size))
def transformer_extra_tiny_agg():
  hparams = transformer.transformer_tiny()
  hparams.batch_size = 8
  hparams.num_hidden_layers = 1
  hparams.hidden_size = 64
  hparams.filter_size = 256
  hparams.num_heads = 2
  return hparams
def testEvolvedTransformer(self):
  model, features = get_model(hparams=transformer.transformer_tiny())
  logits, _ = model(features)
  with self.test_session() as session:
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape,
                   (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE))
def dstc_transformer_hparams_v8():
  hparams = transformer_tiny()
  hparams.num_hidden_layers = 4
  hparams.dropout = 0.7
  hparams.hidden_size = 128
  hparams.filter_size = 512
  hparams.num_heads = 4
  return hparams
def t_rel_len2048_dropout15_tiny():
  """Hparams for LM with relative attention, tiny transformer."""
  # hparams = transformer.transformer_base()
  hparams = transformer.transformer_tiny()
  update_transformer_hparams_for_music(hparams)
  update_truncate_length(hparams, 2048)
  update_dropout(hparams, 0.15)
  hparams.self_attention_type = "dot_product_relative_v2"
  # Need to specify num_hidden_layers.
  hparams.attention_key_channels = 512
  hparams.num_hidden_layers = 8
  return hparams
def get_model():
  hparams = transformer.transformer_tiny()
  hparams.layer_prepostprocess_dropout = 0.0
  p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE,
                                                   hparams)
  hparams.problem_hparams = p_hparams

  inputs = np.random.randint(VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
  targets = np.random.randint(
      VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "targets": tf.constant(targets, dtype=tf.int32, name="targets"),
      "target_space_id": tf.constant(1, dtype=tf.int32),
      "inputs": tf.constant(inputs, dtype=tf.int32, name="inputs"),
  }
  return (evolved_transformer.EvolvedTransformer(
      hparams, tf.estimator.ModeKeys.TRAIN, p_hparams), features)
def transformer_encoding(node_seq_input, num_nodes, params, mode):
  """Constructs a node-level encoder based on the transformer module.

  Args:
    node_seq_input: tf.Tensor. A tensor with 3 dimensions.
    num_nodes: tf.Tensor. Number of nodes per instance.
    params: dict. A parameter dictionary.
    mode: tf.estimator.ModeKeys object.

  Returns:
    node_seq_output: tf.Tensor. A tensor with 3 dimensions.
  """
  node_weights = tf.sequence_mask(num_nodes)
  hparams = transformer.transformer_tiny()
  hparams.hidden_size = params["transformer_hidden_unit"]
  hparams.num_heads = params["transformer_head"]
  hparams.num_hidden_layers = params["transformer_hidden_layer"]
  if hparams.hidden_size % hparams.num_heads != 0:
    raise ValueError(
        "transformer_hidden_unit must be divisible by transformer_head.")
  transformer_encoder = transformer.TransformerEncoder(hparams, mode=mode)

  # Input shape [batch_size, sequence_length, 1, hidden_dim].
  node_seq_input = tf.layers.dense(node_seq_input, hparams.hidden_size)
  node_seq_input_reshape = tf.expand_dims(node_seq_input, 2)
  # "targets" and "target_space_id" are required by the transformer decoder;
  # both are set to 0 for the encoder.
  node_seq_output = transformer_encoder(
      {
          "inputs": node_seq_input_reshape,
          "targets": 0,
          "target_space_id": 0,
      },
      nonpadding=node_weights)
  node_seq_output = tf.squeeze(node_seq_output[0], 2)
  # Residual connection: add the projected input to the encoder output.
  node_seq_output = tf.add(node_seq_input, node_seq_output)
  return node_seq_output
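# Usage sketch for transformer_encoding() above. Shapes and params values are
# illustrative assumptions, not taken from the original code.
import tensorflow as tf

node_features = tf.random_uniform([2, 6, 16])  # [batch, max_nodes, feature_dim]
nodes_per_instance = tf.constant([6, 4])       # valid node count per instance
params = {
    "transformer_hidden_unit": 64,
    "transformer_head": 4,
    "transformer_hidden_layer": 2,
}
encoded = transformer_encoding(
    node_features, nodes_per_instance, params,
    mode=tf.estimator.ModeKeys.TRAIN)
# encoded has shape [2, 6, 64]: one hidden_size-wide vector per node, with the
# residual connection to the densely projected inputs.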
def transformer_hparams(hidden_size):
  """Creates hyperparameters for the autoregressive prior.

  Args:
    hidden_size: Width of attention layers and neural network output layer.

  Returns:
    hparams: Hyperparameters with basic presets for a Transformer.
  """
  hparams = transformer.transformer_tiny()
  hparams.add_hparam("shared_rel", False)
  hparams.add_hparam("q_filter_width", 1)
  hparams.add_hparam("kv_filter_width", 1)
  hparams.hidden_size = hidden_size
  hparams.num_layers = 6
  hparams.layer_prepostprocess_dropout = 0.
  hparams.attention_dropout = 0.
  hparams.relu_dropout = 0.
  hparams.block_length = 1
  hparams.block_width = 1
  hparams.ffn_layer = "conv_hidden_relu"
  return hparams
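# Usage sketch: transformer_hparams() above is parameterized only by the model
# width; all other presets are fixed. The chosen hidden_size is illustrative.
hparams = transformer_hparams(hidden_size=256)
assert hparams.hidden_size == 256
assert hparams.ffn_layer == "conv_hidden_relu"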
def testBeamVsFast(self):
  model, features = get_model(transformer.transformer_tiny())

  decode_length = 30

  out_logits, _ = model(features)
  out_logits = tf.squeeze(out_logits, axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)
  apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

  with self.test_session():
    tf.global_variables_initializer().run()
    for _ in range(10):
      apply_grad.run()

  model.set_mode(tf.estimator.ModeKeys.PREDICT)

  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    beam_result = model._beam_decode_slow(
        features, decode_length, beam_size=4, top_beams=1,
        alpha=1.0)["outputs"]
    fast_result = model._beam_decode(
        features, decode_length, beam_size=4, top_beams=1,
        alpha=1.0)["outputs"]

  with self.test_session():
    beam_res = beam_result.eval()
    fast_res = fast_result.eval()

  self.assertAllClose(beam_res, fast_res)
def universal_transformer_tiny1():
  hparams = transformer.transformer_tiny()
  hparams = update_hparams_for_universal_transformer(hparams)
  hparams.num_rec_steps = 8
  return hparams
def transformer_aux_tiny():
  """Set of hyperparameters."""
  hparams = transformer.transformer_tiny()
  hparams.shared_embedding_and_softmax_weights = False
  hparams.add_hparam("shift_values", "1,2")
  return hparams
def transformer_tiny_bs3():
  hparams = transformer.transformer_tiny()
  hparams.add_hparam("block_size", 3)
  return hparams
def r_transformer_tiny():
  hparams = transformer.transformer_tiny()
  hparams = update_hparams_for_r_transformer(hparams)
  hparams.num_rec_steps = 8
  return hparams
def evolved_transformer_tiny():
  """Base parameters for Evolved Transformer model."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_tiny())
  hparams.learning_rate_schedule = "constant*single_cycle_cos_decay"
  return hparams
def universal_transformer_tiny():
  hparams = transformer.transformer_tiny()
  hparams = update_hparams_for_universal_transformer(hparams)
  hparams.num_rec_steps = 8
  return hparams