Example #1
  def _create_greedy_infer_model(self):
    """Creates model for greedy inference testing.

    Returns:
      model: A t2t model.
      features: A map of string to tensor.
    """
    model, features = get_model(transformer.transformer_small())

    out_logits, _ = model(features)
    out_logits = tf.squeeze(out_logits, axis=[2, 3])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
        labels=tf.reshape(features["targets"], [-1]))
    loss = tf.reduce_mean(loss)
    apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

    with self.test_session():
      tf.global_variables_initializer().run()
      for _ in range(100):
        apply_grad.run()

    model.set_mode(tf.estimator.ModeKeys.PREDICT)

    return model, features
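The `get_model` helper that this and several later tests call is not shown in this listing. A minimal sketch of what it presumably looks like, modeled on the `getModel` method in Example #32 (the constants `BATCH_SIZE`, `INPUT_LENGTH`, `TARGET_LENGTH`, and `VOCAB_SIZE` are assumed module-level test values; the numbers below are illustrative):

import numpy as np
import tensorflow as tf
from tensor2tensor.data_generators import problem_hparams
from tensor2tensor.models import transformer

# Assumed test constants; the real values are not part of this listing.
BATCH_SIZE = 3
INPUT_LENGTH = 5
TARGET_LENGTH = 7
VOCAB_SIZE = 10


def get_model(hparams, mode=tf.estimator.ModeKeys.TRAIN):
  """Builds a tiny Transformer plus random integer features for testing."""
  hparams.hidden_size = 8
  hparams.filter_size = 32
  hparams.num_heads = 1
  hparams.layer_prepostprocess_dropout = 0.0

  p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE)
  hparams.problems = [p_hparams]

  inputs = np.random.randint(VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
  targets = np.random.randint(VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "inputs": tf.constant(inputs, dtype=tf.int32),
      "targets": tf.constant(targets, dtype=tf.int32),
      "target_space_id": tf.constant(1, dtype=tf.int32),
  }
  return transformer.Transformer(hparams, mode, p_hparams), features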
Example #2
 def testTransformer(self):
   model, features = get_model(transformer.transformer_small())
   logits, _ = model(features)
   with self.test_session() as session:
     session.run(tf.global_variables_initializer())
     res = session.run(logits)
   self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE))
Example #3
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.add_hparam("z_size", 16)
  hparams.add_hparam("noise_dev", 0.0)
  hparams.add_hparam("d_mix", 0.5)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("do_refine", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024*64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("kl_steps", 35000)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("kmeans_lr_factor", 0.002)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("use_gumbel_softmax", True)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.kl_warmup_steps = 150000
  hparams.force_full_predict = True
  return hparams
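Hyperparameter sets like the one above are normally exposed to tensor2tensor through the registry so they can be selected by name. A small sketch of that pattern, assuming `transformer_ae_small` above is in scope (`my_transformer_ae_tiny` is a hypothetical name):

from tensor2tensor.utils import registry


@registry.register_hparams
def my_transformer_ae_tiny():
  """Hypothetical derived set for quick experiments, built on the set above."""
  hparams = transformer_ae_small()
  hparams.num_hidden_layers = 2
  hparams.hidden_size = 128
  return hparams

Once registered, the set is typically selected by name, e.g. --hparams_set=my_transformer_ae_tiny when launching t2t-trainer.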
Example #4
  def testSlowVsFast(self):
    model, features = get_model(transformer.transformer_small())

    decode_length = 3

    out_logits, _ = model(features)
    out_logits = tf.squeeze(out_logits, axis=[2, 3])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
        labels=tf.reshape(features["targets"], [-1]))
    loss = tf.reduce_mean(loss)
    apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

    with self.test_session():
      tf.global_variables_initializer().run()
      for _ in range(100):
        apply_grad.run()

    model.set_mode(tf.estimator.ModeKeys.PREDICT)

    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      greedy_result = model._slow_greedy_infer(
          features, decode_length)["outputs"]
      greedy_result = tf.squeeze(greedy_result, axis=[2, 3])

      fast_result = model._greedy_infer(features, decode_length)["outputs"]

    with self.test_session():
      greedy_res = greedy_result.eval()
      fast_res = fast_result.eval()

    self.assertEqual(fast_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length))
    self.assertAllClose(greedy_res, fast_res)
Example #5
def small_librispeech_model(param_overrides=None):
    hparams = transformer.transformer_small()
    hparams.hidden_size = 8
    hparams.filter_size = 32
    hparams.num_heads = 1
    hparams.layer_prepostprocess_dropout = 0.0
    p_hparams = librispeech.Librispeech().get_hparams(hparams)
    p_hparams.vocab_size["targets"] = VOCAB_SIZE
    hparams.problem_hparams = p_hparams
    model = transformer.Transformer(hparams, problem_hparams=p_hparams)
    if param_overrides is not None:  # Add or Set any provided HParams
        assert isinstance(param_overrides, dict)
        for param_name in param_overrides:
            if hasattr(hparams, param_name):
                hparams.set_hparam(param_name, param_overrides[param_name])
            else:
                hparams.add_hparam(param_name, param_overrides[param_name])
    inputs = np.random.rand(BATCH_SIZE, INPUT_LENGTH, 80,
                            3).astype("float32")  # modify for speech
    targets = np.random.randint(VOCAB_SIZE,
                                size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
    features = {
        "inputs": tf.constant(inputs, dtype=tf.float32, name="inputs"),
        "targets": tf.constant(targets, dtype=tf.int32, name="targets"),
        "target_space_id": tf.constant(1, dtype=tf.int32)
    }
    return model, features
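The `param_overrides` argument above lets a caller tweak or extend the hparams without defining a new set. A short illustrative usage (`extra_flag` is a hypothetical new hparam):

model, features = small_librispeech_model(
    param_overrides={"num_heads": 2,       # existing hparam -> set_hparam
                     "extra_flag": True})  # unknown hparam  -> add_hparam
logits, _ = model(features)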
Example #6
    def testSlowVsFastNoInput(self):
        model, features = get_model(transformer.transformer_small(),
                                    has_input=False)

        decode_length = 3

        out_logits, _ = model(features)
        out_logits = tf.squeeze(out_logits, axis=[2, 3])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
            labels=tf.reshape(features["targets"], [-1]))
        loss = tf.reduce_mean(loss)
        apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

        with self.test_session():
            tf.global_variables_initializer().run()
            for _ in range(100):
                apply_grad.run()

        model.set_mode(tf.estimator.ModeKeys.PREDICT)

        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            slow_result = model._slow_greedy_infer(features,
                                                   decode_length)["outputs"]
            slow_result = tf.squeeze(slow_result, axis=[2, 3])

            fast_result = model._greedy_infer(features,
                                              decode_length)["outputs"]

        with self.test_session():
            slow_res = slow_result.eval()
            fast_res = fast_result.eval()

        self.assertEqual(slow_res.shape, (BATCH_SIZE, decode_length))
        self.assertAllClose(slow_res, fast_res)
Example #7
    def _create_greedy_infer_model(self):
        """Creates model for greedy inference testing.

        Returns:
          model: A t2t model.
          features: A map of string to tensor.
        """
        model, features = get_model(transformer.transformer_small())

        out_logits, _ = model(features)
        out_logits = tf.squeeze(out_logits, axis=[2, 3])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
            labels=tf.reshape(features["targets"], [-1]))
        loss = tf.reduce_mean(loss)
        apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

        with self.test_session():
            tf.global_variables_initializer().run()
            for _ in range(100):
                apply_grad.run()

        model.set_mode(tf.estimator.ModeKeys.PREDICT)

        return model, features
Example #8
 def testTransformer(self):
   model, features = self.getModel(transformer.transformer_small())
   logits, _ = model(features)
   with self.test_session() as session:
     session.run(tf.global_variables_initializer())
     res = session.run(logits)
   self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE))
Example #9
def transformer_ae_small():
    """Set of hyperparameters."""
    hparams = transformer.transformer_small()
    hparams.batch_size = 2048
    hparams.learning_rate_warmup_steps = 4000
    hparams.num_hidden_layers = 3
    hparams.hidden_size = 384
    hparams.filter_size = 2048
    hparams.label_smoothing = 0.0
    hparams.add_hparam("z_size", 16)
    hparams.add_hparam("noise_dev", 1.0)
    hparams.add_hparam("d_mix", 0.5)
    # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae.
    hparams.add_hparam("bottleneck_kind", "semhash")
    hparams.add_hparam("do_ae", True)
    hparams.add_hparam("do_mask", True)
    hparams.add_hparam("do_refine", True)
    hparams.add_hparam("drop_inputs", False)
    hparams.add_hparam("v_size", 1024 * 64)
    hparams.add_hparam("max_context_length", 64)
    hparams.add_hparam("num_compress_steps", 3)
    hparams.add_hparam("kl_steps", 35000)
    hparams.add_hparam("startup_steps", 10000)
    hparams.add_hparam("kmeans_lr_factor", 0.002)
    hparams.add_hparam("z_dropout", 0.1)
    hparams.add_hparam("is_2d", 0)
    hparams.add_hparam("use_gumbel_softmax", True)
    hparams.add_hparam("softmax_k", 0)
    hparams.add_hparam("decode_autoregressive", True)
    hparams.add_hparam("do_vae", True)
    hparams.add_hparam("bit_vae", True)
    hparams.add_hparam("beta", 0.25)
    hparams.kl_warmup_steps = 150000
    return hparams
Example #10
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.add_hparam("compress_filter_size", 2048 * 2)
  hparams.label_smoothing = 0.0
  hparams.optimizer = "Adam"  # Can be unstable, maybe try Adam.
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.997  # Needs tuning, try 0.98 to 0.999.
  hparams.add_hparam("z_size", 14)
  hparams.add_hparam("noise_dev", 0.5)
  hparams.add_hparam("d_mix", 0.5)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, dvq.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("num_blocks", 1)
  hparams.add_hparam("num_decode_blocks", 1)
  # Reshape method for DVQ: slice, project
  hparams.add_hparam("reshape_method", "slice")
  hparams.add_hparam("trainable_projections", False)
  # Hparams for the Dirichlet process
  hparams.add_hparam("dp_alpha", 0.5)
  hparams.add_hparam("dp_strength", 0.25)
  hparams.add_hparam("dp_decay", 1.0)
  hparams.add_hparam("unmasked_percentage", 0.1)
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("do_refine", False)
  hparams.add_hparam("do_attend_compress", False)
  hparams.add_hparam("do_attend_decompress", True)
  hparams.add_hparam("do_residual_compress", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024*64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("kmeans_lr_factor", 0.002)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("ema", True)
  hparams.add_hparam("random_top_k", 1)
  hparams.kl_warmup_steps = 150000
  hparams.force_full_predict = True

  # task params
  hparams.add_hparam("task", "translate")  # translate or image tasks supported
  return hparams
Example #11
 def get_hparams(self):
     hparams = transformer.transformer_small()
     hparams.add_hparam("prior_type", "affine")
     hparams.add_hparam("factor", 2)  # squeezing factor
     hparams.add_hparam("n_layers_transform_params", 1)
     hparams.add_hparam("n_1x1_heads", N_1X1_HEADS)
     hparams.add_hparam("flow_num_1x1_heads", 4)
     hparams.add_hparam("flow_num_heads", 4)
     hparams.add_hparam("flow_model_d", 64)
     hparams.add_hparam("flow_d_ff", 128)
     hparams.add_hparam("flow_layer_prepostprocess_dropout", 0.0)
     hparams.add_hparam("flow_attention_dropout", 0.0)
     hparams.add_hparam("flow_relu_dropout", 0.0)
     hparams.add_hparam("latent_size", N_CHANNELS)
     hparams.add_hparam("use_weightnorm", True)
     hparams.add_hparam("kl_startup_steps", 2000)
     hparams.add_hparam("affine_scale", "glow")
     hparams.add_hparam("scale_width", 0.999)
     hparams.add_hparam("step_fn", "glow")  # glow / chunting
     hparams.add_hparam("conv_fn", "np")  # np / tf
     hparams.add_hparam("posterior_type", "diagonal_normal")
     hparams.causal_decoder_self_attention = False
     hparams.model_d = model_d
     hparams.weight_dtype = "float32"
     hparams.add_hparam("pos_attn", False)
     return hparams
Example #12
def transformer_sketch():
    """Basic transformer_sketch hparams."""
    hparams = transformer.transformer_small()
    hparams.num_compress_steps = 4
    hparams.batch_size = 32
    hparams.clip_grad_norm = 2.
    hparams.sampling_method = "random"
    return hparams
Example #13
def transformer_sketch():
  """Basic transformer_sketch hparams."""
  hparams = transformer.transformer_small()
  hparams.num_compress_steps = 4
  hparams.batch_size = 32
  hparams.clip_grad_norm = 2.
  hparams.sampling_method = "random"
  return hparams
Example #14
def transformer_vae_small():
    """Set of hyperparameters."""
    hparams = transformer.transformer_small()
    hparams.batch_size = 2048
    hparams.learning_rate_warmup_steps = 16000
    hparams.add_hparam("z_size", 128)
    hparams.add_hparam("num_compress_steps", 4)
    hparams.add_hparam("kl_warmup_steps", 60000)
    return hparams
Example #15
  def testBeamVsFast(self):
    model, features = self.getModel(transformer.transformer_small())

    decode_length = 2

    out_logits, _ = model.model_fn(features)
    out_logits = tf.squeeze(out_logits[0], axis=[2, 3])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
        labels=tf.reshape(features["targets"], [-1]))
    loss = tf.reduce_mean(loss)
    apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

    with self.test_session():
      tf.global_variables_initializer().run()
      for _ in range(100):
        apply_grad.run()

    model, _ = self.getModel(transformer.transformer_small(),
                             mode=tf.estimator.ModeKeys.PREDICT)

    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      beam_result = model._beam_decode_slow(
          features,
          decode_length,
          beam_size=4,
          top_beams=1,
          last_position_only=True,
          alpha=1.0)

      fast_result = model._beam_decode(
          features,
          decode_length,
          beam_size=4,
          top_beams=1,
          last_position_only=True,
          alpha=1.0)

    with self.test_session():
      beam_res = beam_result.eval()
      fast_res = fast_result.eval()

    self.assertEqual(fast_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length))
    self.assertAllClose(beam_res, fast_res)
Example #16
def imagetransformer_latent_tiny():
  """Tiny set of hparams for a latent image model."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 16
  hparams.filter_size = 32
  hparams.compress_filter_size = 64
  hparams.ffn_layer = "conv_hidden_relu"
  hparams.layer_prepostprocess_dropout = 0.2
  hparams.layer_preprocess_sequence = "none"
  hparams.layer_postprocess_sequence = "dan"
  hparams.dropout = 0.3
  hparams.pos = "timing"
  hparams.num_encoder_layers = 1
  hparams.num_decoder_layers = 2
  hparams.use_pad_remover = False
  hparams.add_hparam("logit_normalization", True)
  hparams.add_hparam("bottleneck_kind", "dvq")
  hparams.add_hparam("bottleneck_bits", 4)
  hparams.add_hparam("num_residuals", 1)
  hparams.add_hparam("use_gold_targets", False)
  hparams.add_hparam("do_compress_attend", False)
  hparams.add_hparam("do_decompress_attend", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("num_compress_steps", 2)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("latent_dropout", 0.0)
  hparams.add_hparam("decode_autoregressive", False)
  hparams.add_hparam("vq_beta", 0.25)
  hparams.add_hparam("vq_epsilon", 1e-5)
  hparams.add_hparam("vq_decay", 0.999)
  hparams.add_hparam("ema", False)
  hparams.add_hparam("soft_em", True)
  hparams.add_hparam("num_samples", 1)
  hparams.add_hparam("num_latent_layers", 2)
  hparams.add_hparam("num_res_layers", 2)
  hparams.add_hparam("res_kernel_size", 3)
  hparams.add_hparam("num_blocks", 1)
  hparams.add_hparam("reshape_method", "slice")
  hparams.add_hparam("shared_rel", False)
  hparams.add_hparam("block_size", 1)
  hparams.add_hparam("kernel_size", 3)
  hparams.add_hparam("img_len", 8)
  hparams.add_hparam("num_channels", 1)
  hparams.add_hparam("local_and_global_att", False)
  hparams.add_hparam("block_length", 32)
  hparams.add_hparam("block_width", 128)
  hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_1D)
  hparams.add_hparam("latent_attention_type", cia.AttentionType.GLOBAL)
  hparams.add_hparam("block_raster_scan", False)
  hparams.add_hparam("num_latents", 1)
  hparams.add_hparam("q_filter_width", 1)
  hparams.add_hparam("kv_filter_width", 1)
  return hparams
Example #17
def imagetransformer_latent_tiny():
    """Tiny set of hparams for a latent image model."""
    hparams = transformer.transformer_small()
    hparams.batch_size = 2
    hparams.num_hidden_layers = 3
    hparams.hidden_size = 16
    hparams.filter_size = 32
    hparams.compress_filter_size = 64
    hparams.ffn_layer = "conv_hidden_relu"
    hparams.layer_prepostprocess_dropout = 0.2
    hparams.layer_preprocess_sequence = "none"
    hparams.layer_postprocess_sequence = "dan"
    hparams.dropout = 0.3
    hparams.pos = "timing"
    hparams.num_encoder_layers = 1
    hparams.num_decoder_layers = 2
    hparams.use_pad_remover = False
    hparams.add_hparam("logit_normalization", True)
    hparams.add_hparam("bottleneck_kind", "dvq")
    hparams.add_hparam("bottleneck_bits", 4)
    hparams.add_hparam("num_residuals", 1)
    hparams.add_hparam("use_gold_targets", False)
    hparams.add_hparam("do_compress_attend", False)
    hparams.add_hparam("do_decompress_attend", False)
    hparams.add_hparam("drop_inputs", False)
    hparams.add_hparam("num_compress_steps", 2)
    hparams.add_hparam("startup_steps", 10000)
    hparams.add_hparam("mask_startup_steps", 50000)
    hparams.add_hparam("latent_dropout", 0.0)
    hparams.add_hparam("decode_autoregressive", False)
    hparams.add_hparam("vq_beta", 0.25)
    hparams.add_hparam("vq_epsilon", 1e-5)
    hparams.add_hparam("vq_decay", 0.999)
    hparams.add_hparam("ema", False)
    hparams.add_hparam("soft_em", True)
    hparams.add_hparam("num_samples", 1)
    hparams.add_hparam("num_latent_layers", 2)
    hparams.add_hparam("num_res_layers", 2)
    hparams.add_hparam("res_kernel_size", 3)
    hparams.add_hparam("num_blocks", 1)
    hparams.add_hparam("reshape_method", "slice")
    hparams.add_hparam("shared_rel", False)
    hparams.add_hparam("block_size", 1)
    hparams.add_hparam("kernel_size", 3)
    hparams.add_hparam("img_len", 8)
    hparams.add_hparam("num_channels", 1)
    hparams.add_hparam("local_and_global_att", False)
    hparams.add_hparam("block_length", 32)
    hparams.add_hparam("block_width", 128)
    hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_1D)
    hparams.add_hparam("latent_attention_type", cia.AttentionType.GLOBAL)
    hparams.add_hparam("block_raster_scan", False)
    hparams.add_hparam("num_latents", 1)
    hparams.add_hparam("q_filter_width", 1)
    hparams.add_hparam("kv_filter_width", 1)
    return hparams
Example #18
 def testTransformer(self, get_model_fn=None, p=None):
   if get_model_fn:
     model, features = get_model_fn(param_overrides=p)
   else:
     model, features = get_model(transformer.transformer_small())
   logits, _ = model(features)
   with self.test_session() as session:
     session.run(tf.global_variables_initializer())
     res = session.run(logits)
   self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE))
Example #19
 def _get_encoder_hparams(self):
     hparams = transformer.transformer_small()
     hparams.add_hparam("encoder_layer_list",
                        layers.ENCODER_LAYERS.get_layer_names())
     hparams.add_hparam("encoder_output_dim_list", [32] + [64] *
                        (len(hparams.encoder_layer_list) - 2) + [32])
     hparams.add_hparam("encoder_activation_list", ["none"] + ["relu"] *
                        (len(hparams.encoder_layer_list) - 1))
     hparams.add_hparam("encoder_norm_list", ["none"] + ["layer_norm"] *
                        (len(hparams.encoder_layer_list) - 1))
     return hparams
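For concreteness, a sketch of the lists this helper builds, assuming `layers.ENCODER_LAYERS.get_layer_names()` returned four hypothetical layer names:

layer_names = ["attn", "ffn", "attn", "ffn"]                # assumed
output_dims = [32] + [64] * (len(layer_names) - 2) + [32]   # [32, 64, 64, 32]
activations = ["none"] + ["relu"] * (len(layer_names) - 1)  # ["none", "relu", "relu", "relu"]
norms = ["none"] + ["layer_norm"] * (len(layer_names) - 1)  # ["none"] followed by 3 * ["layer_norm"]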
Example #20
 def testTransformerWithEncoderDecoderAttentionLoss(self):
     model, features = self.getModel(transformer.transformer_small())
     expected_attention_weights = np.random.random_sample(
         size=(BATCH_SIZE, TARGET_LENGTH, INPUT_LENGTH))
     features["expected_attention_weights"] = tf.constant(
         expected_attention_weights, dtype=tf.float32)
     _, extra_loss = model(features)
     with self.test_session() as session:
         session.run(tf.global_variables_initializer())
         res = session.run(extra_loss["attention_loss"])
     self.assertEqual(res.shape, ())
Example #21
def transformer_small_sketch():
  """Modified transformer_small."""
  hparams = transformer_small()
  hparams.batch_size = 2048
  hparams.max_length = 784
  hparams.clip_grad_norm = 5.
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.initializer = "orthogonal"
  hparams.sampling_method = "random"
  hparams.learning_rate_warmup_steps = 10000
  return hparams
Example #22
def transformer_small_sketch():
    """Modified transformer_small."""
    hparams = transformer_small()
    hparams.batch_size = 2048
    hparams.max_length = 784
    hparams.clip_grad_norm = 5.
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.initializer = "orthogonal"
    hparams.sampling_method = "random"
    hparams.learning_rate_warmup_steps = 10000
    return hparams
Example #23
def transformer_ae_small():
    """Set of hyperparameters."""
    hparams = transformer.transformer_small()
    hparams.batch_size = 2048
    hparams.learning_rate_warmup_steps = 4000
    hparams.add_hparam("z_size", 128)
    hparams.add_hparam("v_size", 1024 * 32)
    hparams.add_hparam("num_compress_steps", 4)
    hparams.add_hparam("kl_warmup_steps", 60000)
    hparams.add_hparam("startup_steps", 30000)
    hparams.add_hparam("kmeans_lr_factor", 0.002)
    hparams.add_hparam("z_dropout", 0.1)
    return hparams
Example #24
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.optimizer = "Adafactor"
  hparams.add_hparam("z_size", 16)
  hparams.add_hparam("noise_dev", 0.0)
  hparams.add_hparam("d_mix", 0.5)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("num_blocks", 1)
  # Reshape method for hierarchical vq-vae: slice, project
  hparams.add_hparam("reshape_method", "slice")
  hparams.add_hparam("trainable_projections", False)
  hparams.add_hparam("unmasked_percentage", 0.3)
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("do_refine", False)
  hparams.add_hparam("do_attend_compress", False)
  hparams.add_hparam("do_attend_decompress", True)
  hparams.add_hparam("do_residual_compress", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024*64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("kl_steps", 35000)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("kmeans_lr_factor", 0.002)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("use_gumbel_softmax", True)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("ema", True)
  hparams.add_hparam("random_top_k", 1)
  hparams.kl_warmup_steps = 150000
  hparams.force_full_predict = True
  return hparams
Example #25
def transformer_adv_small():
    """Set of hyperparameters."""
    hparams = transformer.transformer_small()
    hparams.batch_size = 2048
    hparams.learning_rate_warmup_steps = 4000
    hparams.num_hidden_layers = 3
    hparams.hidden_size = 384
    hparams.filter_size = 2048
    hparams.label_smoothing = 0.0
    hparams.weight_decay = 0.1
    hparams.symbol_modality_skip_top = int(True)
    hparams.add_hparam("num_compress_steps", 2)
    hparams.add_hparam("extra_steps", 0)
    hparams.add_hparam("noise_val", 0.3)
    hparams.add_hparam("delta_max", 2.0)
    return hparams
Example #26
def build_model():
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    model = getTransformerModel(transformer.transformer_small())

    with tf.name_scope("input"):
        input = tf.placeholder(tf.float32,
                               [batch_size, num_steps, num_features],
                               name="input")
        target = tf.placeholder(tf.int32, [batch_size, num_steps],
                                name="target")
        input2 = tf.reshape(input, [batch_size, num_steps, num_features, 1])
        target2 = tf.reshape(target, [batch_size, num_steps, 1, 1])

    features = {
        "inputs": input2,
        "targets": target2,
        "target_space_id": tf.constant(1, dtype=tf.int32)
    }

    out_logits, _ = model(features)
    out_logits = tf.squeeze(out_logits, axis=[2, 3])

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=tf.reshape(out_logits, [-1, num_classes]),
        labels=tf.reshape(features["targets"], [-1]))
    loss = tf.reduce_mean(loss)

    last_predicted = tf.split(tf.cast(tf.argmax(out_logits, 2), tf.int32),
                              num_steps, 1)[-1]
    last_target = tf.split(target, num_steps, 1)[-1]

    confusion_mat = tf.confusion_matrix(tf.reshape(last_target, [batch_size]),
                                        tf.reshape(last_predicted,
                                                   [batch_size]),
                                        num_classes=num_classes,
                                        name='batch_confusion')

    acc = tf.reduce_mean(
        tf.cast(tf.equal(last_predicted, last_target), tf.float32))
    grad_op = tf.train.AdamOptimizer().minimize(loss, global_step=global_step)

    loss_summary = tf.summary.scalar('cross_entropy', loss)
    acc_summary = tf.summary.scalar('accuracy', acc)
    summary_op = tf.summary.merge_all()
    return input, target, loss, acc, grad_op, summary_op, global_step, confusion_mat
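A brief usage sketch for the handles returned above (feed shapes follow the placeholders; `batch_size`, `num_steps`, `num_features`, and `num_classes` are the same module-level values the function already assumes):

(input_ph, target_ph, loss, acc, grad_op,
 summary_op, global_step, confusion_mat) = build_model()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    x = np.random.rand(batch_size, num_steps, num_features).astype("float32")
    y = np.random.randint(num_classes, size=(batch_size, num_steps))
    _, step_loss, step_acc = sess.run([grad_op, loss, acc],
                                      feed_dict={input_ph: x, target_ph: y})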
Example #27
def transformer_ae_small():
    """Set of hyperparameters."""
    hparams = transformer.transformer_small()
    hparams.batch_size = 2048
    hparams.learning_rate_warmup_steps = 4000
    hparams.add_hparam("z_size", 128)
    hparams.add_hparam("v_size", 1024 * 32)
    hparams.add_hparam("num_compress_steps", 4)
    hparams.add_hparam("kl_warmup_steps", 60000)
    hparams.add_hparam("startup_steps", 30000)
    hparams.add_hparam("kmeans_lr_factor", 0.002)
    hparams.add_hparam("z_dropout", 0.1)
    hparams.add_hparam("is_2d", 0)
    hparams.add_hparam("use_gumbel_softmax", int(True))
    hparams.add_hparam("softmax_k", 4)
    hparams.add_hparam("decode_autoregressive", int(True))
    return hparams
Example #28
    def getModel(self):
        hparams = transformer.transformer_small()
        p_hparams = problem_hparams.test_problem_hparams(
            hparams, VOCAB_SIZE, VOCAB_SIZE)
        hparams.problems = [p_hparams]
        inputs = -1 + np.random.random_integers(
            VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
        targets = -1 + np.random.random_integers(
            VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
        features = {
            "inputs": tf.constant(inputs, dtype=tf.int32),
            "targets": tf.constant(targets, dtype=tf.int32),
            "target_space_id": tf.constant(1, dtype=tf.int32),
        }

        return transformer.Transformer(hparams, tf.estimator.ModeKeys.PREDICT,
                                       p_hparams), features
Example #29
    def testGreedySlowTPUVsNonTPU(self):
        # Only works with TF 1.8+
        # Version string can take the following form: "1.9.0-rc0"
        major_str, minor_str, unused_rest = tf.__version__.split(".", 3)
        major, minor = int(major_str), int(minor_str)
        if major < 1 or (major == 1 and minor < 8):
            return
        model, features = get_model(transformer.transformer_small())

        decode_length = 3

        out_logits, _ = model(features)
        out_logits = tf.squeeze(out_logits, axis=[2, 3])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
            labels=tf.reshape(features["targets"], [-1]))
        loss = tf.reduce_mean(loss)
        apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

        with self.test_session():
            tf.global_variables_initializer().run()
            for _ in range(100):
                apply_grad.run()

        model.set_mode(tf.estimator.ModeKeys.PREDICT)

        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            slow_result_non_tpu = model._slow_greedy_infer(
                features, decode_length)["outputs"]
            slow_result_non_tpu = tf.squeeze(slow_result_non_tpu, axis=[2, 3])

            slow_result_tpu = model._slow_greedy_infer_tpu(
                features, decode_length)["outputs"]
            slow_result_tpu = tf.squeeze(slow_result_tpu, axis=[2, 3])

        with self.test_session():
            slow_non_tpu_res = slow_result_non_tpu.eval()
            slow_tpu_res = slow_result_tpu.eval()

        self.assertEqual(slow_tpu_res.shape,
                         (BATCH_SIZE, INPUT_LENGTH + decode_length))
        self.assertAllClose(slow_tpu_res, slow_non_tpu_res)
Example #30
  def testBeamVsFast(self):
    model, features = get_model(transformer.transformer_small())

    decode_length = 2

    out_logits, _ = model(features)
    out_logits = tf.squeeze(out_logits, axis=[2, 3])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
        labels=tf.reshape(features["targets"], [-1]))
    loss = tf.reduce_mean(loss)
    apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

    with self.test_session():
      tf.global_variables_initializer().run()
      for _ in range(100):
        apply_grad.run()

    model.set_mode(tf.estimator.ModeKeys.PREDICT)

    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      beam_result = model._beam_decode_slow(
          features,
          decode_length,
          beam_size=4,
          top_beams=1,
          alpha=1.0)["outputs"]

      fast_result = model._beam_decode(
          features,
          decode_length,
          beam_size=4,
          top_beams=1,
          alpha=1.0)["outputs"]

    with self.test_session():
      beam_res = beam_result.eval()
      fast_res = fast_result.eval()

    self.assertAllClose(beam_res, fast_res)
Example #31
def transformer_l2_arctic():
    """HParams for training ASR model on L2 Arctic"""
    hparams = transformer_small()

    hparams.max_length = 1240000
    hparams.max_input_seq_length = 1550
    hparams.max_target_seq_length = 350
    hparams.batch_size = 16
    hparams.learning_rate = 0.15
    hparams.daisy_chain_variables = False
    hparams.num_heads = 2
    hparams.ffn_layer = "conv_relu_conv"
    hparams.conv_first_kernel = 9
    hparams.weight_decay = 0
    hparams.layer_prepostprocess_dropout = 0.2
    hparams.relu_dropout = 0.2

    hparams.num_decoder_layers = 1
    hparams.num_encoder_layers = 3
    # hparams.num_hidden_layers = 1
    # hparams.hidden_size = 256

    return hparams
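If this set is registered (e.g. with @registry.register_hparams), it would usually be chosen via --hparams_set=transformer_l2_arctic and fine-tuned with the --hparams flag; the in-code equivalent is HParams.parse. A small sketch:

hparams = transformer_l2_arctic()
# Comma-separated name=value pairs, the same syntax the --hparams flag accepts.
hparams.parse("batch_size=8,num_encoder_layers=2,learning_rate=0.1")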
Example #32
    def getModel(self, mode=tf.estimator.ModeKeys.TRAIN):
        hparams = transformer.transformer_small()
        hparams.hidden_size = 8
        hparams.filter_size = 32
        hparams.num_heads = 1
        hparams.layer_prepostprocess_dropout = 0.0

        p_hparams = problem_hparams.test_problem_hparams(
            VOCAB_SIZE, VOCAB_SIZE)
        hparams.problems = [p_hparams]

        inputs = -1 + np.random.random_integers(
            VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
        targets = -1 + np.random.random_integers(
            VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
        features = {
            "inputs": tf.constant(inputs, dtype=tf.int32),
            "targets": tf.constant(targets, dtype=tf.int32),
            "target_space_id": tf.constant(1, dtype=tf.int32),
        }

        return transformer.Transformer(hparams, tf.estimator.ModeKeys.PREDICT,
                                       p_hparams), features
Example #33
 def get_hparams(self):
     hparams = transformer.transformer_small()
     hparams.add_hparam("prior_type", "affine")
     hparams.add_hparam("depths", "12")  # infer n_levels from depths
     hparams.add_hparam("split_plans", "tca")
     hparams.add_hparam("factor", 2)  # squeezing factor
     hparams.add_hparam("n_layers_transform_params", 1)
     hparams.add_hparam("n_layers_multiscale_prior", 3)
     hparams.add_hparam("flow_num_heads", 4)
     hparams.add_hparam("flow_num_1x1_heads", N_1X1_HEADS)
     hparams.add_hparam("flow_hidden_size", 64)
     hparams.add_hparam("flow_filter_size", 128)
     hparams.add_hparam("cond_prior_on_src", True)
     hparams.add_hparam("bottom_prior_std", False)
     hparams.add_hparam("latent_size", N_CHANNELS)
     hparams.add_hparam("scale_width", 0.999)
     hparams.add_hparam("coupling_transform_ratio", 0.5)
     hparams.add_hparam("actnorm_type", "actnorm")
     hparams.add_hparam("actnorm_weightnorm", True)
     hparams.add_hparam("perm_type", "1x1")
     hparams.add_hparam("init_permutation", True)
     hparams.causal_decoder_self_attention = False
     hparams.hidden_size = HIDDEN_SIZE
     return hparams
Example #34
def transformer_nat_small():
    """Set of hyperparameters."""
    hparams = transformer.transformer_small()
    hparams.batch_size = 2048
    hparams.learning_rate = 0.2
    hparams.learning_rate_warmup_steps = 4000
    hparams.num_hidden_layers = 3
    hparams.hidden_size = 384
    hparams.filter_size = 2048
    hparams.label_smoothing = 0.0
    hparams.force_full_predict = True
    hparams.optimizer = "adam"
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.997
    hparams.add_hparam("bottleneck_kind", "vq")
    hparams.add_hparam("bottleneck_bits", 12)
    hparams.add_hparam("num_compress_steps", 3)
    hparams.add_hparam("beta", 0.25)
    hparams.add_hparam("epsilon", 1e-5)
    hparams.add_hparam("decay", 0.999)
    hparams.add_hparam("num_samples", 10)
    hparams.add_hparam("mask_startup_steps", 50000)
    return hparams
Example #35
def transformer_nat_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.force_full_predict = True
  hparams.optimizer = "Adam"
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("bottleneck_kind", "vq")
  hparams.add_hparam("bottleneck_bits", 12)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("num_samples", 10)
  hparams.add_hparam("mask_startup_steps", 50000)
  return hparams
Example #36
def transformer_nat_small():
    """Set of hyperparameters."""
    hparams = transformer.transformer_small()
    hparams.batch_size = 2048
    hparams.learning_rate = 0.2
    hparams.learning_rate_warmup_steps = 4000
    hparams.num_hidden_layers = 3
    hparams.hidden_size = 384
    hparams.filter_size = 2048
    hparams.label_smoothing = 0.0
    hparams.optimizer = "Adam"  # Can be unstable, maybe try Adam.
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.997  # Needs tuning, try 0.98 to 0.999.
    hparams.add_hparam("bottleneck_kind", "em")
    hparams.add_hparam("bottleneck_bits", 12)
    hparams.add_hparam("num_compress_steps", 3)
    hparams.add_hparam("startup_steps", 10000)
    hparams.add_hparam("mask_startup_steps", 50000)
    hparams.add_hparam("beta", 0.25)
    hparams.add_hparam("epsilon", 1e-5)
    hparams.add_hparam("decay", 0.999)
    hparams.add_hparam("num_samples", 10)
    return hparams
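Many of these listings differ only in a handful of fields; larger configurations are usually derived from the small set rather than rewritten from scratch. A hypothetical sketch of that pattern (name and values are illustrative):

def transformer_nat_base_sketch():
  """Hypothetical scaled-up variant derived from the small set above."""
  hparams = transformer_nat_small()
  hparams.num_hidden_layers = 6
  hparams.hidden_size = 512
  hparams.filter_size = 4096
  hparams.batch_size = 4096
  return hparams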
Example #37
def transformer_ae_small():
    """Set of hyperparameters."""
    hparams = transformer.transformer_small()
    hparams.batch_size = 2048
    hparams.learning_rate = 0.2
    hparams.learning_rate_warmup_steps = 4000
    hparams.num_hidden_layers = 3
    hparams.hidden_size = 384
    hparams.filter_size = 2048
    hparams.add_hparam("compress_filter_size", 2048 * 2)
    hparams.label_smoothing = 0.0
    hparams.optimizer = "Adam"  # Can be unstable, maybe try Adam.
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.997  # Needs tuning, try 0.98 to 0.999.
    hparams.add_hparam("z_size", 14)
    hparams.add_hparam("noise_dev", 0.5)
    hparams.add_hparam("d_mix", 0.5)
    hparams.add_hparam("logit_normalization", True)
    hparams.add_hparam("word_dropout", 0.0)
    # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, dvq.
    hparams.add_hparam("bottleneck_kind", "semhash")
    hparams.add_hparam("num_blocks", 1)
    hparams.add_hparam("num_decode_blocks", 1)
    # Add an hparam for the number of residuals
    hparams.add_hparam("num_residuals", 1)
    # Reshape method for DVQ: slice, project
    hparams.add_hparam("causal", True)
    hparams.add_hparam("reshape_method", "slice")
    hparams.add_hparam("trainable_projections", False)
    hparams.add_hparam("unmasked_percentage", 0.1)
    hparams.add_hparam("do_ae", True)
    hparams.add_hparam("do_mask", True)
    hparams.add_hparam("use_predict_mask", True)
    hparams.add_hparam("do_refine", False)
    hparams.add_hparam("do_attend_compress", False)
    hparams.add_hparam("do_attend_decompress", True)
    hparams.add_hparam("do_residual_compress", False)
    hparams.add_hparam("drop_inputs", False)
    hparams.add_hparam("v_size", 1024 * 64)
    hparams.add_hparam("max_context_length", 64)
    hparams.add_hparam("num_compress_steps", 3)
    hparams.add_hparam("startup_steps", 10000)
    hparams.add_hparam("mask_startup_steps", 50000)
    hparams.add_hparam("z_dropout", 0.1)
    hparams.add_hparam("is_2d", 0)
    hparams.add_hparam("softmax_k", 0)
    hparams.add_hparam("decode_autoregressive", True)
    hparams.add_hparam("do_vae", True)
    hparams.add_hparam("bit_vae", True)
    hparams.add_hparam("beta", 0.25)
    hparams.add_hparam("epsilon", 1e-5)
    hparams.add_hparam("decay", 0.999)
    hparams.add_hparam("ema", True)
    hparams.add_hparam("random_top_k", 1)
    hparams.add_hparam("soft_em", False)
    hparams.add_hparam("num_samples", 10)
    hparams.add_hparam("inv_temp", 1.0)
    hparams.add_hparam("entropy_scale", 0.0)
    hparams.add_hparam("prior_scale", 1.0)
    hparams.add_hparam("do_hard_gumbel_softmax", False)
    hparams.add_hparam("do_iaf", False)
    hparams.add_hparam("approximate_gs_entropy", False)
    hparams.add_hparam("temperature_warmup_steps", 150000)
    hparams.add_hparam("sum_over_latents", False)
    hparams.force_full_predict = True

    # task params
    hparams.add_hparam("task",
                       "translate")  # translate or image tasks supported
    return hparams
Example #38
    def test_calculate_branching_model_parameters_transformer(
            self, get_config, expected_hidden_depths):
        tf.reset_default_graph()

        (num_cells, left_inputs, left_layers, left_output_dims, right_inputs,
         right_layers, right_output_dims, combiner_functions,
         final_combiner_function, dummy_activations, dummy_norms,
         layer_registry, is_decoder) = get_config()

        # Get predicted number of parameters.
        (predicted_num_params, output_size, hidden_depths,
         _) = translation_nas_net.calculate_branching_model_parameters(
             encoding_depth=_EMBEDDING_DEPTH,
             left_inputs=left_inputs,
             left_layers=left_layers,
             left_output_dims=left_output_dims,
             right_inputs=right_inputs,
             right_layers=right_layers,
             right_output_dims=right_output_dims,
             combiner_functions=combiner_functions,
             final_combiner_function=final_combiner_function,
             layer_registry=layer_registry,
             num_cells=num_cells,
             encoder_depth=_EMBEDDING_DEPTH)

        # Create model graph.
        input_tensor = tf.zeros([32, _INPUT_LENGTH, _EMBEDDING_DEPTH])
        hparams = transformer.transformer_small()

        if is_decoder:
            nonpadding = None
            mask_future = True
            decoder_self_attention_bias = (
                common_attention.attention_bias_lower_triangle(_INPUT_LENGTH))
            encoder_cell_outputs = [input_tensor] * 6
        else:
            nonpadding = tf.ones([32, _INPUT_LENGTH])
            mask_future = False
            decoder_self_attention_bias = None
            encoder_cell_outputs = None

        translation_nas_net.apply_nas_layers(
            input_tensor=input_tensor,
            left_inputs=left_inputs,
            left_layers=left_layers,
            left_activations=dummy_activations,
            left_output_dims=left_output_dims,
            left_norms=dummy_norms,
            right_inputs=right_inputs,
            right_layers=right_layers,
            right_activations=dummy_activations,
            right_output_dims=right_output_dims,
            right_norms=dummy_norms,
            combiner_functions=combiner_functions,
            final_combiner_function=final_combiner_function,
            num_cells=num_cells,
            nonpadding=nonpadding,
            layer_registry=layer_registry,
            mask_future=mask_future,
            hparams=hparams,
            var_scope="test",
            encoder_decoder_attention_bias=None,
            encoder_cell_outputs=encoder_cell_outputs,
            decoder_self_attention_bias=decoder_self_attention_bias,
            final_layer_norm=False)

        # Count graph variables.
        trainable_variables_list = tf.trainable_variables()
        empirical_num_params = 0
        for variable_tensor in trainable_variables_list:
            empirical_num_params += _list_product(
                variable_tensor.shape.as_list())

        # Compare.
        self.assertEqual(empirical_num_params, predicted_num_params)
        self.assertEqual(output_size, _EMBEDDING_DEPTH)
        self.assertEqual(hidden_depths, expected_hidden_depths)
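The `_list_product` helper used above is not shown in this listing; presumably it just multiplies the entries of a shape list. A minimal sketch:

import functools
import operator


def _list_product(values):
  """Product of a list of ints, i.e. the element count of a variable with that shape."""
  return functools.reduce(operator.mul, values, 1)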
Example #39
    def build_model(self):
        # build index table
        index_table = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=self.config.vocab_list,
            num_oov_buckets=0,
            default_value=0)

        # get data iterator
        self.data_iterator = self.data.get_data_iterator(index_table,
                                                         mode=self.mode)

        # get inputs
        with tf.variable_scope("inputs"):
            # get the next batch when no data is fed explicitly
            next_batch = self.data_iterator.get_next()
            self.input_queries = tf.placeholder_with_default(
                next_batch["input_queries"], [None, self.config.max_length],
                name="input_queries")
            self.input_replies = tf.placeholder_with_default(
                next_batch["input_replies"], [None, self.config.max_length],
                name="input_replies")
            self.query_lengths = tf.placeholder_with_default(
                tf.squeeze(next_batch["query_lengths"]), [None],
                name="query_lengths")
            self.reply_lengths = tf.placeholder_with_default(
                tf.squeeze(next_batch["reply_lengths"]), [None],
                name="reply_lengths")

            # get hyperparams
            self.embed_dropout_keep_prob = tf.placeholder(
                tf.float64, name="embed_dropout_keep_prob")
            self.lstm_dropout_keep_prob = tf.placeholder(
                tf.float32, name="lstm_dropout_keep_prob")
            self.dense_dropout_keep_prob = tf.placeholder(
                tf.float32, name="dense_dropout_keep_prob")
            self.num_negative_samples = tf.placeholder(
                tf.int32, name="num_negative_samples")

        with tf.variable_scope("properties"):
            # length properties
            cur_batch_length = tf.shape(self.input_queries)[0]

            # get hparams from tensor2tensor.models.transformer
            hparams = transformer.transformer_small()
            hparams.batch_size = self.config.batch_size
            hparams.learning_rate_decay_steps = 10000
            hparams.learning_rate_minimum = 3e-5

            # learning rate
            lr = learning_rate.learning_rate_schedule(hparams)
            self.learning_rate = lr

        # embedding layer
        with tf.variable_scope("embedding"):
            embeddings = tf.Variable(get_embeddings(
                self.config.vocab_list, self.config.pretrained_embed_dir,
                self.config.vocab_size, self.config.embed_dim),
                                     trainable=True,
                                     name="embeddings")
            embeddings = tf.nn.dropout(
                embeddings,
                keep_prob=self.embed_dropout_keep_prob,
                noise_shape=[tf.shape(embeddings)[0], 1])
            queries_embedded = tf.to_float(
                tf.nn.embedding_lookup(embeddings,
                                       self.input_queries,
                                       name="queries_embedded"))
            replies_embedded = tf.to_float(
                tf.nn.embedding_lookup(embeddings,
                                       self.input_replies,
                                       name="replies_embedded"))

            self.queries_embedded = queries_embedded
            self.replies_embedded = replies_embedded

        # transformer layer
        with tf.variable_scope("transformer"):
            queries_expanded = tf.expand_dims(queries_embedded,
                                              axis=2,
                                              name="queries_expanded")
            replies_expanded = tf.expand_dims(replies_embedded,
                                              axis=2,
                                              name="replies_expanded")

            hparams = transformer.transformer_small()
            hparams.set_hparam("batch_size", self.config.batch_size)
            hparams.set_hparam("hidden_size", self.config.embed_dim)
            encoder = transformer.TransformerEncoder(hparams, mode=self.mode)

            self.queries_encoded = encoder({
                "inputs": queries_expanded,
                "targets": queries_expanded
            })[0]
            self.replies_encoded = encoder({
                "inputs": replies_expanded,
                "targets": replies_expanded
            })[0]

            self.queries_encoded = tf.squeeze(
                tf.reduce_sum(self.queries_encoded, axis=1, keep_dims=True))
            self.replies_encoded = tf.squeeze(
                tf.reduce_sum(self.replies_encoded, axis=1, keep_dims=True))

        with tf.variable_scope("sampling"):
            positive_mask = tf.eye(cur_batch_length)
            negative_mask = make_negative_mask(
                tf.zeros([cur_batch_length, cur_batch_length]),
                method=self.config.negative_sampling,
                num_negative_samples=self.num_negative_samples)
            negative_queries_indices, negative_replies_indices = tf.split(
                tf.where(tf.not_equal(negative_mask, 0)), [1, 1], 1)

            self.distances = tf.matmul(self.queries_encoded,
                                       self.replies_encoded,
                                       transpose_b=True)
            self.distances_flattened = tf.reshape(self.distances, [-1])

            self.positive_distances = tf.gather(
                self.distances_flattened,
                tf.where(tf.reshape(positive_mask, [-1])))
            self.negative_distances = tf.gather(
                self.distances_flattened,
                tf.where(tf.reshape(negative_mask, [-1])))

            self.negative_queries_indices = tf.squeeze(
                negative_queries_indices)
            self.negative_replies_indices = tf.squeeze(
                negative_replies_indices)

            self.positive_inputs = tf.concat([
                self.queries_encoded, self.positive_distances,
                self.replies_encoded
            ], 1)
            self.negative_inputs = tf.reshape(
                tf.concat([
                    tf.nn.embedding_lookup(self.queries_encoded,
                                           self.negative_queries_indices),
                    self.negative_distances,
                    tf.nn.embedding_lookup(self.replies_encoded,
                                           self.negative_replies_indices)
                ], 1), [
                    tf.shape(negative_queries_indices)[0],
                    self.config.embed_dim * 2 + 1
                ])

        with tf.variable_scope("prediction"):
            self.hidden_outputs = tf.layers.dense(tf.concat(
                [self.positive_inputs, self.negative_inputs], 0),
                                                  256,
                                                  tf.nn.relu,
                                                  name="hidden_layer")
            self.logits = tf.layers.dense(self.hidden_outputs,
                                          2,
                                          tf.nn.relu,
                                          name="output_layer")
            labels = tf.concat([
                tf.ones([tf.shape(self.positive_inputs)[0]], tf.float64),
                tf.zeros([tf.shape(self.negative_inputs)[0]], tf.float64)
            ], 0)

            self.labels = tf.one_hot(tf.to_int32(labels), 2)

            self.probs = tf.sigmoid(self.logits)
            self.predictions = tf.argmax(self.probs, 1)

        with tf.variable_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.labels,
                                                           logits=self.logits))
            self.train_step = optimize.optimize(self.loss,
                                                lr,
                                                hparams,
                                                use_tpu=False)

        with tf.variable_scope("score"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.labels, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   "float"),
                                           name="accuracy")
Example #40
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.add_hparam("compress_filter_size", 2048 * 2)
  hparams.label_smoothing = 0.0
  hparams.optimizer = "Adam"  # Can be unstable, maybe try Adam.
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.997  # Needs tuning, try 0.98 to 0.999.
  hparams.add_hparam("z_size", 14)
  hparams.add_hparam("noise_dev", 0.5)
  hparams.add_hparam("d_mix", 0.5)
  hparams.add_hparam("logit_normalization", True)
  hparams.add_hparam("word_dropout", 0.1)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, dvq.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("num_blocks", 1)
  hparams.add_hparam("num_decode_blocks", 1)
  # Add an hparam for the number of residuals
  hparams.add_hparam("num_residuals", 1)
  # Reshape method for DVQ: slice, project
  hparams.add_hparam("causal", True)
  hparams.add_hparam("reshape_method", "slice")
  hparams.add_hparam("trainable_projections", False)
  hparams.add_hparam("unmasked_percentage", 0.1)
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("use_predict_mask", True)
  hparams.add_hparam("do_refine", False)
  hparams.add_hparam("do_attend_compress", False)
  hparams.add_hparam("do_attend_decompress", True)
  hparams.add_hparam("do_residual_compress", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024*64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("ema", True)
  hparams.add_hparam("random_top_k", 1)
  hparams.add_hparam("soft_em", False)
  hparams.add_hparam("num_samples", 10)
  hparams.add_hparam("inv_temp", 1.0)
  hparams.kl_warmup_steps = 150000
  hparams.force_full_predict = True

  # task params
  hparams.add_hparam("task", "translate")  # translate or image tasks supported
  return hparams