Example #1
def shakeshake_cifar10():
  hparams = common_hparams.basic_params1()
  # This leads to effective batch size 128 when number of GPUs is 1
  hparams.batch_size = 4096 * 8
  hparams.hidden_size = 16
  hparams.dropout = 0
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 26
  hparams.kernel_height = -1  # Unused
  hparams.kernel_width = -1  # Unused
  hparams.learning_rate_decay_scheme = "cosine"
  # Model should be run for 700000 steps with batch size 128 (~1800 epochs)
  hparams.learning_rate_cosine_cycle_steps = 700000
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  # TODO(rshin): Adjust so that effective value becomes ~1e-4
  hparams.weight_decay = 3.0
  hparams.optimizer = "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.add_hparam('base_filters', 16)
  hparams.add_hparam('shakeshake_type', 'batch')
  return hparams
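
The two comments above fit together: at an effective batch size of 128 images, 700,000 steps over the 50,000-image CIFAR-10 training set is roughly 1,800 passes over the data. A quick sanity check of that arithmetic (assuming the standard CIFAR-10 training-set size):

steps = 700000
effective_batch = 128          # per the comment, with 1 GPU
cifar10_train_images = 50000   # assumption: standard CIFAR-10 split
epochs = steps * effective_batch / cifar10_train_images
print(epochs)  # 1792.0, i.e. the "~1800 epochs" mentioned in the comment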
Example #2
 def testNeuralGPU(self):
   hparams = common_hparams.basic_params1()
   batch_size = 3
   input_length = 5
   target_length = input_length
   input_vocab_size = 9
   target_vocab_size = 11
   p_hparams = problem_hparams.test_problem_hparams(hparams, input_vocab_size,
                                                    target_vocab_size)
   inputs = -1 + np.random.random_integers(
       input_vocab_size, size=(batch_size, input_length, 1, 1))
   targets = -1 + np.random.random_integers(
       target_vocab_size, size=(batch_size, target_length, 1, 1))
   with self.test_session() as session:
     features = {
         "inputs": tf.constant(inputs, dtype=tf.int32),
         "targets": tf.constant(targets, dtype=tf.int32)
     }
     model = neural_gpu.NeuralGPU(hparams, p_hparams)
     sharded_logits, _, _ = model.model_fn(features, True)
     logits = tf.concat(sharded_logits, 0)
     session.run(tf.global_variables_initializer())
     res = session.run(logits)
   self.assertEqual(res.shape, (batch_size, target_length, 1, 1,
                                target_vocab_size))
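
A note on the input generation used in this test: np.random.random_integers(n, size=...) samples the inclusive range [1, n], so the leading -1 shifts the symbols into [0, n-1], i.e. valid ids for a vocabulary of size n. A minimal check (random_integers is deprecated in newer NumPy in favor of randint, but this mirrors the test as written):

import numpy as np

vocab_size = 9
samples = -1 + np.random.random_integers(vocab_size, size=(1000,))
assert samples.min() >= 0 and samples.max() <= vocab_size - 1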
Example #3
def attention_lm_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 1.0
  hparams.learning_rate_warmup_steps = 1000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = int(False)

  hparams.add_hparam("filter_size", 4096)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("residual_dropout", 0.1)
  return hparams
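
All of these hparams sets follow the same pattern: start from common_hparams.basic_params1() (or another base), override fields, and register model-specific ones with add_hparam. Variants are then defined by calling the base function and changing only what differs. The function below is a hypothetical illustration of that pattern, not part of the library; its name and values are made up:

def attention_lm_small():
  """Hypothetical smaller variant: override only what differs from the base."""
  hparams = attention_lm_base()
  hparams.num_hidden_layers = 4
  hparams.hidden_size = 512
  hparams.filter_size = 2048
  hparams.residual_dropout = 0.2
  return hparams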
Example #4
def bluenet_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 4096
    hparams.hidden_size = 256
    hparams.dropout = 0.2
    hparams.symbol_dropout = 0.5
    hparams.label_smoothing = 0.1
    hparams.clip_grad_norm = 2.0
    hparams.num_hidden_layers = 8
    hparams.kernel_height = 3
    hparams.kernel_width = 3
    hparams.learning_rate_decay_scheme = "exp10k"
    hparams.learning_rate = 0.05
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 3.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    hparams.add_hparam("imagenet_use_2d", True)
    hparams.add_hparam("anneal_until", 40000)
    hparams.add_hparam("batch_deviation_loss_factor", 5.0)
    return hparams
Example #5
 def testNeuralGPU(self):
   hparams = common_hparams.basic_params1()
   batch_size = 3
   input_length = 5
   target_length = input_length
   input_vocab_size = 9
   target_vocab_size = 11
   p_hparams = problem_hparams.test_problem_hparams(hparams, input_vocab_size,
                                                    target_vocab_size)
   inputs = -1 + np.random.random_integers(
       input_vocab_size, size=(batch_size, input_length, 1, 1))
   targets = -1 + np.random.random_integers(
       target_vocab_size, size=(batch_size, target_length, 1, 1))
   with self.test_session() as session:
     features = {
         "inputs": tf.constant(inputs, dtype=tf.int32),
         "targets": tf.constant(targets, dtype=tf.int32)
     }
     model = neural_gpu.NeuralGPU(
         hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams)
     sharded_logits, _, _ = model.model_fn(features)
     logits = tf.concat(sharded_logits, 0)
     session.run(tf.global_variables_initializer())
     res = session.run(logits)
   self.assertEqual(res.shape, (batch_size, target_length, 1, 1,
                                target_vocab_size))
Example #6
def long_answer_base():
    """Set of hyperparameters.

  Returns:
    a hparams object
  """
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 1024
    hparams.batch_size = 8192
    hparams.max_length = 8192
    hparams.dropout = 0.0
    hparams.batching_mantissa_bits = 3
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 1000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 4
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(True)
    hparams.sampling_method = "random"
    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    # comma-separated list of layer numbers.
    # At each of these layers, we replace the ffn with a mixture of experts.
    hparams.add_hparam("moe_layers", "2")
    # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
    # If moe_n2 is an integer, then use a hierarchical MoE
    #   consisting of moe_n1 groups of moe_n2 experts each.
    hparams.add_hparam("moe_n1", 64)
    hparams.add_hparam("moe_n2", 0)
    hparams.add_hparam("moe_hidden_size", 2048)
    hparams.add_hparam("moe_loss_coef", 1e-2)
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("residual_dropout", 0.0)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("block_length", 512)
    hparams.add_hparam("answer_length_prob_train", 0.5)
    hparams.add_hparam("answer_length_infer", 1000)
    # We cannot handle long sequences at this point, so drop them during eval.
    # This affects evaluation metrics.
    # TODO(noam): find a different workaround
    hparams.eval_drop_long_sequences = int(True)
    return hparams
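
The moe_n1/moe_n2 comments above imply the expert count: with moe_n2 unset the ffn is replaced by a flat mixture of moe_n1 experts, while a positive moe_n2 gives moe_n1 groups of moe_n2 experts each. A small helper sketching that relationship (mine, not a library function):

def total_experts(moe_n1, moe_n2):
  """Expert count implied by the flat vs. hierarchical MoE comments."""
  if not moe_n2:               # 0 or None: flat MoE
    return moe_n1
  return moe_n1 * moe_n2       # hierarchical: moe_n1 groups of moe_n2 experts

print(total_experts(64, 0))    # 64, the flat configuration used above
print(total_experts(64, 4))    # 256 experts in a hierarchical MoE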
Example #7
def lstm_attention():
    """hparams for LSTM with attention."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.hidden_size = 128
    hparams.num_hidden_layers = 2

    # Attention
    hparams.add_hparam("attn_vec_size", hparams.hidden_size)
    return hparams
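
Because add_hparam receives a plain Python value, attn_vec_size is captured at construction time and equals the hidden_size set just above (128); overriding hidden_size afterwards does not update it. A quick check, assuming the usual HParams behaviour:

hparams = lstm_attention()
assert hparams.attn_vec_size == hparams.hidden_size == 128
hparams.hidden_size = 256            # later override...
assert hparams.attn_vec_size == 128  # ...leaves attn_vec_size untouched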
Example #8
def attention_lm_moe_base():
    """Set of hyperparameters.

  suitable for 1 gpu.
  on lm1b_16k:
     ~337M params
     1.1 steps/sec on  [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 1024
    hparams.batch_size = 8192
    hparams.max_length = 256
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 1000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 4
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(False)
    hparams.add_hparam("filter_size", 2948)  # Add new ones like this.
    # comma-separated list of layer numbers.
    # At each of these layers, we replace the ffn with a mixture of experts.
    hparams.add_hparam("moe_layers", "2")
    # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
    # If moe_n2 is an integer, then use a hierarchical MoE
    #   consisting of moe_n1 groups of moe_n2 experts each.
    hparams.add_hparam("moe_n1", 64)
    hparams.add_hparam("moe_n2", 0)
    hparams.add_hparam("moe_hidden_size", 2048)
    hparams.add_hparam("moe_loss_coef", 1e-2)
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("residual_dropout", 0.1)
    hparams.add_hparam("pos", "timing")  # timing, none
    return hparams
Example #9
def attention_lm_moe_base():
  """Set of hyperparameters.

  Suitable for 1 GPU.
  On lm1b_16k:
     ~337M params
     1.1 steps/sec on [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 1000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 4
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(False)
  hparams.add_hparam("filter_size", 2948)  # Add new ones like this.
  # comma-separated list of layer numbers.
  # At each of these layers, we replace the ffn with a mixture of experts.
  hparams.add_hparam("moe_layers", "2")
  # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
  # If moe_n2 is an integer, then use a hierarchical MoE
  #   consisting of moe_n1 groups of moe_n2 experts each.
  hparams.add_hparam("moe_n1", 64)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_hidden_size", 2048)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("residual_dropout", 0.1)
  return hparams
Example #10
 def testLSTMSeq2Seq(self):
   vocab_size = 9
   x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1))
   y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1))
   hparams = common_hparams.basic_params1()
   p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size,
                                                    vocab_size)
   with self.test_session() as session:
     features = {
         "inputs": tf.constant(x, dtype=tf.int32),
         "targets": tf.constant(y, dtype=tf.int32),
     }
     model = baseline.LSTMSeq2Seq(hparams, p_hparams)
     sharded_logits, _, _ = model.model_fn(features, True)
     logits = tf.concat(sharded_logits, 0)
     session.run(tf.global_variables_initializer())
     res = session.run(logits)
   self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size))
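
Note that the symbols here are drawn from [1, vocab_size - 1] rather than starting at 0; in Tensor2Tensor the id 0 is conventionally reserved for padding, so the test avoids sampling it. A minimal check of the sampling range:

import numpy as np

vocab_size = 9
x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1))
assert x.min() >= 1 and x.max() <= vocab_size - 1  # id 0 (padding) never sampled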
Example #11
def slicenet_params1():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.hidden_size = 768
    hparams.dropout = 0.5
    hparams.symbol_dropout = 0.2
    hparams.label_smoothing = 0.1
    hparams.clip_grad_norm = 2.0
    hparams.num_hidden_layers = 4
    hparams.kernel_height = 3
    hparams.kernel_width = 1
    hparams.add_hparam("normalizer_fn",
                       "layer")  # New ones are added like this.
    hparams.learning_rate_decay_scheme = "exp50k"
    hparams.learning_rate = 0.05
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 3.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    hparams.add_hparam("large_kernel_size", 15)
    hparams.add_hparam("separability", -2)
    # A dilation scheme, one of _DILATION_SCHEMES.
    hparams.add_hparam("dilation_scheme", "1.1.1.1")
    # A kernel scheme, one of _KERNEL_SCHEMES; overrides large_kernel_size.
    hparams.add_hparam("kernel_scheme", "3.7.15.31")
    hparams.add_hparam("audio_compression", 8)
    hparams.add_hparam("moe_n1", 32)
    hparams.add_hparam("moe_n2", 0)
    hparams.add_hparam("moe_loss_coef", 1e-2)
    hparams.add_hparam("imagenet_use_2d", int(True))
    # attention-related flags
    hparams.add_hparam("attention_type", "simple")
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    hparams.add_hparam("sim_loss_mult", 0.0)  # Try 10.0 for experiments.
    hparams.add_hparam("attention_dropout", 0.2)
    hparams.shared_embedding_and_softmax_weights = int(True)
    return hparams
Example #12
def slicenet_params1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.hidden_size = 768
  hparams.dropout = 0.5
  hparams.symbol_dropout = 0.2
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 4
  hparams.kernel_height = 3
  hparams.kernel_width = 1
  hparams.add_hparam("normalizer_fn", "layer")  # New ones are added like this.
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("large_kernel_size", 15)
  hparams.add_hparam("separability", -2)
  # A dilation scheme, one of _DILATION_SCHEMES.
  hparams.add_hparam("dilation_scheme", "1.1.1.1")
  # A kernel scheme, one of _KERNEL_SCHEMES; overrides large_kernel_size.
  hparams.add_hparam("kernel_scheme", "3.7.15.31")
  hparams.add_hparam("audio_compression", 8)
  hparams.add_hparam("moe_n1", 32)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  hparams.add_hparam("imagenet_use_2d", int(True))
  # attention-related flags
  hparams.add_hparam("attention_type", "simple")
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("sim_loss_mult", 0.0)  # Try 10.0 for experiments.
  hparams.add_hparam("attention_dropout", 0.2)
  hparams.shared_embedding_and_softmax_weights = int(True)
  return hparams
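
The dilation_scheme and kernel_scheme values are dot-separated strings with one entry per hidden layer (four layers here). If they are decoded by splitting on '.', as the format suggests, the kernel scheme above expands as follows (a sketch; the actual lookup lives in the slicenet model code):

kernel_scheme = "3.7.15.31"
kernel_sizes = [int(k) for k in kernel_scheme.split(".")]
print(kernel_sizes)  # [3, 7, 15, 31] -- one kernel size per hidden layer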
Example #13
def multimodel_base():
    """Base parameters for MultiModel."""
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 512
    hparams.batch_size = 2048
    hparams.num_hidden_layers = 4
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 4000
    hparams.initializer_gain = 1.0
    hparams.dropout = 0.1
    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    hparams.add_hparam("large_kernel_size", 15)
    hparams.add_hparam("attention_dropout", 0.1)
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("moe_n1", 30)
    hparams.add_hparam("moe_n2", 0)
    hparams.add_hparam("moe_layers", "2")
    hparams.add_hparam("moe_loss_coef", 1e-2)
    hparams.add_hparam("imagenet_use_2d", int(True))
    return hparams
Example #14
def transformer_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 512
    hparams.batch_size = 4096
    hparams.max_length = 256
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 4000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 6
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.1
    hparams.shared_embedding_and_softmax_weights = int(True)

    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    hparams.add_hparam("ffn_layer", "conv_hidden_relu")
    hparams.add_hparam("parameter_attention_key_channels", 0)
    hparams.add_hparam("parameter_attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("residual_dropout", 0.1)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("nbr_decoder_problems", 1)
    return hparams
Example #15
def neural_gpu_params1():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.num_hidden_layers = 1
    hparams.hidden_size = 256
    hparams.dropout = 0.1
    hparams.label_smoothing = 0.0
    hparams.clip_grad_norm = 10.0
    hparams.num_hidden_layers = 1
    hparams.kernel_height = 3
    hparams.kernel_width = 1
    hparams.learning_rate_decay_scheme = "exp50k"
    hparams.learning_rate = 0.02
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 0.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    return hparams
Example #16
def neural_gpu_params1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.num_hidden_layers = 1
  hparams.hidden_size = 256
  hparams.dropout = 0.1
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 10.0
  hparams.num_hidden_layers = 1
  hparams.kernel_height = 3
  hparams.kernel_width = 1
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.02
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  return hparams
Example #17
def bytenet_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 2048
    hparams.hidden_size = 768
    hparams.dropout = 0.2
    hparams.symbol_dropout = 0.2
    hparams.label_smoothing = 0.1
    hparams.clip_grad_norm = 2.0
    hparams.num_hidden_layers = 4
    hparams.kernel_height = 3
    hparams.kernel_width = 1
    hparams.learning_rate_decay_scheme = "exp50k"
    hparams.learning_rate = 0.05
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 3.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    hparams.add_hparam("num_block_repeat", 4)
    return hparams
Example #18
def xception_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 4096
  hparams.hidden_size = 768
  hparams.dropout = 0.2
  hparams.symbol_dropout = 0.2
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 8
  hparams.kernel_height = 3
  hparams.kernel_width = 3
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("imagenet_use_2d", True)
  return hparams
Example #19
def transformer_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = int(True)

  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "conv_hidden_relu")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("residual_dropout", 0.1)
  hparams.add_hparam("nbr_decoder_problems", 1)
  return hparams
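
In practice these defaults are usually not edited in place; tf.contrib.training.HParams (which basic_params1 returns in this version of the library) can override individual registered values from a comma-separated string via HParams.parse, which is how variants are typically selected at training time. A minimal sketch:

hparams = transformer_base()
# Override a few registered values from a comma-separated string.
hparams.parse("num_heads=16,hidden_size=1024,residual_dropout=0.3")
print(hparams.num_heads, hparams.hidden_size)  # 16 1024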