Example #1
def attention_lm_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 1000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(False)

  hparams.add_hparam("filter_size", 4096)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("residual_dropout", 0.1)
  hparams.add_hparam("pos", "timing")  # timing, none
  return hparams
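
The comment above notes that every hyperparameter whose name ends in "dropout" is set to 0.0 outside of training. A minimal sketch of that convention, using a plain dict as a hypothetical stand-in for the hparams object (the library's actual mechanism may differ):

def zero_dropout_for_eval(hparam_dict, is_training):
  """Return a copy with every *dropout rate set to 0.0 outside training."""
  if is_training:
    return dict(hparam_dict)
  return {name: (0.0 if name.endswith("dropout") else value)
          for name, value in hparam_dict.items()}

eval_hparams = zero_dropout_for_eval(
    {"residual_dropout": 0.1, "attention_dropout": 0.0, "hidden_size": 1024},
    is_training=False)
assert eval_hparams["residual_dropout"] == 0.0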
Example #2
def lstm_attention():
    """hparams for LSTM with attention."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.hidden_size = 128
    hparams.num_hidden_layers = 2

    # Attention
    hparams.add_hparam("attn_vec_size", hparams.hidden_size)
    return hparams
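
In practice these base functions are extended by calling them and overriding a few fields. A short sketch of that pattern with a hypothetical variant name (not part of the original listing):

def lstm_attention_wide():
    """Hypothetical variant: same recipe as lstm_attention, larger sizes."""
    hparams = lstm_attention()
    hparams.hidden_size = 256
    hparams.attn_vec_size = 256  # was initialized from the old hidden_size
    return hparams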
Example #3
def attention_lm_moe_base():
    """Set of hyperparameters.

  suitable for 1 gpu.
  on lm1b_16k:
     ~337M params
     1.1 steps/sec on  [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 1024
    hparams.batch_size = 8192
    hparams.max_length = 256
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 1000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 4
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(False)
    hparams.add_hparam("filter_size", 2948)  # Add new ones like this.
    # comma-separated list of layer numbers.
    # At each of these layers, we replace the ffn with a mixture of experts.
    hparams.add_hparam("moe_layers", "2")
    # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
    # If moe_n2 is an integer, then use a hierarchical MoE
    #   consisting of moe_n1 groups of moe_n2 experts each.
    hparams.add_hparam("moe_n1", 64)
    hparams.add_hparam("moe_n2", 0)
    hparams.add_hparam("moe_hidden_size", 2048)
    hparams.add_hparam("moe_loss_coef", 1e-2)
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("residual_dropout", 0.1)
    hparams.add_hparam("pos", "timing")  # timing, none
    return hparams
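
Per the comments on moe_n1 and moe_n2 above, the expert count is flat when moe_n2 is unset and hierarchical (groups of experts) otherwise. A small illustrative helper, not taken from the library:

def total_experts(moe_n1, moe_n2):
    # Flat MoE (moe_n2 is None or 0): moe_n1 experts.
    # Hierarchical MoE: moe_n1 groups of moe_n2 experts each.
    return moe_n1 if not moe_n2 else moe_n1 * moe_n2

assert total_experts(64, 0) == 64      # this config: flat MoE with 64 experts
assert total_experts(64, 16) == 1024   # a hierarchical alternative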
Example #4
def slicenet_params1():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.hidden_size = 768
    hparams.dropout = 0.5
    hparams.symbol_dropout = 0.2
    hparams.label_smoothing = 0.1
    hparams.clip_grad_norm = 2.0
    hparams.num_hidden_layers = 4
    hparams.kernel_height = 3
    hparams.kernel_width = 1
    hparams.add_hparam("normalizer_fn",
                       "layer")  # New ones are added like this.
    hparams.learning_rate_decay_scheme = "exp50k"
    hparams.learning_rate = 0.05
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 3.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    hparams.add_hparam("large_kernel_size", 15)
    hparams.add_hparam("separability", -2)
    # A dilation scheme, one of _DILATION_SCHEMES.
    hparams.add_hparam("dilation_scheme", "1.1.1.1")
    # A kernel scheme, one of _KERNEL_SCHEMES; overrides large_kernel_size.
    hparams.add_hparam("kernel_scheme", "3.7.15.31")
    hparams.add_hparam("audio_compression", 8)
    hparams.add_hparam("moe_n1", 32)
    hparams.add_hparam("moe_n2", 0)
    hparams.add_hparam("moe_loss_coef", 1e-2)
    hparams.add_hparam("imagenet_use_2d", int(True))
    # attention-related flags
    hparams.add_hparam("attention_type", "simple")
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    hparams.add_hparam("sim_loss_mult", 0.0)  # Try 10.0 for experiments.
    hparams.add_hparam("attention_dropout", 0.2)
    hparams.shared_embedding_and_softmax_weights = int(True)
    return hparams
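
The dot-separated kernel_scheme and dilation_scheme strings above read as one value per hidden layer. A hypothetical parsing helper for illustration (the library's handling of _KERNEL_SCHEMES and _DILATION_SCHEMES may differ):

def parse_scheme(scheme):
    # "3.7.15.31" -> [3, 7, 15, 31]: one value per hidden layer.
    return [int(x) for x in scheme.split(".")]

assert parse_scheme("3.7.15.31") == [3, 7, 15, 31]   # kernel_scheme above
assert parse_scheme("1.1.1.1") == [1, 1, 1, 1]       # dilation_scheme above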
Example #5
def multimodel_base():
  """Base parameters for MultiModel."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.batch_size = 2048
  hparams.num_hidden_layers = 4
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.dropout = 0.1
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  hparams.add_hparam("large_kernel_size", 15)
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("moe_n1", 30)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_layers", "2")
  hparams.add_hparam("moe_loss_coef", 1e-2)
  hparams.add_hparam("imagenet_use_2d", int(True))
  return hparams
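
Examples 1, 3, and 5 all set learning_rate_decay_scheme = "noam". A sketch of that schedule as described in the Transformer paper, using this config's hidden_size and warmup_steps as defaults (in the library, hparams.learning_rate typically acts as an extra multiplier and the constants may differ):

def noam_learning_rate(step, hidden_size=512, warmup_steps=4000):
  # Linear warmup for warmup_steps steps, then decay proportional to
  # step**-0.5, scaled by hidden_size**-0.5 (Vaswani et al., 2017).
  step = max(step, 1)
  return hidden_size ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

# Rises during warmup, peaks near step == warmup_steps, then decays.
print(noam_learning_rate(1000), noam_learning_rate(4000), noam_learning_rate(100000))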
Example #6
def neural_gpu_params1():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.num_hidden_layers = 1
    hparams.hidden_size = 256
    hparams.dropout = 0.1
    hparams.label_smoothing = 0.0
    hparams.clip_grad_norm = 10.0
    hparams.kernel_height = 3
    hparams.kernel_width = 1
    hparams.learning_rate_decay_scheme = "exp50k"
    hparams.learning_rate = 0.02
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 0.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    return hparams
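
sampling_method = "argmax" corresponds to greedy decoding, i.e. always taking the highest-scoring class instead of sampling from the predicted distribution. A minimal illustration in plain Python (not the library's decoding code):

import math, random

def choose_symbol(logits, sampling_method="argmax"):
    if sampling_method == "argmax":
        # Greedy decoding: always take the highest-scoring class.
        return max(range(len(logits)), key=lambda i: logits[i])
    # Otherwise sample from the softmax distribution over the logits.
    exps = [math.exp(l - max(logits)) for l in logits]
    total = sum(exps)
    return random.choices(range(len(logits)), weights=[e / total for e in exps])[0]

assert choose_symbol([0.1, 2.5, -1.0]) == 1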
Example #7
def xception_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 4096
    hparams.hidden_size = 768
    hparams.dropout = 0.2
    hparams.symbol_dropout = 0.2
    hparams.label_smoothing = 0.1
    hparams.clip_grad_norm = 2.0
    hparams.num_hidden_layers = 8
    hparams.kernel_height = 3
    hparams.kernel_width = 3
    hparams.learning_rate_decay_scheme = "exp50k"
    hparams.learning_rate = 0.05
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 3.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    hparams.add_hparam("imagenet_use_2d", True)
    return hparams
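
label_smoothing = 0.1 here (and in Example 4) spreads a small amount of probability mass away from the true class. A minimal sketch of one common formulation (the library may distribute the mass over non-target classes slightly differently):

def smoothed_targets(true_class, num_classes, label_smoothing=0.1):
    # Mix the one-hot target with a uniform distribution:
    # (1 - eps) * one_hot + eps / num_classes on every class.
    off_value = label_smoothing / num_classes
    targets = [off_value] * num_classes
    targets[true_class] += 1.0 - label_smoothing
    return targets

row = smoothed_targets(true_class=2, num_classes=4)
assert abs(sum(row) - 1.0) < 1e-9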