def evolved_transformer_base_tpu():
  """Base parameters for Evolved Transformer model on TPU."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_tpu())
  hparams.learning_rate_constant = 1 / hparams.learning_rate_warmup_steps ** 0.5
  hparams.learning_rate_schedule = (
      "constant*single_cycle_cos_decay")
  return hparams
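# The schedule above multiplies a flat constant of 1 / sqrt(warmup_steps) by a
# single-cycle cosine decay. The sketch below is illustrative only: it assumes
# a hypothetical helper and made-up step counts, and the exact tensor2tensor
# implementation of "single_cycle_cos_decay" may handle warmup differently.
import math


def single_cycle_cos_decay_lr(step, warmup_steps=16000, total_steps=250000):
  """Illustrative sketch of constant * single-cycle cosine decay."""
  constant = 1.0 / math.sqrt(warmup_steps)  # matches learning_rate_constant above
  progress = min(max(step, 0), total_steps) / float(total_steps)
  return constant * 0.5 * (1.0 + math.cos(math.pi * progress))


# Example: learning rate roughly halfway through training.
print(single_cycle_cos_decay_lr(125000))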
def afx_small():
  """Small transformer model with small batch size for fast step times."""
  hparams = transformer.transformer_tpu()
  hparams.filter_size = 1024
  hparams.num_heads = 4
  hparams.num_hidden_layers = 3
  hparams.batch_size = 512
  return hparams
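# In tensor2tensor, hparam sets like afx_small are typically exposed through
# the registry so the trainer can select them by name. A minimal sketch,
# assuming the standard registry.register_hparams decorator and a hypothetical
# derived set afx_tiny (not part of the original code):
from tensor2tensor.utils import registry


@registry.register_hparams
def afx_tiny():
  """Hypothetical, even smaller variant of afx_small for quick smoke tests."""
  hparams = afx_small()          # start from the set defined above
  hparams.num_hidden_layers = 2  # fewer layers for faster step times
  hparams.batch_size = 256       # counted in tokens for text problems
  return hparams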
def lmx_base():
  """Transformer on languagemodel_lm1b32k_packed.  50M Params."""
  hparams = transformer.transformer_tpu()
  # sharing is counterproductive when underparameterized
  hparams.shared_embedding_and_softmax_weights = False
  # we judge by log-ppl, so label smoothing hurts.
  hparams.label_smoothing = 0.0
  # This makes the batch size on GPU the same as on TPU for a packed problem
  # with sequence length 256.
  # TODO(noam): fix the mess that is the data reading pipeline.
  hparams.max_length = 256
  # larger batch since we only have a decoder
  hparams.batch_size = 4096
  # save some memory so we can have a larger model
  hparams.activation_dtype = "bfloat16"
  return hparams
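# A rough back-of-the-envelope check of the "50M Params" note in the docstring,
# assuming the transformer_base defaults inherited by transformer_tpu
# (hidden_size=512, filter_size=2048, 6 layers) and a ~32k subword vocab;
# biases and layer-norm parameters are ignored.
vocab, d_model, d_ff, layers = 32000, 512, 2048, 6
embedding = vocab * d_model        # input embedding
softmax = vocab * d_model          # separate softmax, since weight sharing is off
attention = 4 * d_model * d_model  # Q, K, V and output projections per layer
ffn = 2 * d_model * d_ff           # two feed-forward matrices per layer
total = embedding + softmax + layers * (attention + ffn)
print(total / 1e6)  # ~51.6M, consistent with the "50M Params" docstring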
def neural_assistant_base():
  """HParams for a base neural_assistant model."""
  hparams = transformer.transformer_tpu()
  hparams.add_hparam("pos_weight", 1.0)  # weight for positive triples
  # dot_product or bilinear
  hparams.add_hparam("similarity_fuction", "bilinear")
  hparams.add_hparam("pool_technique", "average")  # average, max, or last
  hparams.add_hparam("last_k", 1)  # number of last indices for averaging
  hparams.add_hparam("max_triple_length", 30)  # max length of every triple
  # max number of triples during training
  hparams.add_hparam("train_triple_num", 5000)
  # if False, the model reduces to a plain transformer
  hparams.add_hparam("attend_kb", True)
  hparams.add_hparam("kb_loss_weight", 0.0)  # weight for distant supervision
  hparams.add_hparam("test_triple_num", 28483)  # max number of KB triples
  hparams.add_hparam("margin", 0.0)  # margin for KB max-margin training loss
  # number of negative samples per example for adversarial training
  hparams.add_hparam("num_negative_samples", 1)
  # weight of the KB-selection loss, combined with the language model loss
  hparams.add_hparam("kb_train_weight", 0.0)
  return hparams
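# The hparams added above are typically overridden to build experiment
# variants. A minimal hypothetical sketch (not part of the original code):
# per the attend_kb comment, turning KB attention off leaves a plain
# transformer, which is a natural ablation baseline.
def neural_assistant_no_kb():
  """Hypothetical ablation of neural_assistant_base without the KB."""
  hparams = neural_assistant_base()
  hparams.attend_kb = False      # no attention over KB triples
  hparams.kb_loss_weight = 0.0   # no distant-supervision loss
  hparams.kb_train_weight = 0.0  # no KB-selection loss
  return hparams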