Example #1
 def testNeuralGPU(self):
   hparams = common_hparams.basic_params1()
   batch_size = 3
   input_length = 5
   target_length = input_length
   input_vocab_size = 9
   target_vocab_size = 11
   p_hparams = problem_hparams.test_problem_hparams(input_vocab_size,
                                                    target_vocab_size)
   inputs = -1 + np.random.random_integers(
       input_vocab_size, size=(batch_size, input_length, 1, 1))
   targets = -1 + np.random.random_integers(
       target_vocab_size, size=(batch_size, target_length, 1, 1))
   with self.test_session() as session:
     features = {
         "inputs": tf.constant(inputs, dtype=tf.int32),
         "targets": tf.constant(targets, dtype=tf.int32)
     }
     model = neural_gpu.NeuralGPU(hparams, tf.estimator.ModeKeys.TRAIN,
                                  p_hparams)
     logits, _ = model(features)
     session.run(tf.global_variables_initializer())
     res = session.run(logits)
   self.assertEqual(res.shape, (batch_size, target_length, 1, 1,
                                target_vocab_size))
Example #2
def autoencoder_basic():
  """Basic autoencoder model."""
  hparams = common_hparams.basic_params1()
  hparams.optimizer = "Adam"
  hparams.learning_rate_constant = 0.0002
  hparams.learning_rate_warmup_steps = 500
  hparams.learning_rate_schedule = "constant * linear_warmup"
  hparams.label_smoothing = 0.0
  hparams.batch_size = 128
  hparams.hidden_size = 64
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.kernel_height = 4
  hparams.kernel_width = 4
  hparams.dropout = 0.1
  hparams.add_hparam("max_hidden_size", 1024)
  hparams.add_hparam("bottleneck_bits", 128)
  hparams.add_hparam("bottleneck_noise", 0.1)
  hparams.add_hparam("bottleneck_warmup_steps", 3000)
  hparams.add_hparam("bottleneck_max_prob", 1.0)
  hparams.add_hparam("sample_height", 32)
  hparams.add_hparam("sample_width", 32)
  hparams.add_hparam("discriminator_batchnorm", True)
  hparams.add_hparam("num_sliced_vecs", 4096)
  hparams.add_hparam("gan_loss_factor", 0.0)
  return hparams
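
All of these snippets lean on the same small HParams API: keys that already exist on the object returned by common_hparams.basic_params1() (or by a function such as autoencoder_basic above) are overridden with plain attribute assignment, while genuinely new keys must go through add_hparam, which raises if the name is already taken. A minimal sketch of that distinction, assuming the tf.contrib.training.HParams-style object that basic_params1() returns; "my_extra_flag" is a made-up name:

hp = autoencoder_basic()

# Existing keys (from basic_params1 or autoencoder_basic) are plain attributes.
hp.hidden_size = 128
hp.dropout = 0.2

# New keys must be introduced with add_hparam().
hp.add_hparam("my_extra_flag", True)

# Re-adding an existing key raises; update it by name with set_hparam() instead.
try:
  hp.add_hparam("bottleneck_bits", 64)
except ValueError:
  hp.set_hparam("bottleneck_bits", 64)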
Example #3
def ppo_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.learning_rate = 1e-4
  hparams.add_hparam("init_mean_factor", 0.1)
  hparams.add_hparam("init_logstd", 0.1)
  hparams.add_hparam("policy_layers", (100, 100))
  hparams.add_hparam("value_layers", (100, 100))
  hparams.add_hparam("clipping_coef", 0.2)
  hparams.add_hparam("gae_gamma", 0.99)
  hparams.add_hparam("gae_lambda", 0.95)
  hparams.add_hparam("entropy_loss_coef", 0.01)
  hparams.add_hparam("value_loss_coef", 1)
  hparams.add_hparam("optimization_epochs", 15)
  hparams.add_hparam("epoch_length", 200)
  hparams.add_hparam("epochs_num", 2000)
  hparams.add_hparam("eval_every_epochs", 10)
  hparams.add_hparam("save_models_every_epochs", 30)
  hparams.add_hparam("optimization_batch_size", 50)
  hparams.add_hparam("max_gradients_norm", 0.5)
  hparams.add_hparam("intrinsic_reward_scale", 0.)
  hparams.add_hparam("logits_clip", 0.0)
  hparams.add_hparam("dropout_ppo", 0.1)
  hparams.add_hparam("effective_num_agents", None)
  return hparams
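
A pattern that recurs throughout this listing is layering: a variant hparams function starts from an existing set such as ppo_base_v1() and reassigns a handful of values instead of rebuilding everything from basic_params1(). A hypothetical example in that style; the name ppo_small_sketch and the values below are illustrative, not taken from the source:

def ppo_small_sketch():
  """Hypothetical smaller PPO config derived from ppo_base_v1() above."""
  hparams = ppo_base_v1()
  # Keys added with add_hparam() become plain attributes, so variants just reassign them.
  hparams.learning_rate = 3e-4
  hparams.policy_layers = (64, 64)
  hparams.value_layers = (64, 64)
  hparams.epoch_length = 100
  return hparams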
Example #4
def ppo_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.learning_rate = 1e-4
  hparams.add_hparam("init_mean_factor", 0.1)
  hparams.add_hparam("init_logstd", 0.1)
  hparams.add_hparam("policy_layers", (100, 100))
  hparams.add_hparam("value_layers", (100, 100))
  hparams.add_hparam("num_agents", 30)
  hparams.add_hparam("clipping_coef", 0.2)
  hparams.add_hparam("gae_gamma", 0.99)
  hparams.add_hparam("gae_lambda", 0.95)
  hparams.add_hparam("entropy_loss_coef", 0.01)
  hparams.add_hparam("value_loss_coef", 1)
  hparams.add_hparam("optimization_epochs", 15)
  hparams.add_hparam("epoch_length", 200)
  hparams.add_hparam("epochs_num", 2000)
  hparams.add_hparam("eval_every_epochs", 10)
  hparams.add_hparam("num_eval_agents", 3)
  hparams.add_hparam("video_during_eval", False)
  hparams.add_hparam("save_models_every_epochs", 30)
  hparams.add_hparam("optimization_batch_size", 50)
  hparams.add_hparam("max_gradients_norm", 0.5)
  hparams.add_hparam("simulated_environment", False)
  hparams.add_hparam("simulation_random_starts", False)
  hparams.add_hparam("intrinsic_reward_scale", 0.)
  return hparams
Example #5
def revnet_base():
  """Default hparams for Revnet."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam('num_channels', [64, 128, 256, 416])
  hparams.add_hparam('num_layers_per_block', [1, 1, 10, 1])
  hparams.add_hparam('bottleneck', True)
  hparams.add_hparam('first_batch_norm', [False, True, True, True])
  hparams.add_hparam('init_stride', 2)
  hparams.add_hparam('init_kernel_size', 7)
  hparams.add_hparam('init_maxpool', True)
  hparams.add_hparam('strides', [1, 2, 2, 2])
  hparams.add_hparam('num_channels_init_block', 64)
  hparams.add_hparam('dim', '2d')

  # Variable init
  hparams.initializer = 'normal_unit_scaling'
  hparams.initializer_gain = 2.

  # Optimization
  hparams.optimizer = 'Momentum'
  hparams.optimizer_momentum_momentum = 0.9
  hparams.optimizer_momentum_nesterov = True
  hparams.weight_decay = 1e-4
  hparams.clip_grad_norm = 0.0
  # learning_rate = base_lr (0.1) * effective batch size (128 * 8 TPU shards or GPUs = 1024) / 256.
  hparams.learning_rate = 0.4
  hparams.learning_rate_decay_scheme = 'cosine'
  # For image_imagenet224, 120k training steps, which effectively makes this a
  # cosine decay (i.e. no cycles).
  hparams.learning_rate_cosine_cycle_steps = 120000

  # Can run with a batch size of 128 with Problem ImageImagenet224
  hparams.batch_size = 128
  return hparams
Example #6
def attention_lm_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = False

  hparams.add_hparam("filter_size", 4096)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("encoder_full_attention", False)
  return hparams
Example #7
def resnet_base():
  """Set of hyperparameters."""
  # For imagenet on TPU:
  # Set train_steps=120000
  # Set eval_steps=48

  # Base
  hparams = common_hparams.basic_params1()

  # Model-specific parameters
  hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
  hparams.add_hparam("filter_sizes", [64, 64, 128, 256, 512])
  hparams.add_hparam("block_fn", "bottleneck")
  hparams.add_hparam("use_nchw", True)

  # Variable init
  hparams.initializer = "normal_unit_scaling"
  hparams.initializer_gain = 2.

  # Optimization
  hparams.optimizer = "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.optimizer_momentum_nesterov = True
  hparams.weight_decay = 1e-4
  hparams.clip_grad_norm = 0.0
  # learning_rate = base_lr (0.1) * effective batch size (128 * 8 TPU shards or GPUs = 1024) / 256.
  hparams.learning_rate = 0.4
  hparams.learning_rate_decay_scheme = "cosine"
  # For image_imagenet224, 120k training steps, which effectively makes this a
  # cosine decay (i.e. no cycles).
  hparams.learning_rate_cosine_cycle_steps = 120000

  hparams.batch_size = 128
  return hparams
Example #8
def bluenet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 4096
  hparams.hidden_size = 256
  hparams.dropout = 0.2
  hparams.symbol_dropout = 0.5
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 8
  hparams.kernel_height = 3
  hparams.kernel_width = 3
  hparams.learning_rate_decay_scheme = "exp10k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("anneal_until", 40000)
  hparams.add_hparam("batch_deviation_loss_factor", 5.0)
  return hparams
Example #9
 def testSymbolModalityTargetsFactored(self):
   batch_size = 10
   num_datashards = 5
   length = 6
   height = 7
   hidden_size = 9
   vocab_size = 11
   model_hparams = common_hparams.basic_params1()
   model_hparams.factored_logits = True
   model_hparams.hidden_size = hidden_size
   model_hparams.mode = tf.estimator.ModeKeys.TRAIN
   body_output = -1 + np.random.random_integers(
       100, size=(batch_size, length, height, hidden_size))
   targets = -1 + np.random.random_integers(
       vocab_size, size=(batch_size, length, height, 1))
   m = modalities.SymbolModality(model_hparams, vocab_size)
   data_parallelism = expert_utils.Parallelism(
       ["/device:CPU:0"] * num_datashards)
   with self.test_session() as session:
     sharded_body_output = tf.split(tf.to_float(body_output), num_datashards)
     sharded_targets = tf.split(targets, num_datashards)
     sharded_logits = m.top_sharded(sharded_body_output, sharded_targets,
                                    data_parallelism)
     train_loss = m.loss_sharded(sharded_logits, sharded_targets,
                                 data_parallelism)
     logits = tf.concat(sharded_logits, 0)
     session.run(tf.global_variables_initializer())
     res1, res2 = session.run((logits, train_loss))
   self.assertEqual(res1.shape, (batch_size, length, height, 1, vocab_size))
   self.assertEqual(res2.shape, ())
Example #10
 def testSymbolTupleModalityInputs(self):
   """Adapted from tensor2tensor/layers/modalities_test.py."""
   batch_size = 10
   num_datashards = 5
   length = 5
   vocab_size = [2000, 500, 2500]
   hidden_size = 9
   model_hparams = common_hparams.basic_params1()
   model_hparams.hidden_size = hidden_size
   model_hparams.mode = tf.estimator.ModeKeys.TRAIN
   x = np.stack([
       -1 + np.random.random_integers(
           vocab_size[i], size=(batch_size, length, 1))
       for i in range(len(vocab_size))
   ], axis=3)
   m = modalities.SymbolTupleModality(model_hparams, vocab_size)
   data_parallelism = expert_utils.Parallelism(
       ['/device:CPU:0'] * num_datashards)
   with self.test_session() as session:
     xs = tf.split(x, num_datashards)
     sharded_output = m.bottom_sharded(xs, data_parallelism)
     output = tf.concat(sharded_output, 0)
     session.run(tf.global_variables_initializer())
     res = session.run(output)
   self.assertEqual(res.shape, (batch_size, length, 1, hidden_size))
Example #11
def shakeshake_cifar10():
  """Parameters for CIFAR-10."""
  tf.logging.warning("shakeshake_cifar10 hparams have not been verified to "
                     "achieve good performance.")
  hparams = common_hparams.basic_params1()
  # This leads to effective batch size 128 when number of GPUs is 1
  hparams.batch_size = 4096 * 8
  hparams.hidden_size = 16
  hparams.dropout = 0
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 26
  hparams.kernel_height = -1  # Unused
  hparams.kernel_width = -1  # Unused
  hparams.learning_rate_decay_scheme = "cosine"
  # Model should be run for 700000 steps with batch size 128 (~1800 epochs)
  hparams.learning_rate_cosine_cycle_steps = 700000
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  # TODO(rshin): Adjust so that effective value becomes ~1e-4
  hparams.weight_decay = 3.0
  hparams.optimizer = "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.add_hparam("base_filters", 16)
  hparams.add_hparam("shakeshake_type", "batch")
  return hparams
Example #12
def next_frame_base():
  """Common HParams for next_frame models."""
  hparams = common_hparams.basic_params1()
  # Loss cutoff.
  hparams.add_hparam("video_modality_loss_cutoff", 0.01)
  # Additional resizing of the frames before feeding them to the model.
  hparams.add_hparam("preprocess_resize_frames", None)
  # How many data points to shuffle. Ideally this should be part of the problem, not the model!
  hparams.add_hparam("shuffle_buffer_size", 128)
  # Tiny mode. For faster tests.
  hparams.add_hparam("tiny_mode", False)
  # In case a model supports a smaller/faster version.
  hparams.add_hparam("small_mode", False)
  # In case a model has a stochastic version.
  hparams.add_hparam("stochastic_model", False)
  # Internal loss for recurrent models.
  hparams.add_hparam("internal_loss", True)
  # choose from: concat, multiplicative, multi_additive
  hparams.add_hparam("action_injection", "multi_additive")
  # Scheduled sampling method. Choose between
  # ground_truth_only, prediction_only, prob, count, prob_inverse_exp.
  hparams.add_hparam("scheduled_sampling_mode", "prediction_only")
  hparams.add_hparam("scheduled_sampling_decay_steps", 10000)
  hparams.add_hparam("scheduled_sampling_max_prob", 1.0)
  hparams.add_hparam("scheduled_sampling_k", 900.0)
  return hparams
Example #13
def my_very_own_hparams():
  # Start with the base set
  hp = common_hparams.basic_params1()
  # Modify existing hparams
  hp.num_hidden_layers = 2
  # Add new hparams
  hp.add_hparam("filter_size", 2048)
  return hp
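
In tensor2tensor, an hparams function like my_very_own_hparams() is normally exposed through the registry so it can be selected by name (e.g. with the --hparams_set flag). A minimal sketch of that wiring, assuming the usual tensor2tensor.utils.registry and tensor2tensor.utils.trainer_lib imports; exact module paths and the registered name vary by T2T version:

from tensor2tensor.utils import registry
from tensor2tensor.utils import trainer_lib


@registry.register_hparams
def my_very_own_hparams_registered():
  # Same content as my_very_own_hparams() above, wrapped so the registry can find it.
  return my_very_own_hparams()


# Later, build the set by name, optionally with comma-separated string overrides.
hparams = trainer_lib.create_hparams(
    "my_very_own_hparams_registered",
    hparams_overrides_str="num_hidden_layers=4,filter_size=1024")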
Example #14
def mtf_transformer2_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()

  hparams.add_hparam("d_model", 1024)
  hparams.batch_size = 4
  hparams.max_length = 1024
  hparams.label_smoothing = 0.0
  # a small positive value - this seems important for stability when training
  # with bfloat16 activations.
  hparams.add_hparam("z_loss", 1e-4)

  # These hyperparameters are used in default_layer_stack()
  # They may not be respected if hparams uses a different layer stack function.
  hparams.num_hidden_layers = 6
  hparams.add_hparam("d_ff", 2048)
  hparams.add_hparam("d_kv", 128)
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.layer_prepostprocess_dropout = 0.0

  # round up vocab sizes to be a multiple of this value
  hparams.vocab_divisor = 128

  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay*linear_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.add_hparam("master_dtype", "bfloat16")
  hparams.add_hparam("slice_dtype", "float32")
  hparams.activation_dtype = "bfloat16"

  # 8-way model-parallelism
  hparams.add_hparam("mesh_shape", "model:8")
  hparams.add_hparam("layout", "batch:batch;vocab:model;d_ff:model;heads:model")

  # If nonzero, we split the batch across two tensor-dimensions named
  # "outer_batch" and "inner_batch", allowing for splitting across two mesh
  # dimensions.  This is necessary for hierarchical mixture of experts.
  # The two tensor dimensions have sizes hparams.outer_batch_size and
  # hparams.batch_size // hparams.outer_batch_size.
  hparams.add_hparam("outer_batch_size", 0)

  hparams.shared_embedding_and_softmax_weights = False
  # length for training or decoding - defaults to max_length
  hparams.add_hparam("length", 0)

  # These parameters make Transformer model compatible with mtf
  # Do not override these.
  hparams.no_data_parallelism = True
  hparams.use_fixed_batch_size = True
  hparams.add_hparam("mtf_mode", True)
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.modality = {
      "inputs": modalities.IdentitySymbolModality,
      "targets": modalities.IdentitySymbolModality,
  }
  return hparams
Example #15
def lstm2():
    """Hparams for minimal example, copied from T2T LSTM hparams."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.hidden_size = 128
    hparams.num_hidden_layers = 2
    # uncomment this line to fix things
    # hparams.initializer = "uniform_unit_scaling"
    return hparams
Example #16
def glow_hparams():
    """Glow Hparams."""
    hparams = common_hparams.basic_params1()
    hparams.add_hparam("n_levels", 3)
    hparams.add_hparam("n_bits_x", 8)
    hparams.add_hparam("depth", 32)
    hparams.add_hparam("affine_coupling_width", 512)
    hparams.add_hparam("learn_prior", True)
    return hparams
Example #17
def resnet_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
    hparams.add_hparam("use_nchw", True)
    hparams.add_hparam("num_filters", [64, 128, 256, 512])
    hparams.add_hparam("strides", [1, 2, 2, 2])
    hparams.tpu_batch_size_per_shard = 48
    return hparams
Example #18
def lstm_literature_base():
    """Set of base hyperparameters for LSTM from Jozefowicz et al."""
    hparams = common_hparams.basic_params1()
    hparams.clip_grad_norm = 1.0
    hparams.label_smoothing = 0.0
    hparams.batch_size = 2048
    hparams.optimizer = "Adagrad"
    hparams.learning_rate = 0.2
    return hparams
Example #19
def transformer_moe_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.norm_type = "layer"
    hparams.hidden_size = 512
    hparams.batch_size = 4096
    hparams.max_length = 2001
    hparams.max_input_seq_length = 2000
    hparams.max_target_seq_length = 2000
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 4000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 5
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(True)
    # According to noam, ("n", "da") seems better for harder-to-learn models
    hparams.layer_preprocess_sequence = "n"
    hparams.layer_postprocess_sequence = "da"

    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    hparams.add_hparam("ffn_layer", "conv_hidden_relu")
    # Other attention types params
    hparams.add_hparam("attention_loc_block_length", 256)
    hparams.add_hparam("attention_red_factor", 3)
    hparams.add_hparam("attention_red_type", "conv")
    hparams.add_hparam("attention_red_nonlinearity", "none")
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("nbr_decoder_problems", 1)
    hparams.add_hparam("proximity_bias", int(False))

    # Decoder layer types. If set, the num_decoder_layers parameter will be ignored
    # and the number of decoder layers will be deduced from the string.
    # See top file comment for example of usage
    hparams.add_hparam("layer_types", "")
    # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
    # sep, moe,...)
    hparams.add_hparam("default_att", "a")
    hparams.add_hparam("default_ff", "fc")

    return hparams
Example #20
def mtf_transformer_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.no_data_parallelism = True
    hparams.use_fixed_batch_size = True
    hparams.add_hparam("mtf_mode", True)
    hparams.batch_size = 64
    hparams.max_length = 256
    hparams.add_hparam("d_model", 512)
    hparams.add_hparam("d_kv", 128)
    hparams.label_smoothing = 0.1
    # 8-way model-parallelism
    hparams.add_hparam("mesh_shape", "model:8")
    hparams.add_hparam("layout",
                       "batch:batch;vocab:model;d_ff:model;heads:model")
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("d_ff", 2048)
    hparams.add_hparam("num_encoder_layers", 6)
    hparams.add_hparam("num_decoder_layers", 6)
    hparams.add_hparam("attention_dropout", 0.1)
    hparams.add_hparam("relu_dropout", 0.1)
    hparams.layer_prepostprocess_dropout = 0.1

    # round up vocab sizes to be a multiple of this value
    hparams.vocab_divisor = 128

    # mixture of experts hparams
    hparams.add_hparam("feedforward_layer", "dense_relu_dense")
    hparams.add_hparam("moe_overhead_train", 1.0)
    hparams.add_hparam("moe_overhead_eval", 2.0)
    hparams.moe_num_experts = 16
    hparams.moe_loss_coef = 1e-3

    # Use targets_embedding_var * rsqrt(d_model) as softmax_var
    hparams.shared_embedding_and_softmax_weights = True
    # Reuse targets_embedding_var as inputs_embedding_var
    hparams.shared_embedding = True
    hparams.optimizer = "Adafactor"
    hparams.learning_rate_schedule = "linear_warmup*rsqrt_decay*linear_decay"
    hparams.learning_rate_warmup_steps = 10000
    hparams.activation_dtype = "float32"

    # These parameters make Transformer model compatible with MtfTransformer
    # Do not override these, as mtf_transformer does not support other options.
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.target_modality = "symbol:identity"
    hparams.input_modalities = "inputs:symbol:identity"

    # Parameters for computing the maximum decode length in beam search.
    # Maximum decode length is:
    #    min(max_length,
    #        decode_length_multiplier * input_length + decode_length_constant)
    hparams.add_hparam("decode_length_multiplier", 1.5)
    hparams.add_hparam("decode_length_constant", 10.0)

    return hparams
Example #21
def long_answer_base():
  """Set of hyperparameters.

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 8192
  hparams.dropout = 0.0
  hparams.batching_mantissa_bits = 3
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 1000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 4
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.sampling_method = "random"
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # comma-separated list of layer numbers.
  # At each of these layers, we replace the ffn with a mixture of experts.
  hparams.add_hparam("moe_layers", "2")
  # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
  # If moe_n2 is an integer, then use a hierarchical MoE
  #   consisting of moe_n1 groups of moe_n2 experts each.
  hparams.add_hparam("moe_n1", 64)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_hidden_size", 2048)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("residual_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("block_length", 512)
  hparams.add_hparam("answer_length_prob_train", 0.5)
  hparams.add_hparam("answer_length_infer", 1000)
  # We cannot handle long sequences at this point, so drop them during eval.
  # This affects evaluation metrics.
  # TODO(noam): find a different workaround
  hparams.eval_drop_long_sequences = int(True)
  return hparams
Example #22
def lstm_attention():
    """hparams for LSTM with attention."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.hidden_size = 128
    hparams.num_hidden_layers = 2

    # Attention
    hparams.add_hparam("attn_vec_size", hparams.hidden_size)
    return hparams
Example #23
def lstm_seq2seq():
    """hparams for LSTM."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.hidden_size = 128
    hparams.num_hidden_layers = 2
    hparams.initializer = "uniform_unit_scaling"
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 0.0
    return hparams
Example #24
def transformer_moe_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.norm_type = "layer"
    hparams.hidden_size = 512
    hparams.batch_size = 4096
    hparams.max_length = 2001
    hparams.max_input_seq_length = 2000
    hparams.max_target_seq_length = 2000
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 4000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 5
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(True)

    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    hparams.add_hparam("ffn_layer", "conv_hidden_relu")
    hparams.add_hparam("parameter_attention_key_channels", 0)
    hparams.add_hparam("parameter_attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("residual_dropout", 0.1)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("nbr_decoder_problems", 1)
    hparams.add_hparam("proximity_bias", int(False))
    # FLAGS RELATED TO MIXTURE-OF-EXPERTS
    # comma-separated list of layer numbers.
    # At each of these layers, we replace the ffn with a mixture of experts.
    hparams.add_hparam("moe_layers_encoder", "2")
    hparams.add_hparam("moe_layers_decoder", "2")
    # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
    # If moe_n2 is an integer, then use a hierarchical MoE
    #   consisting of moe_n1 groups of moe_n2 experts each.
    hparams.add_hparam("moe_n1", 32)
    hparams.add_hparam("moe_n2", 0)
    hparams.add_hparam("moe_hidden_size", 2048)
    hparams.add_hparam("moe_loss_coef", 1e-2)
    return hparams
Example #25
def resnet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
  hparams.add_hparam("use_nchw", True)
  hparams.add_hparam("num_filters", [64, 128, 256, 512])
  hparams.add_hparam("strides", [1, 2, 2, 2])

  # Can run with a batch size of 128 with Problem ImageImagenet224
  hparams.tpu_batch_size_per_shard = 128
  return hparams
Example #26
def mtf_resnet_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.no_data_parallelism = True
    hparams.use_fixed_batch_size = True
    hparams.batch_size = 32
    hparams.max_length = 3072
    hparams.hidden_size = 256
    hparams.label_smoothing = 0.0
    # 8-way model-parallelism
    hparams.add_hparam("mesh_shape", "batch:8")
    hparams.add_hparam("layout", "batch:batch")
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("filter_size", 1024)

    hparams.add_hparam("num_layers", 6)
    hparams.add_hparam("attention_key_size", 256)
    hparams.add_hparam("attention_value_size", 256)
    # Share weights between input and target embeddings
    hparams.shared_embedding = True

    # mixture of experts hparams
    hparams.add_hparam("ffn_layer", "dense_relu_dense")
    hparams.add_hparam("moe_overhead_train", 1.0)
    hparams.add_hparam("moe_overhead_eval", 2.0)
    hparams.moe_num_experts = 16
    hparams.moe_loss_coef = 1e-3

    hparams.shared_embedding_and_softmax_weights = True
    hparams.optimizer = "Adafactor"
    hparams.learning_rate_schedule = "rsqrt_decay"
    hparams.learning_rate_warmup_steps = 10000
    hparams.add_hparam("d_kv", 32)

    # Image related hparams
    hparams.add_hparam("img_len", 32)
    hparams.add_hparam("num_channels", 3)
    hparams.add_hparam("row_blocks", 1)
    hparams.add_hparam("col_blocks", 1)
    hparams.add_hparam("rows_size", 32)
    hparams.add_hparam("cols_size", 32)

    # Model-specific parameters
    hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
    hparams.add_hparam("filter_sizes", [64, 64, 128, 256, 512])
    hparams.add_hparam("is_cifar", False)

    # Variable init
    hparams.initializer = "normal_unit_scaling"
    hparams.initializer_gain = 2.

    # TODO(nikip): Change optimization scheme?
    hparams.learning_rate = 0.4
    return hparams
Example #27
def attention_lm_moe_base():
    """Set of hyperparameters.

  suitable for 1 gpu.
  on lm1b_32k:
     ~229M params
     0.9 steps/sec on  [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 1024
    hparams.batch_size = 8192
    hparams.max_length = 256
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 2000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 4
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(False)
    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    hparams.moe_num_experts = 32
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("moe_layers",
                       "2")  # comma separated list of layer numbers
    # moe params. local attention moe.
    hparams.add_hparam("attention_type", AttentionType.MULTIHEAD)
    hparams.add_hparam("attention_num_experts", 16)
    # Key, query and value dimensions for the attention
    hparams.add_hparam("attention_kq_size", 128)
    hparams.add_hparam("attention_v_size", 256)
    # Loss coef for load balancing
    hparams.add_hparam("attention_load_balance", 2e-2)
    hparams.add_hparam("diet_experts", int(False))
    hparams.add_hparam("memory_efficient_ffn", int(False))
    return hparams
Example #28
def lstm_seq2seq():
  """hparams for LSTM."""
  hparams = common_hparams.basic_params1()
  hparams.daisy_chain_variables = False
  hparams.batch_size = 1024
  hparams.hidden_size = 128
  hparams.num_hidden_layers = 2
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  return hparams
Example #29
def resnet_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
    hparams.add_hparam("use_nchw", True)
    hparams.add_hparam("num_filters", [64, 128, 256, 512])
    hparams.add_hparam("strides", [1, 2, 2, 2])

    # Can run with a batch size of 128 with Problem ImageImagenet224
    hparams.tpu_batch_size_per_shard = 128
    return hparams
Example #30
def transformer_layerbylayer_default():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  #hparams.batch_size = 4096
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  #hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.clip_grad_norm = 5.  # clip gradients to norm 5
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  #hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.shared_embedding_and_softmax_weights = int(False)

  # Add new ones like this.
  hparams.add_hparam("filter_size", 2048)
  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 0)
  # Attention-related flags.
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "conv_hidden_relu")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", int(False))
  hparams.add_hparam("use_pad_remover", int(True))
  hparams.add_hparam("self_attention_type", "dot_product")
  hparams.add_hparam("max_relative_position", 0)

  # Layerbylayer defaults
  hparams.add_hparam("target_root_attention", "pop")
  hparams.add_hparam("use_loss_mask", int(True))
  hparams.add_hparam("target_root_input", "each") # 'each', 'first', 'last'
  return hparams
Example #31
def vanilla_gan():
  """Basic parameters for a vanilla_gan."""
  hparams = common_hparams.basic_params1()

  hparams.batch_size = 32
  hparams.label_smoothing = 0.0
  hparams.add_hparam("hidden_dim", 128)
  hparams.add_hparam("random_sample_size", 100)
  hparams.add_hparam("height", 28)
  hparams.add_hparam("width", 28)
  hparams.add_hparam("epsilon", 1e-4)
  return hparams
Example #32
def basic_fc_small():
    """Small fully connected model."""
    hparams = common_hparams.basic_params1()
    hparams.learning_rate = 0.1
    hparams.batch_size = 128
    hparams.hidden_size = 256
    hparams.num_hidden_layers = 2
    hparams.initializer = "uniform_unit_scaling"
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 0.0
    hparams.dropout = 0.0
    return hparams
Example #33
def gene_expression_conv_base():
    """Hparams for GeneExpressionConv model."""
    hparams = common_hparams.basic_params1()
    hparams.add_hparam("num_conv_layers", 4)
    hparams.add_hparam("num_dconv_layers", 7)
    hparams.add_hparam("pooling_windows", [2, 4, 4, 4])

    # TODO(rsepassi): Correct the values of these hyperparameters
    hparams.hidden_size = 128
    hparams.kernel_width = 128
    hparams.add_hparam("stride", 1)
    return hparams
Example #34
def hparams_set_up(problem_name,
                   data_dir,
                   hparam_set=None,
                   hparams_override=None):
    if hparam_set:
        hparams = trainer_lib.create_hparams(
            hparam_set, hparams_overrides_str=hparams_override)
    else:
        hparams = common_hparams.basic_params1()
    hparams.data_dir = data_dir
    hparams_lib.add_problem_hparams(hparams, problem_name)
    return hparams, hparams.problem
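
The helper above covers both setup paths: a registered hparam_set built via trainer_lib.create_hparams, or a bare basic_params1() fallback, followed by attaching the problem hparams. A usage sketch; the problem name, data directory, and override string below are placeholders, not from the source:

# Placeholder arguments, purely illustrative.
hparams, problem = hparams_set_up(
    problem_name="translate_ende_wmt32k",
    data_dir="/tmp/t2t_data",
    hparam_set="transformer_base_v1",
    hparams_override="batch_size=2048")
print(hparams.batch_size, problem.name)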
Example #35
def vanilla_gan():
    """Basic parameters for a vanilla_gan."""
    hparams = common_hparams.basic_params1()

    hparams.batch_size = 32
    hparams.label_smoothing = 0.0
    hparams.add_hparam("hidden_dim", 128)
    hparams.add_hparam("random_sample_size", 100)
    hparams.add_hparam("height", 28)
    hparams.add_hparam("width", 28)
    hparams.add_hparam("epsilon", 1e-4)
    return hparams
Example #36
def vanilla_gan():
  """Basic parameters for a vanilla_gan."""
  hparams = common_hparams.basic_params1()
  hparams.label_smoothing = 0.0
  hparams.hidden_size = 128
  hparams.batch_size = 64
  hparams.add_hparam("z_size", 64)
  hparams.add_hparam("c_dim", 1)
  hparams.add_hparam("height", 28)
  hparams.add_hparam("width", 28)
  hparams.add_hparam("discriminator_batchnorm", int(True))
  return hparams
Example #37
def basic_fc_small():
  """Small fully connected model."""
  hparams = common_hparams.basic_params1()
  hparams.learning_rate = 0.1
  hparams.batch_size = 128
  hparams.hidden_size = 256
  hparams.num_hidden_layers = 2
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.dropout = 0.0
  return hparams
Example #38
def attention_lm_moe_base():
    """Set of hyperparameters.

  suitable for 1 gpu.
  on lm1b_32k:
     ~229M params
     0.9 steps/sec on  [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 1024
    hparams.batch_size = 8192
    hparams.max_length = 256
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 2000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 4
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(False)
    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    # comma-separated list of layer numbers.
    # At each of these layers, we replace the ffn with a mixture of experts.
    hparams.add_hparam("moe_layers", "2")
    # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
    # If moe_n2 is an integer, then use a hierarchical MoE
    #   consisting of moe_n1 groups of moe_n2 experts each.
    hparams.add_hparam("moe_n1", 32)
    hparams.add_hparam("moe_n2", 0)
    hparams.add_hparam("moe_hidden_size", 2048)
    hparams.add_hparam("moe_loss_coef", 1e-2)
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("residual_dropout", 0.1)
    hparams.add_hparam("pos", "timing")  # timing, none
    return hparams
Example #39
def mtf_toy_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.no_data_parallelism = True
    hparams.use_fixed_batch_size = True
    hparams.add_hparam("mtf_mode", True)
    hparams.batch_size = 64
    set_adafactor_optimizer(hparams)
    hparams.add_hparam("io_size", 32)
    hparams.hidden_size = 32
    hparams.add_hparam("mesh_shape", "4.2")
    hparams.add_hparam("layout", "batch:0;hidden:1")
    return hparams
Example #40
def glow_hparams():
    """Glow Hparams."""
    hparams = common_hparams.basic_params1()
    hparams.clip_grad_norm = None
    hparams.weight_decay = 0.0
    hparams.learning_rate_constant = 3e-4
    hparams.batch_size = 32
    hparams.add_hparam("n_levels", 3)
    hparams.add_hparam("n_bits_x", 8)
    hparams.add_hparam("depth", 32)
    hparams.add_hparam("affine_coupling_width", 512)
    hparams.add_hparam("learn_prior", True)
    return hparams
Example #41
def lstm_attention_my():
  """hparams for LSTM with attention."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 512
  hparams.hidden_size = 128
  hparams.num_hidden_layers = 2
  hparams.max_length = 100
  hparams.dropout = 0.8
  hparams.learning_rate = 0.001

  # Attention
  hparams.add_hparam("attn_vec_size", hparams.hidden_size)
  return hparams
Example #42
def continuous_autoencoder_basic():
  """Basic autoencoder model."""
  hparams = common_hparams.basic_params1()
  hparams.optimizer = "adam"
  hparams.learning_rate_constant = 0.0002
  hparams.learning_rate_warmup_steps = 500
  hparams.learning_rate_schedule = "constant * linear_warmup"
  hparams.label_smoothing = 0.0
  hparams.batch_size = 128
  hparams.hidden_size = 64
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.kernel_height = 4
  hparams.kernel_width = 4
  hparams.dropout = 0.05
  hparams.add_hparam("max_hidden_size", 1024)
  hparams.add_hparam("bottleneck_bits", 128)
  hparams.add_hparam("bottleneck_shared_bits", 0)
  hparams.add_hparam("bottleneck_shared_bits_start_warmup", 0)
  hparams.add_hparam("bottleneck_shared_bits_stop_warmup", 0)
  hparams.add_hparam("bottleneck_noise", 0.1)
  hparams.add_hparam("bottleneck_warmup_steps", 2000)
  hparams.add_hparam("sample_height", 32)
  hparams.add_hparam("sample_width", 32)
  hparams.add_hparam("bottleneck_l2_factor", 0.05)
  hparams.add_hparam("gumbel_temperature", 0.5)
  hparams.add_hparam("gumbel_noise_factor", 0.5)
  hparams.add_hparam("vq_temperature", 0.001)
  hparams.add_hparam("gan_loss_factor", 0.0)

  # hparams related to the PSF
  hparams.add_hparam("encode_psf", True) # Should we use the PSF at the encoder
  hparams.add_hparam("apply_psf", True)  # Should we apply the PSF at the decoder
  hparams.add_hparam("psf_convolution_pad_factor", 0.)  # Zero padding factor for convolution

  # hparams related to output apodization for Fourier purposes
  hparams.add_hparam("output_apodization", 8)  # Number of pixels at the border affected by the apodization window
  hparams.add_hparam("apodization_loss", 1.0)  # Factor to penalize non zero borders

  # hparams related to output activation
  hparams.add_hparam("output_activation", 'softplus') # either none or softplus

  # hparams related to additional regularization of the output
  hparams.add_hparam("total_variation_loss", 0.001) # Factor to apply to a loss penalizing the TV of the unconvolved image

  # hparams related to the likelihood
  hparams.add_hparam("likelihood_type", "Fourier") # Pixel or Fourier
  hparams.add_hparam("noise_rms", 0.03) # Value of noise RMS, used for diagonal likelihood
  return hparams
Example #43
def transformer_symshard_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 256
  hparams.batch_size = 2048
  hparams.max_length = 0
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.layer_prepostprocess_dropout = 0.2
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("relu_dropout_broadcast_dims", "1")
  hparams.layer_prepostprocess_dropout = 0.1
  hparams.layer_prepostprocess_dropout_broadcast_dims = "1"  # length
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.initializer_gain = 1.0
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  # TODO(noam): use this to control sharing.  We now share always
  hparams.shared_embedding_and_softmax_weights = True
  # we only want one data shard.
  hparams.no_data_parallelism = True
  # bypass the symbol modality so that we can use model parallelism.
  hparams.bottom = {
      "inputs": modalities.identity_bottom,
      "targets": modalities.identity_bottom,
  }
  hparams.top = {
      "targets": modalities.identity_top,
  }
  hparams.add_hparam("filter_size", 1280)
  hparams.add_hparam("mix_fraction", 0.5)
  # attention-related flags
  hparams.add_hparam("multihead_attention_num_heads", 4)
  hparams.add_hparam("multihead_attention_key_channels", 0)
  hparams.add_hparam("multihead_attention_value_channels", 0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam(
      "encoder_layers", ("n,att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d")
  hparams.add_hparam(
      "decoder_layers",
      ("n,att,m,d,a," "n,enc-att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d")
  # Number of model shards - each one has separate parameters.
  # Changing this number invalidates checkpoints.
  hparams.add_hparam("num_model_shards", 8)
  return hparams
Example #44
def vqa_attention_base():
    """VQA attention baseline hparams."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 2
    hparams.use_fixed_batch_size = True
    hparams.optimizer = "Adam"
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.999
    hparams.optimizer_adam_epsilon = 1e-8
    hparams.weight_decay = 0
    hparams.clip_grad_norm = 0.
    hparams.initializer = "uniform_unit_scaling"
    hparams.initializer_gain = 2.
    hparams.learning_rate = 0.5
    hparams.learning_rate_schedule = "legacy"
    hparams.learning_rate_warmup_steps = 0
    hparams.learning_rate_decay_scheme = "exp"
    hparams.learning_rate_decay_rate = 0.5
    hparams.learning_rate_decay_steps = 50000

    # unused hparams
    hparams.label_smoothing = 0.
    hparams.multiply_embedding_mode = ""

    hparams.dropout = 0.5
    hparams.norm_type = "layer"
    hparams.layer_postprocess_sequence = "nd"
    hparams.layer_prepostprocess_dropout = 0.5

    # add new hparams
    # preprocess
    hparams.add_hparam("resize_side", 512)
    hparams.add_hparam("height", 448)
    hparams.add_hparam("width", 448)
    hparams.add_hparam("distort", True)

    hparams.add_hparam("train_resnet", False)
    hparams.add_hparam("rnn_type", "lstm")
    hparams.add_hparam("num_rnn_layers", 1)
    hparams.add_hparam("max_question_length", 15)
    # lstm hidden size
    hparams.hidden_size = 512

    hparams.add_hparam("attn_dim", 512)
    hparams.add_hparam("num_glimps", 2)

    hparams.add_hparam("num_mlp_layers", 1)
    hparams.add_hparam("mlp_dim", 1024)

    return hparams
Example #45
def transformer_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 256
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_schedule = "legacy"
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = True
  hparams.symbol_modality_num_shards = 16

  # Add new ones like this.
  hparams.add_hparam("filter_size", 2048)
  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 0)
  # Attention-related flags.
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "dense_relu_dense")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("attention_dropout_broadcast_dims", "")
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("relu_dropout_broadcast_dims", "")
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("use_pad_remover", True)
  hparams.add_hparam("self_attention_type", "dot_product")
  hparams.add_hparam("max_relative_position", 0)
  return hparams
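
The same override mechanism can be used directly on an hparams object: HParams.parse takes a comma-separated name=value string, which is what the hparams_overrides_str arguments seen elsewhere in this listing ultimately feed into. A short sketch against transformer_base_v1() above; the override values are illustrative:

hparams = transformer_base_v1()
# parse() only accepts names that already exist on the object.
hparams.parse("num_hidden_layers=4,hidden_size=256,learning_rate_warmup_steps=8000")
assert hparams.hidden_size == 256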
Example #46
def super_lm_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 512
    hparams.moe_hidden_sizes = "512"
    hparams.batch_size = 16384
    hparams.max_length = 0
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.layer_prepostprocess_dropout = 0.0
    hparams.symbol_dropout = 0.1
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.label_smoothing = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer = "Adafactor"
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 8000
    hparams.initializer_gain = 1.0
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.shared_embedding_and_softmax_weights = False
    hparams.layer_preprocess_sequence = "n"
    hparams.layer_postprocess_sequence = "da"
    # we only want one data shard.
    hparams.no_data_parallelism = True
    # bypass the symbol modality so that we can use model parallelism.
    hparams.bottom = {
        "inputs": modalities.identity_bottom,
        "targets": modalities.identity_bottom,
    }
    hparams.top = {
        "targets": modalities.identity_top,
    }
    hparams.add_hparam("filter_size", 512)
    hparams.add_hparam("mix_fraction", 0.5)
    # attention-related flags
    hparams.add_hparam("multihead_attention_num_heads", 4)
    hparams.add_hparam("multihead_attention_key_channels", 0)
    hparams.add_hparam("multihead_attention_value_channels", 0)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("layers", ("n,att,m,d,a,"
                                  "n,ffn,m,d,a,") * 4 + "n,ffn,d")
    # Number of model shards - each one has separate parameters.
    # Changing this number invalidates checkpoints.
    hparams.add_hparam("num_model_shards", 8)
    hparams.add_hparam("diet_experts", False)
    return hparams
Example #47
def transformer_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 256
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_schedule = "linear_warmup_rsqrt_decay"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = True
  hparams.symbol_modality_num_shards = 16

  # Add new ones like this.
  hparams.add_hparam("filter_size", 2048)
  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 0)
  # Attention-related flags.
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "dense_relu_dense")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("attention_dropout_broadcast_dims", "")
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("relu_dropout_broadcast_dims", "")
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("use_pad_remover", True)
  hparams.add_hparam("self_attention_type", "dot_product")
  hparams.add_hparam("max_relative_position", 0)
  return hparams
Example #48
def glow_hparams():
    """Glow Hparams."""
    hparams = common_hparams.basic_params1()
    hparams.clip_grad_norm = None
    hparams.weight_decay = 0.0
    hparams.learning_rate_constant = 3e-4
    hparams.batch_size = 32
    # can be prev_level, prev_step or normal.
    # see: glow_ops.merge_level_and_latent_dist
    hparams.add_hparam("level_prior_scale", "prev_level")
    hparams.add_hparam("n_levels", 3)
    hparams.add_hparam("n_bits_x", 8)
    hparams.add_hparam("depth", 32)
    hparams.add_hparam("affine_coupling_width", 512)
    hparams.add_hparam("top_prior", "single_conv")
    return hparams
Example #49
def transformer_symshard_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 256
  hparams.batch_size = 2048
  hparams.max_length = 0
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.layer_prepostprocess_dropout = 0.2
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("relu_dropout_broadcast_dims", "1")
  hparams.layer_prepostprocess_dropout = 0.1
  hparams.layer_prepostprocess_dropout_broadcast_dims = "1"  # length
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.initializer_gain = 1.0
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  # TODO(noam): use this to control sharing.  We now share always
  hparams.shared_embedding_and_softmax_weights = True
  # we only want one data shard.
  hparams.no_data_parallelism = True
  # bypass the symbol modality so that we can use model parallelism.
  hparams.modality = {
      "inputs": modalities.IdentitySymbolModality,
      "targets": modalities.IdentitySymbolModality,
  }
  hparams.add_hparam("filter_size", 1280)
  hparams.add_hparam("mix_fraction", 0.5)
  # attention-related flags
  hparams.add_hparam("multihead_attention_num_heads", 4)
  hparams.add_hparam("multihead_attention_key_channels", 0)
  hparams.add_hparam("multihead_attention_value_channels", 0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam(
      "encoder_layers", ("n,att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d")
  hparams.add_hparam(
      "decoder_layers",
      ("n,att,m,d,a," "n,enc-att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d")
  # Number of model shards - each one has separate parameters.
  # Changing this number invalidates checkpoints.
  hparams.add_hparam("num_model_shards", 8)
  return hparams
Example #50
def mtf_image_transformer_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.no_data_parallelism = True
  hparams.use_fixed_batch_size = True
  hparams.batch_size = 1
  hparams.max_length = 3072
  hparams.hidden_size = 256
  hparams.label_smoothing = 0.0
  # 8-way model-parallelism
  hparams.add_hparam("mesh_shape", "batch:8")
  hparams.add_hparam("layout", "batch:batch")
  hparams.add_hparam("mtf_mode", True)
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("filter_size", 1024)
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 6)
  hparams.add_hparam("attention_key_size", 256)
  hparams.add_hparam("attention_value_size", 256)
  # Share weights between input and target embeddings
  hparams.shared_embedding = True

  # mixture of experts hparams
  hparams.add_hparam("ffn_layer", "dense_relu_dense")
  hparams.add_hparam("moe_overhead_train", 1.0)
  hparams.add_hparam("moe_overhead_eval", 2.0)
  hparams.moe_num_experts = 16
  hparams.moe_loss_coef = 1e-3

  hparams.shared_embedding_and_softmax_weights = True
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.add_hparam("d_kv", 64)
  hparams.add_hparam("d_ff", 2048)

  # Image related hparams
  hparams.add_hparam("img_len", 32)
  hparams.add_hparam("num_channels", 3)
  hparams.add_hparam("unconditional", True)

  # Local Attention related params
  hparams.add_hparam("block_length", 128)
  hparams.add_hparam("block_height", 16)
  hparams.add_hparam("block_width", 16)
  hparams.add_hparam("attention_type", "local1d")
  return hparams
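
Variants of a base set like this are conventionally defined by calling the base function and overriding a few fields. A hypothetical scaled-down variant; the name and the values are illustrative only.

def mtf_image_transformer_base_tiny():  # hypothetical name, for illustration
  """Smaller model for quick experiments; values are illustrative."""
  hparams = mtf_image_transformer_base()
  hparams.hidden_size = 128
  hparams.d_ff = 256
  hparams.num_heads = 4
  hparams.num_decoder_layers = 2
  return hparams
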
Example No. 51
def revnet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam('num_channels_first', [64, 128, 256, 416])
  hparams.add_hparam('num_channels_second', [256, 512, 1024, 1664])
  hparams.add_hparam('num_layers_per_block', [1, 1, 10, 1])
  hparams.add_hparam('first_batch_norm', [False, True, True, True])
  hparams.add_hparam('strides', [1, 2, 2, 2])
  hparams.add_hparam('num_channels_init_block', 32)
  hparams.add_hparam('dim', '2d')

  hparams.optimizer = 'Momentum'
  hparams.learning_rate = 0.01
  hparams.weight_decay = 1e-4
  # Can run with a batch size of 128 with Problem ImageImagenet224
  hparams.tpu_batch_size_per_shard = 128
  return hparams
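
With tpu_batch_size_per_shard, the global batch size is the per-shard value times the number of data shards. A quick back-of-the-envelope check, assuming one shard per core of an 8-core TPU (the core count is an assumption, not part of these hparams):

tpu_cores = 8                       # assumption: one data shard per TPU core
per_shard_batch = 128               # hparams.tpu_batch_size_per_shard above
print(tpu_cores * per_shard_batch)  # 1024 images per global step
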
Example No. 52
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True
  # According to Noam, ("n", "da") seems better for harder-to-learn models.
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layer types. If set, the num_decoder_layers parameter is ignored
  # and the number of decoder layers is deduced from the string.
  # See the top-of-file comment for an example of usage.
  hparams.add_hparam("layer_types", "")
  # Default attention type (e.g. a, loc, red, ...) and feed-forward type
  # (e.g. fc, sep, moe, ...).
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")

  return hparams
Example No. 53
def multimodel_base():
  """Base parameters for MultiModel."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.batch_size = 2048
  hparams.num_hidden_layers = 4
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.dropout = 0.1
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  hparams.add_hparam("large_kernel_size", 15)
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("moe_layers", "2")
  hparams.moe_num_experts = 30
  return hparams
Example No. 54
def mtf_resnet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.no_data_parallelism = True
  hparams.use_fixed_batch_size = True
  hparams.batch_size = 32
  hparams.max_length = 3072
  hparams.hidden_size = 256
  hparams.label_smoothing = 0.0
  # 8-way model-parallelism
  hparams.add_hparam("mesh_shape", "batch:8")
  hparams.add_hparam("layout", "batch:batch")
  hparams.add_hparam("filter_size", 1024)

  hparams.add_hparam("num_layers", 6)
  # Share weights between input and target embeddings
  hparams.shared_embedding = True

  hparams.shared_embedding_and_softmax_weights = True
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.add_hparam("d_kv", 32)

  # Image related hparams
  hparams.add_hparam("img_len", 32)
  hparams.add_hparam("num_channels", 3)
  hparams.add_hparam("row_blocks", 1)
  hparams.add_hparam("col_blocks", 1)
  hparams.add_hparam("rows_size", 32)
  hparams.add_hparam("cols_size", 32)

  # Model-specific parameters
  hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
  hparams.add_hparam("filter_sizes", [64, 64, 128, 256, 512])
  hparams.add_hparam("is_cifar", False)

  # Variable init
  hparams.initializer = "normal_unit_scaling"
  hparams.initializer_gain = 2.

  # TODO(nikip): Change optimization scheme?
  hparams.learning_rate = 0.1
  return hparams
Example No. 55
def sliced_gan():
  """Basic parameters for a vanilla_gan."""
  hparams = common_hparams.basic_params1()
  hparams.optimizer = "Adam"
  hparams.learning_rate_constant = 0.0002
  hparams.learning_rate_warmup_steps = 500
  hparams.learning_rate_schedule = "constant * linear_warmup"
  hparams.label_smoothing = 0.0
  hparams.batch_size = 128
  hparams.hidden_size = 128
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 1e-6
  hparams.kernel_height = 4
  hparams.kernel_width = 4
  hparams.bottleneck_bits = 128
  hparams.add_hparam("discriminator_batchnorm", True)
  hparams.add_hparam("num_sliced_vecs", 4096)
  return hparams
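
The num_sliced_vecs knob presumably controls how many random 1-D projections the sliced loss averages over (the name suggests as much). The NumPy sketch below illustrates only the generic sliced (random-projection) distance idea; it is not the T2T loss implementation, and all names here are illustrative.

import numpy as np


def sliced_distance(x, y, num_sliced_vecs=4096, seed=0):
  """Generic sliced distance between two equally sized point sets."""
  rng = np.random.RandomState(seed)
  dim = x.shape[1]
  # Random unit-norm projection directions, one per slice.
  dirs = rng.normal(size=(dim, num_sliced_vecs))
  dirs /= np.linalg.norm(dirs, axis=0, keepdims=True)
  # Project both sets, sort each 1-D slice, and compare the sorted projections.
  proj_x = np.sort(x @ dirs, axis=0)
  proj_y = np.sort(y @ dirs, axis=0)
  return np.mean((proj_x - proj_y) ** 2)


# Example: two batches of 128 vectors with hidden_size = 128, as above.
real = np.random.randn(128, 128)
fake = np.random.randn(128, 128)
print(sliced_distance(real, fake))
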
Example No. 56
def autoencoder_basic():
  """Basic autoencoder model."""
  hparams = common_hparams.basic_params1()
  hparams.optimizer = "Adam"
  hparams.learning_rate_constant = 0.0002
  hparams.learning_rate_warmup_steps = 500
  hparams.learning_rate_schedule = "constant * linear_warmup"
  hparams.label_smoothing = 0.0
  hparams.batch_size = 128
  hparams.hidden_size = 64
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.kernel_height = 4
  hparams.kernel_width = 4
  hparams.dropout = 0.05
  hparams.add_hparam("max_hidden_size", 1024)
  hparams.add_hparam("bottleneck_bits", 128)
  hparams.add_hparam("bottleneck_shared_bits", 0)
  hparams.add_hparam("bottleneck_shared_bits_start_warmup", 0)
  hparams.add_hparam("bottleneck_shared_bits_stop_warmup", 0)
  hparams.add_hparam("bottleneck_noise", 0.1)
  hparams.add_hparam("bottleneck_warmup_steps", 2000)
  hparams.add_hparam("sample_height", 32)
  hparams.add_hparam("sample_width", 32)
  hparams.add_hparam("discriminator_batchnorm", True)
  hparams.add_hparam("num_sliced_vecs", 20000)
  hparams.add_hparam("sliced_do_tanh", int(True))
  hparams.add_hparam("discriminator_size", 256)
  hparams.add_hparam("discriminator_kernel_size", 6)
  hparams.add_hparam("discriminator_strides", 4)
  hparams.add_hparam("discriminator_pure_mean", int(False))
  hparams.add_hparam("code_loss_factor", 1.0)
  hparams.add_hparam("gan_codes_warmup_steps", 16000)
  hparams.add_hparam("gan_loss_factor", 0.0)
  hparams.add_hparam("bottleneck_l2_factor", 0.05)
  hparams.add_hparam("gumbel_temperature", 0.5)
  hparams.add_hparam("gumbel_noise_factor", 0.5)
  hparams.add_hparam("vq_temperature", 0.001)
  hparams.add_hparam("use_vq_loss", int(False))
  hparams.add_hparam("discriminator", "double")
  return hparams
Example No. 57
def super_lm_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.moe_hidden_sizes = "512"
  hparams.batch_size = 16384
  hparams.max_length = 0
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.layer_prepostprocess_dropout = 0.0
  hparams.symbol_dropout = 0.1
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 8000
  hparams.initializer_gain = 1.0
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.shared_embedding_and_softmax_weights = False
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"
  # we only want one data shard.
  hparams.no_data_parallelism = True
  # bypass the symbol modality so that we can use model parallelism.
  hparams.target_modality = "symbol:identity"
  hparams.add_hparam("filter_size", 512)
  hparams.add_hparam("mix_fraction", 0.5)
  # attention-related flags
  hparams.add_hparam("multihead_attention_num_heads", 4)
  hparams.add_hparam("multihead_attention_key_channels", 0)
  hparams.add_hparam("multihead_attention_value_channels", 0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam(
      "layers", ("n,att,m,d,a," "n,ffn,m,d,a,") * 4 + "n,ffn,d")
  # Number of model shards - each one has separate parameters.
  # Changing this number invalidates checkpoints.
  hparams.add_hparam("num_model_shards", 8)
  hparams.add_hparam("diet_experts", False)
  return hparams
Example No. 58
 def testSymbolModalityInputs(self):
   batch_size = 10
   num_datashards = 5
   length = 5
   vocab_size = 5000
   hidden_size = 9
   model_hparams = common_hparams.basic_params1()
   model_hparams.hidden_size = hidden_size
   model_hparams.mode = tf.estimator.ModeKeys.TRAIN
   x = -1 + np.random.random_integers(
       vocab_size, size=(batch_size, length, 1, 1))
   m = modalities.SymbolModality(model_hparams, vocab_size)
   data_parallelism = expert_utils.Parallelism(
       ["/device:CPU:0"] * num_datashards)
   xs = tf.split(x, num_datashards)
   sharded_output = m.bottom_sharded(xs, data_parallelism)
   output = tf.concat(sharded_output, 0)
   self.evaluate(tf.global_variables_initializer())
   res = self.evaluate(output)
   self.assertEqual(res.shape, (batch_size, length, 1, hidden_size))
Example No. 59
def ppo_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.learning_rate = 1e-4
  hparams.add_hparam("init_mean_factor", 0.1)
  hparams.add_hparam("init_logstd", 0.1)
  hparams.add_hparam("policy_layers", (100, 100))
  hparams.add_hparam("value_layers", (100, 100))
  hparams.add_hparam("num_agents", 30)
  hparams.add_hparam("clipping_coef", 0.2)
  hparams.add_hparam("gae_gamma", 0.99)
  hparams.add_hparam("gae_lambda", 0.95)
  hparams.add_hparam("entropy_loss_coef", 0.01)
  hparams.add_hparam("value_loss_coef", 1)
  hparams.add_hparam("optimization_epochs", 15)
  hparams.add_hparam("epoch_length", 200)
  hparams.add_hparam("epochs_num", 2000)
  hparams.add_hparam("eval_every_epochs", 10)
  hparams.add_hparam("num_eval_agents", 3)
  hparams.add_hparam("video_during_eval", True)
  return hparams
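
A rough sense of the data volume these defaults imply, assuming every one of the parallel agents contributes epoch_length transitions per epoch and each transition is revisited once per optimization epoch:

num_agents = 30
epoch_length = 200
optimization_epochs = 15
transitions_per_epoch = num_agents * epoch_length
print(transitions_per_epoch)                        # 6000 transitions collected per epoch
print(transitions_per_epoch * optimization_epochs)  # 90000 sample visits during optimization
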
Example No. 60
def gene_expression_conv_base():
  """Hparams for GeneExpressionConv model."""
  hparams = common_hparams.basic_params1()

  batch_size = 10
  output_length = 2048
  inputs_per_output = 128
  chunk_size = 4
  input_length = output_length * inputs_per_output // chunk_size
  hparams.batch_size = input_length * batch_size

  hparams.dropout = 0.1
  hparams.add_hparam("num_conv_layers", 4)
  hparams.add_hparam("num_dconv_layers", 7)
  # The product of these pooling windows should match
  # input_length / output_length (i.e. inputs_per_output // chunk_size).
  hparams.add_hparam("pooling_windows", [2, 2, 2, 4])

  hparams.hidden_size = 256
  hparams.kernel_width = 20
  hparams.add_hparam("stride", 1)
  return hparams
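
The sizes above are easiest to read as a small worked example: with chunk_size = 4, each example has 2048 * 128 // 4 = 65536 input positions, the batch of 10 examples gives the 655360 batch_size, and the pooling windows must shrink 65536 inputs back down to the 2048 outputs. A standalone check:

output_length = 2048
inputs_per_output = 128
chunk_size = 4
batch_size = 10
input_length = output_length * inputs_per_output // chunk_size
assert input_length == 65536
assert input_length * batch_size == 655360              # hparams.batch_size above
assert 2 * 2 * 2 * 4 == input_length // output_length   # pooling windows multiply to 32
print("sizes consistent")
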