def bluenet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 4096
  hparams.hidden_size = 256
  hparams.dropout = 0.2
  hparams.symbol_dropout = 0.5
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 8
  hparams.kernel_height = 3
  hparams.kernel_width = 3
  hparams.learning_rate_decay_scheme = "exp10k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("anneal_until", 40000)
  hparams.add_hparam("batch_deviation_loss_factor", 5.0)
  return hparams
def attention_lm_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(False)
  hparams.add_hparam("filter_size", 4096)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("encoder_full_attention", int(False))
  return hparams
def shakeshake_cifar10():
  """Parameters for CIFAR-10."""
  hparams = common_hparams.basic_params1()
  # This leads to effective batch size 128 when number of GPUs is 1.
  hparams.batch_size = 4096 * 8
  hparams.hidden_size = 16
  hparams.dropout = 0
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 26
  hparams.kernel_height = -1  # Unused
  hparams.kernel_width = -1  # Unused
  hparams.learning_rate_decay_scheme = "cosine"
  # Model should be run for 700000 steps with batch size 128 (~1800 epochs).
  hparams.learning_rate_cosine_cycle_steps = 700000
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  # TODO(rshin): Adjust so that effective value becomes ~1e-4.
  hparams.weight_decay = 3.0
  hparams.optimizer = "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.add_hparam("base_filters", 16)
  hparams.add_hparam("shakeshake_type", "batch")
  return hparams
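
# A hedged sketch of what the "cosine" decay scheme above presumably computes
# over learning_rate_cosine_cycle_steps. This is the standard cosine-annealing
# form, assumed rather than verified against this codebase's learning-rate
# code; the helper name is illustrative:
import math

def _cosine_lr_sketch(step, base_lr=0.2, cycle_steps=700000):
  """Anneal from base_lr at step 0 to 0 at cycle_steps (assumed form)."""
  return 0.5 * base_lr * (1.0 + math.cos(math.pi * step / cycle_steps))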
def testNeuralGPU(self):
  hparams = common_hparams.basic_params1()
  batch_size = 3
  input_length = 5
  target_length = input_length
  input_vocab_size = 9
  target_vocab_size = 11
  p_hparams = problem_hparams.test_problem_hparams(input_vocab_size,
                                                   target_vocab_size)
  # random_integers(n, ...) is inclusive on [1, n], so subtracting 1 yields
  # symbol ids in [0, n - 1].
  inputs = -1 + np.random.random_integers(
      input_vocab_size, size=(batch_size, input_length, 1, 1))
  targets = -1 + np.random.random_integers(
      target_vocab_size, size=(batch_size, target_length, 1, 1))
  with self.test_session() as session:
    features = {
        "inputs": tf.constant(inputs, dtype=tf.int32),
        "targets": tf.constant(targets, dtype=tf.int32)
    }
    model = neural_gpu.NeuralGPU(hparams, tf.estimator.ModeKeys.TRAIN,
                                 p_hparams)
    sharded_logits, _ = model.model_fn(features)
    logits = tf.concat(sharded_logits, 0)
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape,
                   (batch_size, target_length, 1, 1, target_vocab_size))
def aligned_base():
  """Set of hyperparameters.

  languagemodel_wiki_scramble1k50, 1gpu, 7k steps (10min): log(ppl)_eval = 2.60
  12.0 steps/sec on P100
  8gpu (8x batch), 7k steps: log(ppl)_eval = 2.00

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.batch_size = 5000
  hparams.max_length = 0
  hparams.min_length_bucket = 1024
  hparams.dropout = 0.0
  hparams.layer_prepostprocess_dropout = 0.0
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.add_hparam("ffn_hidden_sizes", "2048")  # Add new ones like this.
  hparams.moe_num_experts = 32
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"
  hparams.add_hparam("layers", "timing," + "conv,att,ffn," * 2)
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  # moe params. local attention moe.
  hparams.add_hparam("attention_local", int(False))
  hparams.add_hparam("attention_moe_k", 2)
  hparams.add_hparam("attention_num_experts", 16)
  hparams.add_hparam("attention_split_batch", int(False))
  # Key, query and value dimensions for the attention
  hparams.add_hparam("attention_kq_size", 128)
  hparams.add_hparam("attention_v_size", 256)
  # Loss coef for load balancing
  hparams.add_hparam("attention_load_balance", 2e-2)
  hparams.add_hparam("diet_experts", int(False))
  hparams.add_hparam("memory_efficient_ffn", int(False))
  hparams.add_hparam("local_attention_window", 128)
  hparams.add_hparam("attention_num_groups", 8)
  hparams.add_hparam("attention_image_summary", int(True))
  return hparams
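
# A minimal illustration (not code from this file) of how the string-built
# "layers" value in aligned_base expands, since the trailing comma produced by
# the multiplication is easy to miss and a consumer must drop the empty tail:
def _aligned_layers_example():
  layers = "timing," + "conv,att,ffn," * 2
  assert layers == "timing,conv,att,ffn,conv,att,ffn,"
  # split(",") yields a final empty element because of the trailing comma.
  return [t for t in layers.split(",") if t]
  # -> ["timing", "conv", "att", "ffn", "conv", "att", "ffn"]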
def lstm_seq2seq():
  """hparams for LSTM."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.hidden_size = 128
  hparams.num_hidden_layers = 2
  hparams.initializer = "uniform_unit_scaling"
  return hparams
def transformer_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = int(True)
  # Add new ones like this.
  hparams.add_hparam("filter_size", 2048)
  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 0)
  # Attention-related flags.
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "conv_hidden_relu")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", int(False))
  hparams.add_hparam("use_pad_remover", int(True))
  hparams.add_hparam("self_attention_type", "dot_product")
  hparams.add_hparam("max_relative_position", 0)
  return hparams
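
# Variant hparams sets are conventionally derived by copying a base set and
# overriding a few fields. A minimal sketch assuming only what
# transformer_base defines above; the function name and override values are
# illustrative, not an hparams set defined in this file:
def transformer_base_small_batch_sketch():
  """Illustrative variant: shrink the batch to fit a single GPU."""
  hparams = transformer_base()
  hparams.batch_size = 2048  # half the base 4096-token batches
  hparams.learning_rate_warmup_steps = 16000  # warm up longer at small batch
  return hparams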
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "conv_hidden_relu")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", int(False))
  # FLAGS RELATED TO MIXTURE-OF-EXPERTS
  # Comma-separated list of layer numbers.
  # At each of these layers, we replace the ffn with a mixture of experts.
  hparams.add_hparam("moe_layers_encoder", "2")
  hparams.add_hparam("moe_layers_decoder", "2")
  return hparams
def multimodel_base():
  """Base parameters for MultiModel."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.batch_size = 2048
  hparams.num_hidden_layers = 4
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.dropout = 0.1
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  hparams.add_hparam("large_kernel_size", 15)
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("moe_layers", "2")
  hparams.moe_num_experts = 30
  return hparams
def slicenet_params1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.hidden_size = 768
  hparams.dropout = 0.5
  hparams.symbol_dropout = 0.2
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 4
  hparams.kernel_height = 3
  hparams.kernel_width = 1
  hparams.norm_type = "layer"
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("large_kernel_size", 15)  # New ones are added like this.
  hparams.add_hparam("separability", -2)
  # A dilation scheme, one of _DILATION_SCHEMES.
  hparams.add_hparam("dilation_scheme", "1.1.1.1")
  # A kernel scheme, one of _KERNEL_SCHEMES; overrides large_kernel_size.
  hparams.add_hparam("kernel_scheme", "3.7.15.31")
  hparams.add_hparam("audio_compression", 8)
  # attention-related flags
  hparams.add_hparam("attention_type", "simple")
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("sim_loss_mult", 0.0)  # Try 10.0 for experiments.
  hparams.add_hparam("attention_dropout", 0.2)
  hparams.shared_embedding_and_softmax_weights = int(True)
  return hparams
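
# The dot-separated scheme strings above ("1.1.1.1", "3.7.15.31") presumably
# encode one integer per hidden layer. A hedged parsing sketch; _parse_scheme
# is a hypothetical helper, not one defined in this file:
def _parse_scheme(scheme):
  """Split a dot-separated scheme like "3.7.15.31" into per-layer ints."""
  return [int(x) for x in scheme.split(".")]

# Example: _parse_scheme("3.7.15.31") == [3, 7, 15, 31], one kernel size per
# hidden layer, matching num_hidden_layers = 4 in slicenet_params1.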
def gene_expression_conv_base():
  """Hparams for GeneExpressionConv model."""
  hparams = common_hparams.basic_params1()
  batch_size = 10
  output_length = 2048
  inputs_per_output = 128
  chunk_size = 4
  input_length = output_length * inputs_per_output // chunk_size
  hparams.batch_size = input_length * batch_size
  hparams.dropout = 0.1
  hparams.add_hparam("num_conv_layers", 4)
  hparams.add_hparam("num_dconv_layers", 7)
  # The product of these pooling windows should match
  # input_length/target_length.
  hparams.add_hparam("pooling_windows", [2, 2, 2, 4])
  hparams.hidden_size = 256
  hparams.kernel_width = 20
  hparams.add_hparam("stride", 1)
  return hparams
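
# A quick check of the pooling arithmetic above, illustrative only: the
# product of pooling_windows must equal input_length / output_length, which
# is the factor inputs_per_output / chunk_size = 128 / 4 = 32.
def _check_gene_expression_pooling():
  output_length = 2048
  inputs_per_output = 128
  chunk_size = 4
  input_length = output_length * inputs_per_output // chunk_size  # 65536
  product = 1
  for w in [2, 2, 2, 4]:  # pooling_windows from gene_expression_conv_base
    product *= w
  assert product == input_length // output_length  # 32 == 32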
def neural_gpu_params1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.num_hidden_layers = 1
  hparams.hidden_size = 256
  hparams.dropout = 0.1
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 10.0
  hparams.kernel_height = 3
  hparams.kernel_width = 1
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.02
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  return hparams
def testLSTMSeq2Seq(self):
  vocab_size = 9
  # Ids are drawn from [1, vocab_size - 1], so 0 (padding) never appears.
  x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1))
  y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1))
  hparams = common_hparams.basic_params1()
  p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size)
  with self.test_session() as session:
    features = {
        "inputs": tf.constant(x, dtype=tf.int32),
        "targets": tf.constant(y, dtype=tf.int32),
    }
    model = lstm.LSTMSeq2seq(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams)
    sharded_logits, _ = model.model_fn(features)
    logits = tf.concat(sharded_logits, 0)
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size))
def bytenet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 2048
  hparams.hidden_size = 768
  hparams.dropout = 0.2
  hparams.symbol_dropout = 0.2
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 4
  hparams.kernel_height = 3
  hparams.kernel_width = 1
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("num_block_repeat", 4)
  return hparams
def attention_lm_moe_base():
  """Set of hyperparameters.

  Suitable for 1 gpu. On lm1b_32k: ~229M params, 0.9 steps/sec on
  [GeForce GTX TITAN X].

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 4
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(False)
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  hparams.moe_num_experts = 32
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("moe_layers", "2")  # comma-separated list of layer numbers
  # moe params. local attention moe.
  # If attention_layers is set, the num_hidden_layers parameter will be
  # ignored, and each character of the string will correspond to one attention
  # layer type.
  hparams.add_hparam("attention_layers", "")
  hparams.add_hparam("attention_type", AttentionType.MULTIHEAD)
  hparams.add_hparam("attention_local", int(False))
  hparams.add_hparam("attention_moe_k", 2)
  hparams.add_hparam("attention_num_head", 1)
  hparams.add_hparam("attention_num_experts", 16)
  hparams.add_hparam("attention_split_batch", int(False))
  # If attention_exp_factor is set, each input to local_expert_attention (of
  # dimensionality hidden_size) is projected into attention_exp_factor smaller
  # inputs, each of dimensionality attention_exp_inputdim. (Otherwise
  # attention_exp_inputdim is ignored.)
  hparams.add_hparam("attention_exp_factor", 0)
  hparams.add_hparam("attention_exp_inputdim", 128)
  # Key, query and value dimensions for the attention
  hparams.add_hparam("attention_kq_size", 128)
  hparams.add_hparam("attention_v_size", 256)
  # Loss coef for load balancing
  hparams.add_hparam("attention_load_balance", 2e-2)
  hparams.add_hparam("use_sepconv", int(False))
  hparams.add_hparam("diet_experts", int(False))
  hparams.add_hparam("memory_efficient_ffn", int(False))
  # If True, we learn a non-autoregressive model from "inputs" to "targets".
  # If False, we learn an autoregressive model to generate "targets".
  hparams.add_hparam("use_inputs", int(False))
  return hparams
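
# The per-character "attention_layers" encoding described above might be
# consumed roughly as follows. The letter-to-type mapping and helper name are
# hypothetical, purely to illustrate one string character selecting one
# attention layer type:
_EXAMPLE_LAYER_TYPES = {"a": AttentionType.MULTIHEAD}  # extend with other types

def _parse_attention_layers_sketch(spec, default_type, num_hidden_layers):
  """Empty spec falls back to num_hidden_layers copies of the default type."""
  if not spec:
    return [default_type] * num_hidden_layers
  return [_EXAMPLE_LAYER_TYPES[c] for c in spec]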