def shakeshake_cifar10():
  """Hyperparameters for Shake-Shake on CIFAR-10."""
  hparams = common_hparams.basic_params1()
  # This leads to effective batch size 128 when number of GPUs is 1
  hparams.batch_size = 4096 * 8
  hparams.hidden_size = 16
  hparams.dropout = 0
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 26
  hparams.kernel_height = -1  # Unused
  hparams.kernel_width = -1  # Unused
  hparams.learning_rate_decay_scheme = "cosine"
  # Model should be run for 700000 steps with batch size 128 (~1800 epochs)
  hparams.learning_rate_cosine_cycle_steps = 700000
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  # TODO(rshin): Adjust so that effective value becomes ~1e-4
  hparams.weight_decay = 3.0
  hparams.optimizer = "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.add_hparam("base_filters", 16)
  hparams.add_hparam("shakeshake_type", "batch")
  return hparams
def testNeuralGPU(self):
  hparams = common_hparams.basic_params1()
  batch_size = 3
  input_length = 5
  target_length = input_length
  input_vocab_size = 9
  target_vocab_size = 11
  p_hparams = problem_hparams.test_problem_hparams(hparams, input_vocab_size,
                                                   target_vocab_size)
  inputs = -1 + np.random.random_integers(
      input_vocab_size, size=(batch_size, input_length, 1, 1))
  targets = -1 + np.random.random_integers(
      target_vocab_size, size=(batch_size, target_length, 1, 1))
  with self.test_session() as session:
    features = {
        "inputs": tf.constant(inputs, dtype=tf.int32),
        "targets": tf.constant(targets, dtype=tf.int32)
    }
    model = neural_gpu.NeuralGPU(hparams, p_hparams)
    sharded_logits, _, _ = model.model_fn(features, True)
    logits = tf.concat(sharded_logits, 0)
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape,
                   (batch_size, target_length, 1, 1, target_vocab_size))
def attention_lm_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 1.0
  hparams.learning_rate_warmup_steps = 1000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = int(False)
  hparams.add_hparam("filter_size", 4096)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("residual_dropout", 0.1)
  return hparams
def bluenet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 4096
  hparams.hidden_size = 256
  hparams.dropout = 0.2
  hparams.symbol_dropout = 0.5
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 8
  hparams.kernel_height = 3
  hparams.kernel_width = 3
  hparams.learning_rate_decay_scheme = "exp10k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("imagenet_use_2d", True)
  hparams.add_hparam("anneal_until", 40000)
  hparams.add_hparam("batch_deviation_loss_factor", 5.0)
  return hparams
def testNeuralGPU(self):
  hparams = common_hparams.basic_params1()
  batch_size = 3
  input_length = 5
  target_length = input_length
  input_vocab_size = 9
  target_vocab_size = 11
  p_hparams = problem_hparams.test_problem_hparams(hparams, input_vocab_size,
                                                   target_vocab_size)
  inputs = -1 + np.random.random_integers(
      input_vocab_size, size=(batch_size, input_length, 1, 1))
  targets = -1 + np.random.random_integers(
      target_vocab_size, size=(batch_size, target_length, 1, 1))
  with self.test_session() as session:
    features = {
        "inputs": tf.constant(inputs, dtype=tf.int32),
        "targets": tf.constant(targets, dtype=tf.int32)
    }
    model = neural_gpu.NeuralGPU(hparams, tf.contrib.learn.ModeKeys.TRAIN,
                                 p_hparams)
    sharded_logits, _, _ = model.model_fn(features)
    logits = tf.concat(sharded_logits, 0)
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape,
                   (batch_size, target_length, 1, 1, target_vocab_size))
def long_answer_base():
  """Set of hyperparameters.

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 8192
  hparams.dropout = 0.0
  hparams.batching_mantissa_bits = 3
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 1000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 4
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.sampling_method = "random"
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # Comma-separated list of layer numbers.
  # At each of these layers, we replace the ffn with a mixture of experts.
  hparams.add_hparam("moe_layers", "2")
  # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
  # If moe_n2 is an integer, then use a hierarchical MoE
  # consisting of moe_n1 groups of moe_n2 experts each.
  hparams.add_hparam("moe_n1", 64)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_hidden_size", 2048)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("residual_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("block_length", 512)
  hparams.add_hparam("answer_length_prob_train", 0.5)
  hparams.add_hparam("answer_length_infer", 1000)
  # We cannot handle long sequences at this point, so drop them during eval.
  # This affects evaluation metrics.
  # TODO(noam): find a different workaround
  hparams.eval_drop_long_sequences = int(True)
  return hparams
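# A minimal illustrative sketch (not part of the original source): the comments
# in long_answer_base() above describe moe_n1/moe_n2 as either a flat MoE
# (moe_n2 == 0) or a hierarchical MoE of moe_n1 groups with moe_n2 experts
# each. The hypothetical variant below shows how such a hierarchical setting
# could be expressed by overriding the base hparams.
def long_answer_hierarchical_moe_example():
  """Hypothetical example: long_answer_base with a hierarchical MoE."""
  hparams = long_answer_base()
  hparams.moe_n1 = 16  # 16 groups ...
  hparams.moe_n2 = 4   # ... of 4 experts each (16 * 4 = 64 experts total).
  return hparams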
def lstm_attention():
  """hparams for LSTM with attention."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.hidden_size = 128
  hparams.num_hidden_layers = 2

  # Attention
  hparams.add_hparam("attn_vec_size", hparams.hidden_size)
  return hparams
def attention_lm_moe_base():
  """Set of hyperparameters.

  Suitable for 1 gpu.
  On lm1b_16k:
    ~337M params
    1.1 steps/sec on [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 1000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 4
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(False)
  hparams.add_hparam("filter_size", 2948)  # Add new ones like this.
  # Comma-separated list of layer numbers.
  # At each of these layers, we replace the ffn with a mixture of experts.
  hparams.add_hparam("moe_layers", "2")
  # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
  # If moe_n2 is an integer, then use a hierarchical MoE
  # consisting of moe_n1 groups of moe_n2 experts each.
  hparams.add_hparam("moe_n1", 64)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_hidden_size", 2048)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("residual_dropout", 0.1)
  hparams.add_hparam("pos", "timing")  # timing, none
  return hparams
def attention_lm_moe_base():
  """Set of hyperparameters.

  Suitable for 1 gpu.
  On lm1b_16k:
    ~337M params
    1.1 steps/sec on [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 1000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 4
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(False)
  hparams.add_hparam("filter_size", 2948)  # Add new ones like this.
  # Comma-separated list of layer numbers.
  # At each of these layers, we replace the ffn with a mixture of experts.
  hparams.add_hparam("moe_layers", "2")
  # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
  # If moe_n2 is an integer, then use a hierarchical MoE
  # consisting of moe_n1 groups of moe_n2 experts each.
  hparams.add_hparam("moe_n1", 64)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_hidden_size", 2048)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("residual_dropout", 0.1)
  return hparams
def testLSTMSeq2Seq(self):
  vocab_size = 9
  x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1))
  y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1))
  hparams = common_hparams.basic_params1()
  p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size,
                                                   vocab_size)
  with self.test_session() as session:
    features = {
        "inputs": tf.constant(x, dtype=tf.int32),
        "targets": tf.constant(y, dtype=tf.int32),
    }
    model = baseline.LSTMSeq2Seq(hparams, p_hparams)
    sharded_logits, _, _ = model.model_fn(features, True)
    logits = tf.concat(sharded_logits, 0)
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size))
def slicenet_params1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.hidden_size = 768
  hparams.dropout = 0.5
  hparams.symbol_dropout = 0.2
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 4
  hparams.kernel_height = 3
  hparams.kernel_width = 1
  hparams.add_hparam("normalizer_fn", "layer")  # New ones are added like this.
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("large_kernel_size", 15)
  hparams.add_hparam("separability", -2)
  # A dilation scheme, one of _DILATION_SCHEMES.
  hparams.add_hparam("dilation_scheme", "1.1.1.1")
  # A kernel scheme, one of _KERNEL_SCHEMES; overrides large_kernel_size.
  hparams.add_hparam("kernel_scheme", "3.7.15.31")
  hparams.add_hparam("audio_compression", 8)
  hparams.add_hparam("moe_n1", 32)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  hparams.add_hparam("imagenet_use_2d", int(True))
  # attention-related flags
  hparams.add_hparam("attention_type", "simple")
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("sim_loss_mult", 0.0)  # Try 10.0 for experiments.
  hparams.add_hparam("attention_dropout", 0.2)
  hparams.shared_embedding_and_softmax_weights = int(True)
  return hparams
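# Illustrative sketch (an assumption, not part of the original source): the
# dot-separated scheme strings above, e.g. kernel_scheme = "3.7.15.31" and
# dilation_scheme = "1.1.1.1", appear to encode one integer per hidden layer
# and could presumably be parsed along these lines.
def _parse_scheme_example(scheme_str):
  """Hypothetical helper: "3.7.15.31" -> [3, 7, 15, 31]."""
  return [int(x) for x in scheme_str.split(".")]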
def multimodel_base():
  """Base parameters for MultiModel."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.batch_size = 2048
  hparams.num_hidden_layers = 4
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.dropout = 0.1
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  hparams.add_hparam("large_kernel_size", 15)
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("moe_n1", 30)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_layers", "2")
  hparams.add_hparam("moe_loss_coef", 1e-2)
  hparams.add_hparam("imagenet_use_2d", int(True))
  return hparams
def transformer_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "conv_hidden_relu")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("residual_dropout", 0.1)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  return hparams
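# A minimal sketch (hypothetical, not from the original source) of the usual
# pattern for deriving a new hyperparameter set: start from transformer_base()
# and override individual fields. The function name and the specific values
# below are illustrative only.
def transformer_tiny_example():
  """Hypothetical example: a smaller Transformer for quick experiments."""
  hparams = transformer_base()
  hparams.num_hidden_layers = 2
  hparams.hidden_size = 128
  hparams.filter_size = 512
  hparams.num_heads = 4
  return hparams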
def neural_gpu_params1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.num_hidden_layers = 1
  hparams.hidden_size = 256
  hparams.dropout = 0.1
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 10.0
  hparams.kernel_height = 3
  hparams.kernel_width = 1
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.02
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  return hparams
def bytenet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 2048
  hparams.hidden_size = 768
  hparams.dropout = 0.2
  hparams.symbol_dropout = 0.2
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 4
  hparams.kernel_height = 3
  hparams.kernel_width = 1
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("num_block_repeat", 4)
  return hparams
def xception_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 4096
  hparams.hidden_size = 768
  hparams.dropout = 0.2
  hparams.symbol_dropout = 0.2
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 8
  hparams.kernel_height = 3
  hparams.kernel_width = 3
  hparams.learning_rate_decay_scheme = "exp50k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("imagenet_use_2d", True)
  return hparams
def transformer_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "conv_hidden_relu")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("residual_dropout", 0.1)
  hparams.add_hparam("nbr_decoder_problems", 1)
  return hparams