Example #1
def argument():
    arg = optimize.argument()
    arg.dropout_type = 'vanilla'  # 'vanilla', 'broadcast', 'alpha'
    arg.ffd = 'transformer_ffd'  # 'transformer_ffd' 'sru' 'sepconv'
    arg.loss = 'sparse_softmax_cross_entropy_with_logits'
    arg.pos = 'timing'  # 'timing' 'emb' 'linear_stop' 'tanh_stop' 'exp_stop'

    arg.decoder_layers = 4  # the number of decoder layers
    arg.encoder_layers = 4  # the number of encoder layers
    arg.filter_size = 1024  # the filter size
    arg.head_size = 64  # the size of each head in the attention mechanisms
    arg.hidden_size = 256  # the hidden size
    arg.input_max_length = 10  # the maximum input sequence length (used by the 'emb' pos)
    arg.input_vocab_size = 1000  # the vocab size for the input
    arg.label_smoothing = 1.0  # the hyperparameter for label smoothing
    arg.max_relative_position = 100  # max relative position for relative attention
    arg.num_heads = 8  # the number of heads for the attention mechanisms
    arg.target_max_length = 10  # the maximum target sequence length (used by the 'emb' pos)
    arg.target_vocab_size = 1000  # the vocab size for the targets
    arg.weight_decay_hyperparameter = 0.001  # the hyperparameter for weight decay

    arg.adaptive_mask = False  # whether an adaptive mask is used
    arg.classification = False  # whether the final output is a single label rather than a sequence
    arg.deparameterize = False  # KEEP AS FALSE
    arg.dynamic_attention_span = False  # KEEP AS FALSE
    arg.mask_loss = False  # whether parts of the loss are masked
    arg.relative_attention = False  # whether to use relative attention
    arg.unidirectional_decoder = True  # whether the decoder is unidirectional
    arg.unidirectional_encoder = False  # whether the encoder is unidirectional
    arg.use_decoder = True  # whether to use the decoder
    arg.use_mos = False  # whether to use an MoS (mixture of softmaxes)
    arg.use_relu = True  # True for ReLU activations, False for GELU
    arg.weight_decay_regularization = False  # whether to use weight decay
    return arg
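
The string-valued options above ('vanilla'/'broadcast'/'alpha' dropout, the ffd and pos variants) are easy to mistype, so a small sanity check can catch a bad configuration before any model is built. The sketch below is illustrative only; validate_argument is a hypothetical helper, not part of the optimize module.

def validate_argument(arg):
    # Hypothetical check of the string-valued options used in the example above.
    allowed = {
        'dropout_type': {'vanilla', 'broadcast', 'alpha'},
        'ffd': {'transformer_ffd', 'sru', 'sepconv'},
        'pos': {'timing', 'emb', 'linear_stop', 'tanh_stop', 'exp_stop'},
    }
    for name, choices in allowed.items():
        value = getattr(arg, name)
        if value not in choices:
            raise ValueError(f"{name}={value!r} is not one of {sorted(choices)}")
    return arg

arg = validate_argument(argument())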
Example #2
def argument():
    arg = optimize.argument()
    arg.dropout_type = 'vanilla'  # 'vanilla', 'broadcast', 'alpha'
    arg.ffd = 'transformer_ffd'  # 'transformer_ffd' 'sru' 'sepconv'
    arg.loss = 'sparse_softmax_cross_entropy_with_logits'
    arg.pos = 'timing'  # 'timing' 'emb' 'linear_stop' 'tanh_stop' 'exp_stop'

    arg.act_epsilon = 0.001  # the epsilon used by the ACT (adaptive computation time) halting mechanism
    arg.act_loss_weight = 0.01  # the weight of the auxiliary ACT loss relative to the main task loss
    arg.filter_size = 1024  # the filter size
    arg.head_size = 64  # the size of each head
    arg.hidden_size = 256  # the hidden size of each model
    arg.input_max_length = 10  # the maximum input sequence length
    arg.input_vocab_size = 1000  # the input vocab size
    arg.label_smoothing = 1.0  # the label smoothing hyperparameter
    arg.max_encoder_steps = 8  # the maximum number of encoder layers
    arg.max_decoder_steps = 8  # the maximum number of decoder layers
    arg.max_relative_position = 100  # used for relative attention
    arg.num_heads = 8  # the number of heads in a self-attention mechanism
    arg.target_max_length = 10  # the maximum target sequence length
    arg.target_vocab_size = 1000  # the target vocab size
    arg.weight_decay_hyperparameter = 0.001  # the weight decay hyperparameter

    arg.classification = False  # whether the final output is a single label rather than a sequence
    arg.deparameterize = False  # KEEP AS FALSE
    arg.mask_loss = True  # whether parts of the loss are masked
    arg.relative_attention = False  # whether to use relative attention
    arg.unidirectional_decoder = True  # whether the decoder is unidirectional
    arg.unidirectional_encoder = False  # whether the encoder is unidirectional
    arg.use_act = False  # whether the Universal Transformer uses an ACT mechanism
    arg.use_decoder = True  # whether to use the decoder
    arg.use_mos = False  # whether to use an MoS (mixture of softmaxes)
    arg.use_relu = True  # True for ReLU activations, False for GELU
    arg.weight_decay_regularization = False  # whether to use weight decay
    return arg
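
When use_act is switched on, act_loss_weight typically scales an auxiliary ponder cost that is added to the task loss, and act_epsilon controls how close to 1.0 the cumulative halting probability must get before a layer stops. The snippet below sketches how the two hyperparameters usually interact; combine_losses is a hypothetical helper, and the exact formulation in this codebase may differ.

def combine_losses(task_loss, ponder_cost, arg):
    # Hypothetical: weight the auxiliary ACT (ponder) cost against the task loss.
    return task_loss + arg.act_loss_weight * ponder_cost

# With act_loss_weight = 0.01, a ponder cost of 5.0 extra steps adds only 0.05
# to a task loss of, say, 2.3: combine_losses(2.3, 5.0, arg) == 2.35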
Example #3
def argument():
    arg = optimize.argument()

    arg.dropout_type = 'vanilla'  # 'vanilla', 'broadcast', 'alpha'

    arg.embed_dim = 24  # the embedding dimension
    arg.kernel_height = 1  # the convolution kernel height
    arg.kernel_width = 3  # the convolution kernel width
    arg.layer = 2  # the number of layers
    arg.vocab_size = 20  # the vocab size
    arg.width = 3  # the width (exact meaning not documented in this snippet)
    return arg
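
For a rough sense of scale, the hyperparameters above describe a very small model. Assuming a plain embedding table plus `layer` stacked convolutions with a kernel_height x kernel_width kernel over embed_dim channels (an assumption; the actual architecture is not shown here), the parameter count works out to only a few thousand weights:

arg = argument()

embedding_params = arg.vocab_size * arg.embed_dim             # 20 * 24 = 480
conv_params_per_layer = (arg.kernel_height * arg.kernel_width
                         * arg.embed_dim * arg.embed_dim
                         + arg.embed_dim)                      # 1*3*24*24 + 24 = 1752
total = embedding_params + arg.layer * conv_params_per_layer  # 480 + 2*1752 = 3984
print(total)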
Example #4
def argument():
    arg = optimize.argument()
    arg.cell = 'gru'  # the RNN cell type: 'gru' or 'lstm'
    arg.loss = 'sparse_softmax_cross_entropy_with_logits'

    arg.hidden_dim = 256  # the hidden size of the model
    arg.label_smoothing = 1.0  # the hyperparameter for smoothing labels
    arg.layers = 2  # the number of layers for RNN
    arg.input_vocab_size = 1000  # the vocab size of the input sequence
    arg.target_vocab_size = 1000  # the vocab size of the target sequence
    arg.weight_decay_hyperparameter = 0.001  # the hyperparameter for weight decay

    arg.classification = True  # whether the final output is a single label rather than a sequence
    arg.mask_loss = True  # whether to mask parts of the loss
    arg.unidirectional = True  # whether the RNN is strictly unidirectional
    arg.weight_decay_regularization = False  # whether weight decay is used

    arg.hidden_size = arg.hidden_dim  # expose hidden_dim under the hidden_size name as well
    return arg
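
The last line mirrors hidden_dim into hidden_size so that code written against the Transformer-style configurations (which use hidden_size, as in the earlier examples) can read the same field from this RNN configuration. A minimal illustration, with report_size as a made-up consumer:

def report_size(arg):
    # Hypothetical consumer that only knows the Transformer-style field name.
    return f"model width: {arg.hidden_size}"

arg = argument()
print(report_size(arg))  # "model width: 256", even though this config sets hidden_dim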
Example #5
def argument():
    arg = optimize.argument()
    arg.cell = 'gru'  # the type of RNN cell. Either GRU or LSTM
    arg.dropout_type = 'vanilla'  # 'vanilla', 'broadcast', 'alpha'; there is no SELU activation here, so alpha dropout is not needed
    arg.loss = 'sparse_softmax_cross_entropy_with_logits'  # the loss function used
    arg.stop_feature = 'linear'  # the stop feature used: 'linear' 'tanh' 'exp' 'none'

    arg.gamma = 0.1  # the gamma used for the stop-feature
    arg.hidden_dim = 128  # the hidden size
    arg.input_vocab_size = 83  # the input vocab size
    arg.label_smoothing = 1.0  # the label smoothing hyperparameter
    arg.layers = 2  # the number of RNN layers
    arg.target_vocab_size = 120  # the target vocab size
    arg.weight_decay_hyperparameter = 0.001  # the weight decay hyperparameter

    arg.mask_loss = True  # whether parts of the loss are masked
    arg.use_attention = True  # whether the output RNN cells use an attention mechanism
    arg.weight_decay_regularization = False  # whether weight decay is used
    return arg
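
mask_loss = True means that padded target positions should not contribute to the loss. One common way to do this (a sketch, not necessarily how this codebase implements it; pad_id = 0 is an assumption) is to multiply the per-token losses by a 0/1 padding mask and normalize by the number of real tokens:

import numpy as np

def masked_mean_loss(per_token_loss, target_ids, pad_id=0):
    # per_token_loss: [batch, time] cross-entropy values
    # target_ids:     [batch, time] token ids, with pad_id marking padding
    mask = (target_ids != pad_id).astype(per_token_loss.dtype)
    return np.sum(per_token_loss * mask) / np.maximum(np.sum(mask), 1.0)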