Example #1
    def __init__(self,
                 num_hidden: int,
                 prefix: str = 'lngru_',
                 params: Optional[mx.rnn.RNNParams] = None,
                 norm_scale: float = 1.0,
                 norm_shift: float = 0.0) -> None:
        super(LayerNormGRUCell, self).__init__(num_hidden, prefix, params)
        self._iN = LayerNormalization(
            num_hidden=num_hidden * 3,
            prefix="%si2h" % self._prefix,
            scale=self.params.get('i2h_scale',
                                  shape=(num_hidden * 3,),
                                  init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('i2h_shift',
                                  shape=(num_hidden * 3,),
                                  init=mx.init.Constant(value=norm_shift)))
        self._hN = LayerNormalization(
            num_hidden=num_hidden * 3,
            prefix="%sh2h" % self._prefix,
            scale=self.params.get('h2h_scale',
                                  shape=(num_hidden * 3,),
                                  init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('h2h_shift',
                                  shape=(num_hidden * 3,),
                                  init=mx.init.Constant(value=norm_shift)))
        self._shape_fix = None
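
For orientation, here is a minimal usage sketch of the cell defined above, assuming the MXNet 1.x symbolic RNN API these Sockeye snippets are written against; the shapes, sequence length and variable names are illustrative only:

import mxnet as mx

data = mx.sym.Variable('data')          # assumed input of shape (batch, seq_len, input_dim)
cell = LayerNormGRUCell(num_hidden=64)  # the cell from the example above
outputs, states = cell.unroll(length=10,           # unroll over 10 time steps
                              inputs=data,
                              layout='NTC',
                              merge_outputs=True)  # outputs: (batch, 10, 64)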
Example #2
    def __init__(self,
                 input_previous_word: bool,
                 attention_num_hidden: int,
                 attention_coverage_type: Optional[str] = None,
                 attention_coverage_num_hidden: int = 1,
                 prefix='',
                 layer_normalization: bool = False) -> None:
        dynamic_source_num_hidden = 1 if attention_coverage_type is None else attention_coverage_num_hidden
        super().__init__(input_previous_word=input_previous_word,
                         dynamic_source_num_hidden=dynamic_source_num_hidden)
        self.prefix = prefix
        self.attention_num_hidden = attention_num_hidden
        # input (encoder) to hidden
        self.att_e2h_weight = mx.sym.Variable("%satt_e2h_weight" % prefix)
        # input (query) to hidden
        self.att_q2h_weight = mx.sym.Variable("%satt_q2h_weight" % prefix)
        # hidden to score
        self.att_h2s_weight = mx.sym.Variable("%satt_h2s_weight" % prefix)
        # dynamic source (coverage) weights and settings
        # input (coverage) to hidden
        self.att_c2h_weight = mx.sym.Variable(
            "%satt_c2h_weight" % prefix) if attention_coverage_type else None
        self.coverage = sockeye.coverage.get_coverage(
            attention_coverage_type, dynamic_source_num_hidden,
            layer_normalization) if attention_coverage_type else None

        if layer_normalization:
            self._ln = LayerNormalization(num_hidden=attention_num_hidden,
                                          prefix="att_norm")
        else:
            self._ln = None
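
The three att_* weights follow the usual MLP ("concat") attention pattern. Below is a rough sketch of how such weights are typically combined at each decoder step; it illustrates the general scheme rather than Sockeye's exact implementation, and the weight shapes and symbol names are assumptions:

# source_hidden: (batch, seq_len, encoder_num_hidden), query: (batch, query_num_hidden)
att_hidden = mx.sym.dot(source_hidden, att_e2h_weight)   # project encoder states
att_hidden = mx.sym.broadcast_add(
    att_hidden,
    mx.sym.expand_dims(mx.sym.dot(query, att_q2h_weight), axis=1))
att_hidden = mx.sym.Activation(att_hidden, act_type="tanh")
scores = mx.sym.dot(att_hidden, att_h2s_weight)          # (batch, seq_len, 1)
attention_probs = mx.sym.softmax(scores, axis=1)         # distribution over source positions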
Example #3
    def __init__(self,
                 num_hidden: int,
                 prefix: str = 'lnlstm_',
                 params: Optional[mx.rnn.RNNParams] = None,
                 forget_bias: float = 1.0,
                 norm_scale: float = 1.0,
                 norm_shift: float = 0.0) -> None:
        super(LayerNormLSTMCell, self).__init__(num_hidden, prefix, params, forget_bias)
        self._iN = LayerNormalization(prefix="%si2h" % self._prefix,
                                      scale=self.params.get('i2h_scale', shape=(num_hidden * 4,), init=mx.init.Constant(value=norm_scale)),
                                      shift=self.params.get('i2h_shift', shape=(num_hidden * 4,), init=mx.init.Constant(value=norm_shift)))
        self._hN = LayerNormalization(prefix="%sh2h" % self._prefix,
                                      scale=self.params.get('h2h_scale', shape=(num_hidden * 4,), init=mx.init.Constant(value=norm_scale)),
                                      shift=self.params.get('h2h_shift', shape=(num_hidden * 4,), init=mx.init.Constant(value=norm_shift)))
        self._cN = LayerNormalization(prefix="%sc" % self._prefix,
                                      scale=self.params.get('c_scale', shape=(num_hidden,), init=mx.init.Constant(value=norm_scale)),
                                      shift=self.params.get('c_shift', shape=(num_hidden,), init=mx.init.Constant(value=norm_shift)))
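
All of these cells delegate the actual normalization to the LayerNormalization helper, which is not shown in these examples. Its core computation is standard layer normalization over the hidden dimension; a minimal sketch, with the epsilon value and the 2-dimensional input shape assumed:

def layer_norm(data, scale, shift, eps=1e-6):
    # data: (batch, num_hidden); scale/shift: (num_hidden,) -- assumed shapes
    mean = mx.sym.mean(data, axis=1, keepdims=True)
    var = mx.sym.mean(mx.sym.square(mx.sym.broadcast_minus(data, mean)),
                      axis=1, keepdims=True)
    normalized = mx.sym.broadcast_div(mx.sym.broadcast_minus(data, mean),
                                      mx.sym.sqrt(var + eps))
    # apply the learned scale (gamma) and shift (beta) parameters
    return mx.sym.broadcast_add(
        mx.sym.broadcast_mul(normalized, mx.sym.expand_dims(scale, axis=0)),
        mx.sym.expand_dims(shift, axis=0))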
Example #4
    def __init__(self,
                 config: RecurrentDecoderConfig,
                 attention: attentions.Attention,
                 lexicon: Optional[lexicons.Lexicon] = None,
                 prefix=C.DECODER_PREFIX) -> None:
        # TODO: implement variant without input feeding
        self.rnn_config = config.rnn_config
        self.target_vocab_size = config.vocab_size
        self.num_target_embed = config.num_embed
        self.attention = attention
        self.weight_tying = config.weight_tying
        self.context_gating = config.context_gating
        self.layer_norm = config.layer_normalization
        self.lexicon = lexicon
        self.prefix = prefix

        self.num_hidden = self.rnn_config.num_hidden

        if self.context_gating:
            self.gate_w = mx.sym.Variable("%sgate_weight" % prefix)
            self.gate_b = mx.sym.Variable("%sgate_bias" % prefix)
            self.mapped_rnn_output_w = mx.sym.Variable(
                "%smapped_rnn_output_weight" % prefix)
            self.mapped_rnn_output_b = mx.sym.Variable(
                "%smapped_rnn_output_bias" % prefix)
            self.mapped_context_w = mx.sym.Variable("%smapped_context_weight" %
                                                    prefix)
            self.mapped_context_b = mx.sym.Variable("%smapped_context_bias" %
                                                    prefix)

        # Stacked RNN
        self.rnn = rnn.get_stacked_rnn(self.rnn_config, self.prefix)
        # RNN init state parameters
        self._create_layer_parameters()

        # Hidden state parameters
        self.hidden_w = mx.sym.Variable("%shidden_weight" % prefix)
        self.hidden_b = mx.sym.Variable("%shidden_bias" % prefix)
        self.hidden_norm = LayerNormalization(
            self.num_hidden, prefix="%shidden_norm" %
            prefix) if self.layer_norm else None
        # Embedding & output parameters
        self.embedding = encoder.Embedding(self.num_target_embed,
                                           self.target_vocab_size,
                                           prefix=C.TARGET_EMBEDDING_PREFIX,
                                           dropout=0.)  # TODO dropout?
        if self.weight_tying:
            check_condition(
                self.num_hidden == self.num_target_embed,
                "Weight tying requires target embedding size and rnn_num_hidden to be equal"
            )
            self.cls_w = self.embedding.embed_weight
        else:
            self.cls_w = mx.sym.Variable("%scls_weight" % prefix)
        self.cls_b = mx.sym.Variable("%scls_bias" % prefix)
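
The gate_*, mapped_rnn_output_* and mapped_context_* parameters correspond to a context gate (cf. Tu et al.'s context gates for NMT). A rough sketch of how they are typically combined when computing the decoder hidden state; the exact gate inputs and the final tanh are assumptions:

# rnn_output, attention_context: (batch, num_hidden); word_embedding: (batch, num_target_embed)
gate_input = mx.sym.concat(word_embedding, rnn_output, attention_context, dim=1)
gate = mx.sym.Activation(mx.sym.FullyConnected(data=gate_input,
                                               num_hidden=self.num_hidden,
                                               weight=self.gate_w,
                                               bias=self.gate_b),
                         act_type="sigmoid")
mapped_rnn = mx.sym.FullyConnected(data=rnn_output,
                                   num_hidden=self.num_hidden,
                                   weight=self.mapped_rnn_output_w,
                                   bias=self.mapped_rnn_output_b)
mapped_ctx = mx.sym.FullyConnected(data=attention_context,
                                   num_hidden=self.num_hidden,
                                   weight=self.mapped_context_w,
                                   bias=self.mapped_context_b)
hidden = mx.sym.Activation(gate * mapped_rnn + (1 - gate) * mapped_ctx,
                           act_type="tanh")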
Example #5
    def __init__(self,
                 num_hidden: int,
                 prefix: str = 'lnggru_',
                 params: Optional[mx.rnn.RNNParams] = None,
                 norm_scale: float = 1.0,
                 norm_shift: float = 0.0) -> None:
        super(LayerNormPerGateGRUCell, self).__init__(num_hidden, prefix, params)
        self._norm_layers = list()  # type: List[LayerNormalization]
        # one LayerNormalization per gate: reset (r), update (z) and candidate/output (o)
        for name in ['r', 'z', 'o']:
            scale = self.params.get('%s_scale' % name, init=mx.init.Constant(value=norm_scale))
            shift = self.params.get('%s_shift' % name, init=mx.init.Constant(value=norm_shift))
            self._norm_layers.append(LayerNormalization(prefix="%s%s" % (self._prefix, name), scale=scale, shift=shift))
Example #6
    def __init__(self,
                 num_hidden: int,
                 prefix: str = 'lnglstm_',
                 params: Optional[mx.rnn.RNNParams] = None,
                 forget_bias: float = 1.0,
                 norm_scale: float = 1.0,
                 norm_shift: float = 0.0) -> None:
        super(LayerNormPerGateLSTMCell, self).__init__(num_hidden, prefix, params, forget_bias)
        self._norm_layers = list()  # type: List[LayerNormalization]
        # one LayerNormalization per gate: input (i), forget (f), cell (c), output (o) and cell state (s)
        for name in ['i', 'f', 'c', 'o', 's']:
            scale = self.params.get('%s_scale' % name,
                                    init=mx.init.Constant(value=norm_scale))
            shift = self.params.get('%s_shift' % name,
                                    init=mx.init.Constant(value=norm_shift if name != "f" else forget_bias))
            self._norm_layers.append(
                LayerNormalization(prefix="%s%s" % (self._prefix, name), scale=scale, shift=shift))
Example #7
    def _create_layer_parameters(self):
        """
        Creates parameters for encoder last state transformation into decoder layer initial states.
        """
        self.init_ws, self.init_bs = [], []
        self.init_norms = []
        for state_idx, (_, init_num_hidden) in enumerate(self.rnn.state_shape):
            self.init_ws.append(
                mx.sym.Variable("%senc2decinit_%d_weight" %
                                (self.prefix, state_idx)))
            self.init_bs.append(
                mx.sym.Variable("%senc2decinit_%d_bias" %
                                (self.prefix, state_idx)))
            if self.layer_norm:
                self.init_norms.append(
                    LayerNormalization(num_hidden=init_num_hidden,
                                       prefix="%senc2decinit_%d_norm" %
                                       (self.prefix, state_idx)))
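
These parameters are consumed when projecting the encoder's last state into each decoder layer's initial state. A hedged sketch of that projection, using the same loop variables as above; the normalize() method on LayerNormalization and the final tanh are assumptions:

# inside the same loop over state_idx / init_num_hidden
init = mx.sym.FullyConnected(data=encoder_last_state,
                             num_hidden=init_num_hidden,
                             weight=self.init_ws[state_idx],
                             bias=self.init_bs[state_idx])
if self.layer_norm:
    init = self.init_norms[state_idx].normalize(init)  # assumed helper method
init = mx.sym.Activation(data=init, act_type="tanh")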
Example #8
    def __init__(self,
                 coverage_num_hidden: int,
                 activation: str,
                 layer_normalization: bool) -> None:
        super().__init__()
        self.activation = activation
        self.num_hidden = coverage_num_hidden
        # input (encoder) to hidden
        self.cov_e2h_weight = mx.sym.Variable("%se2h_weight" % self.prefix)
        # decoder to hidden
        self.cov_dec2h_weight = mx.sym.Variable("%si2h_weight" % self.prefix)
        # previous coverage to hidden
        self.cov_prev2h_weight = mx.sym.Variable("%sprev2h_weight" % self.prefix)
        # attention scores to hidden
        self.cov_a2h_weight = mx.sym.Variable("%sa2h_weight" % self.prefix)
        # optional layer normalization; skipped for a single hidden unit, where normalization is meaningless
        self.layer_norm = None
        if layer_normalization and self.num_hidden != 1:
            self.layer_norm = LayerNormalization(self.num_hidden,
                                                 prefix="%snorm" % self.prefix)
Example #9
    def __init__(self,
                 num_hidden: int,
                 attention: sockeye.attention.Attention,
                 target_vocab_size: int,
                 num_target_embed: int,
                 num_layers=1,
                 prefix=C.DECODER_PREFIX,
                 weight_tying=False,
                 dropout=0.0,
                 cell_type: str = C.LSTM_TYPE,
                 residual: bool = False,
                 forget_bias: float = 0.0,
                 lexicon: Optional[sockeye.lexicon.Lexicon] = None,
                 context_gating: bool = False,
                 layer_normalization: bool = False) -> None:
        # TODO: implement variant without input feeding
        self.num_layers = num_layers
        self.prefix = prefix
        self.dropout = dropout
        self.num_hidden = num_hidden
        self.attention = attention
        self.target_vocab_size = target_vocab_size
        self.num_target_embed = num_target_embed
        self.context_gating = context_gating
        if self.context_gating:
            self.gate_w = mx.sym.Variable("%sgate_weight" % prefix)
            self.gate_b = mx.sym.Variable("%sgate_bias" % prefix)
            self.mapped_rnn_output_w = mx.sym.Variable(
                "%smapped_rnn_output_weight" % prefix)
            self.mapped_rnn_output_b = mx.sym.Variable(
                "%smapped_rnn_output_bias" % prefix)
            self.mapped_context_w = mx.sym.Variable("%smapped_context_weight" %
                                                    prefix)
            self.mapped_context_b = mx.sym.Variable("%smapped_context_bias" %
                                                    prefix)
        self.layer_norm = layer_normalization

        # Decoder stacked RNN
        self.rnn = sockeye.rnn.get_stacked_rnn(cell_type, num_hidden,
                                               num_layers, dropout, prefix,
                                               residual, forget_bias)

        # Decoder parameters
        # RNN init state parameters
        self._create_layer_parameters()

        # Hidden state parameters
        self.hidden_w = mx.sym.Variable("%shidden_weight" % prefix)
        self.hidden_b = mx.sym.Variable("%shidden_bias" % prefix)
        self.hidden_norm = LayerNormalization(
            self.num_hidden, prefix="%shidden_norm" %
            prefix) if self.layer_norm else None
        # Embedding & output parameters
        self.embedding = sockeye.encoder.Embedding(
            self.num_target_embed,
            self.target_vocab_size,
            prefix=C.TARGET_EMBEDDING_PREFIX,
            dropout=0.)  # TODO dropout?
        if weight_tying:
            check_condition(
                self.num_hidden == self.num_target_embed,
                "Weight tying requires target embedding size and rnn_num_hidden to be equal"
            )
            self.cls_w = self.embedding.embed_weight
        else:
            self.cls_w = mx.sym.Variable("%scls_weight" % prefix)
        self.cls_b = mx.sym.Variable("%scls_bias" % prefix)

        self.lexicon = lexicon