Example #1
    def __init__(self,
                 vocab_size=32128,
                 d_model=768,
                 d_kv=64,
                 d_ff=3072,
                 num_layers=12,
                 num_heads=12,
                 dropout_prob=0.1,
                 layer_norm_eps=1E-6,
                 activation='relu',
                 init_factor=1.0,
                 layout='NT',
                 dtype='float32'):
        super().__init__()
        assert vocab_size > 0, 'Vocab size {} is not valid.'.format(vocab_size)
        self._vocab_size = vocab_size
        self._d_model = d_model
        self._d_kv = d_kv
        self._d_ff = d_ff
        self._num_layers = num_layers
        self._num_heads = num_heads
        self._inner_dim = num_heads * d_kv
        self._activation = activation
        self._init_factor = init_factor
        self._dtype = dtype
        assert layout in ['TN', 'NT'], \
            'Invalid layout: {}. Only "TN" and "NT" are supported.'.format(layout)
        self._layout = layout
        self._time_axis = 1 if self.layout == 'NT' else 0

        # input embedding weights are shared between the encoder and the decoder
        self.input_embedding_layer = nn.Embedding(input_dim=vocab_size,
                                                  output_dim=d_model,
                                                  weight_initializer=Normal(
                                                      1.0 * init_factor),
                                                  dtype=dtype)
        self.encoder = T5Encoder(d_model=d_model,
                                 d_kv=d_kv,
                                 d_ff=d_ff,
                                 num_layers=num_layers,
                                 num_heads=num_heads,
                                 dropout_prob=dropout_prob,
                                 layer_norm_eps=layer_norm_eps,
                                 activation=activation,
                                 init_factor=init_factor,
                                 layout=layout,
                                 dtype=dtype)
        self.decoder = T5Decoder(d_model=d_model,
                                 d_kv=d_kv,
                                 d_ff=d_ff,
                                 num_layers=num_layers,
                                 num_heads=num_heads,
                                 dropout_prob=dropout_prob,
                                 layer_norm_eps=layer_norm_eps,
                                 activation=activation,
                                 init_factor=init_factor,
                                 layout=layout,
                                 dtype=dtype)
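The constructor above only wires the submodules together; a Gluon block still needs an explicit parameter initialization before use. A minimal usage sketch, assuming the class is exposed as T5Model (the class name and import path are not shown above and are assumptions) and that MXNet is installed:

# Hypothetical usage sketch: "T5Model" is an assumed name for the class whose
# __init__ is shown above; the keyword names match that signature.
import mxnet as mx

model = T5Model(vocab_size=32128,   # SentencePiece vocabulary size
                d_model=768,
                num_layers=12,
                num_heads=12,
                layout='NT',        # batch-major layout: (batch, seq_len, ...)
                dtype='float32')
model.initialize(ctx=mx.cpu())      # Gluon parameters are created lazily, so initialize explicitly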
Example #2
    def __init__(self,
                 d_model,
                 d_kv,
                 d_ff,
                 num_layers=12,
                 num_heads=12,
                 dropout_prob=0.1,
                 layer_norm_eps=1E-6,
                 activation='relu',
                 init_factor=1.0,
                 layout='NT',
                 dtype='float32'):
        super().__init__()
        self._d_model = d_model
        self._d_kv = d_kv
        self._d_ff = d_ff
        self._num_layers = num_layers
        self._num_heads = num_heads
        self._inner_dim = num_heads * d_kv
        self._dtype = dtype
        assert layout in ['TN', 'NT'], \
            'Invalid layout: {}. Only "TN" and "NT" are supported.'.format(layout)
        self._layout = layout
        self._time_axis = 1 if self.layout == 'NT' else 0

        self.relative_position_encoder = RelAttentionScoreCell(
            query_units=self._inner_dim,
            num_heads=num_heads,
            method='t5',
            bidirectional=False,
            embed_initializer=Normal(d_model**-0.5 * init_factor),
            layout='NTK' if layout == 'NT' else 'TNK',
            dtype=dtype)
        self.layers = nn.HybridSequential()
        for _ in range(num_layers):
            self.layers.add(
                T5Block(d_model=d_model,
                        d_kv=d_kv,
                        d_ff=d_ff,
                        is_decoder=True,
                        num_heads=num_heads,
                        dropout_prob=dropout_prob,
                        layer_norm_eps=layer_norm_eps,
                        activation=activation,
                        init_factor=init_factor,
                        layout=layout,
                        dtype=dtype))
        self.final_layer_norm = RMSNorm(in_channels=d_model,
                                        center=False,
                                        scale=True,
                                        gamma_initializer=Constant(
                                            1.0 * init_factor),
                                        variance_epsilon=layer_norm_eps,
                                        dtype=dtype)
        self.dropout = nn.Dropout(dropout_prob)
Example #3
    def __init__(self,
                 d_model,
                 d_kv,
                 d_ff,
                 is_decoder,
                 num_heads=12,
                 dropout_prob=0.1,
                 layer_norm_eps=1E-6,
                 activation='relu',
                 init_factor=1.0,
                 layout='NT',
                 dtype='float32'):
        super().__init__()
        self._d_model = d_model
        self._d_kv = d_kv
        self._d_ff = d_ff
        self._is_decoder = is_decoder
        self._num_heads = num_heads
        self._inner_dim = self._num_heads * self._d_kv
        self._dtype = dtype
        assert layout in ['TN', 'NT'], \
            'Invalid layout: {}. Only "TN" and "NT" are supported.'.format(layout)
        self._layout = layout
        self._time_axis = 1 if self.layout == 'NT' else 0

        self.self_attn_layer_norm = RMSNorm(in_channels=d_model,
                                            center=False,
                                            scale=True,
                                            gamma_initializer=Constant(
                                                1.0 * init_factor),
                                            variance_epsilon=layer_norm_eps,
                                            dtype=dtype)
        # avoid scaling before softmax
        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
        self.self_attn_q = nn.Dense(units=self._inner_dim,
                                    in_units=d_model,
                                    flatten=False,
                                    use_bias=False,
                                    weight_initializer=Normal(
                                        (d_model * d_kv)**-0.5 * init_factor),
                                    dtype=dtype)
        self.self_attn_k = nn.Dense(units=self._inner_dim,
                                    in_units=d_model,
                                    flatten=False,
                                    use_bias=False,
                                    weight_initializer=Normal(d_model**-0.5 *
                                                              init_factor),
                                    dtype=dtype)
        self.self_attn_v = nn.Dense(units=self._inner_dim,
                                    in_units=d_model,
                                    flatten=False,
                                    use_bias=False,
                                    weight_initializer=Normal(d_model**-0.5 *
                                                              init_factor),
                                    dtype=dtype)
        self.self_attn = MultiHeadAttentionCell(
            query_units=self._inner_dim,
            num_heads=num_heads,
            attention_dropout=dropout_prob,
            scaled=False,
            normalized=False,
            dtype=dtype,
            layout='NTK' if layout == 'NT' else 'TNK',
            use_einsum=False)
        self.self_attn_proj = nn.Dense(
            units=d_model,
            in_units=self._inner_dim,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal(self._inner_dim**-0.5 * init_factor),
            dtype=dtype)
        if is_decoder:
            self.cross_attn_layer_norm = RMSNorm(
                in_channels=d_model,
                center=False,
                scale=True,
                gamma_initializer=Constant(1.0 * init_factor),
                variance_epsilon=layer_norm_eps,
                dtype=dtype)
            # avoid scaling before softmax
            self.cross_attn_q = nn.Dense(
                units=self._inner_dim,
                in_units=d_model,
                flatten=False,
                use_bias=False,
                weight_initializer=Normal(
                    (d_model * d_kv)**-0.5 * init_factor),
                dtype=dtype)
            self.cross_attn_k = nn.Dense(units=self._inner_dim,
                                         in_units=d_model,
                                         flatten=False,
                                         use_bias=False,
                                         weight_initializer=Normal(
                                             d_model**-0.5 * init_factor),
                                         dtype=dtype)
            self.cross_attn_v = nn.Dense(units=self._inner_dim,
                                         in_units=d_model,
                                         flatten=False,
                                         use_bias=False,
                                         weight_initializer=Normal(
                                             d_model**-0.5 * init_factor),
                                         dtype=dtype)
            self.cross_attn = MultiHeadAttentionCell(
                query_units=self._inner_dim,
                num_heads=num_heads,
                attention_dropout=dropout_prob,
                scaled=False,
                normalized=False,
                dtype=dtype,
                layout='NTK' if layout == 'NT' else 'TNK',
                use_einsum=False)
            self.cross_attn_proj = nn.Dense(
                units=d_model,
                in_units=self._inner_dim,
                flatten=False,
                use_bias=False,
                weight_initializer=Normal(self._inner_dim**-0.5 * init_factor),
                dtype=dtype)
        assert activation in ['relu', 'gated-gelu'], \
            '{} is not supported. Please choose from "relu" and "gated-gelu"'.format(activation)
        # the weight_initializer here is equivalent to Normal(in_units ** -0.5 * init_factor)
        self.ffn = PositionwiseFFN(
            units=d_model,
            hidden_size=d_ff,
            use_bias=False,
            activation_dropout=dropout_prob,
            dropout=dropout_prob,
            weight_initializer=Xavier('gaussian', 'in', np.sqrt(init_factor)),
            activation='relu' if activation == 'relu' else 'gelu(tanh)',
            use_gated_activation=False if activation == 'relu' else True,
            normalization='rms_norm',
            layer_norm_eps=layer_norm_eps,
            pre_norm=True,
            dtype=dtype,
            center=False,
            scale=True,
            gamma_initializer=Constant(1.0 * init_factor))
        self.dropout = nn.Dropout(dropout_prob)
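The attention projections above follow the Mesh TensorFlow T5 initialization: the query projection is drawn from Normal((d_model * d_kv) ** -0.5), keys and values from Normal(d_model ** -0.5), and the output projection from Normal(inner_dim ** -0.5); the query initialization absorbs the usual 1/sqrt(d_kv) factor, which is why the attention cells are built with scaled=False (see the linked comment in the code). The arithmetic below simply evaluates those expressions for an assumed base-sized block (d_model=768, d_kv=64, num_heads=12, init_factor=1.0):

# Standard deviations implied by the initializers above; plain Python, no framework needed.
d_model, d_kv, num_heads, init_factor = 768, 64, 12, 1.0
inner_dim = num_heads * d_kv                          # 768

std_q    = (d_model * d_kv) ** -0.5 * init_factor     # ~0.0045  (query projection)
std_kv   = d_model ** -0.5 * init_factor              # ~0.0361  (key/value projections)
std_proj = inner_dim ** -0.5 * init_factor            # ~0.0361  (output projection)

print(std_q, std_kv, std_proj)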
Example #4
    def __init__(self, **kwargs):
        super(QANet, self).__init__(**kwargs)
        with self.name_scope():
            self.flatten = gluon.nn.Flatten()
            self.dropout = gluon.nn.Dropout(opt.layers_dropout)
            self.char_conv = ConvolutionalEncoder(
                embed_size=opt.char_emb_dim,
                num_filters=opt.char_conv_filters,
                ngram_filter_sizes=opt.char_conv_ngrams,
                conv_layer_activation=None,
                num_highway=0)

        self.highway = gluon.nn.HybridSequential()
        with self.highway.name_scope():
            self.highway.add(
                gluon.nn.Dense(units=opt.emb_encoder_conv_channels,
                               flatten=False,
                               use_bias=False,
                               weight_initializer=Xavier()))
            self.highway.add(
                Highway(input_size=opt.emb_encoder_conv_channels,
                        num_layers=opt.highway_layers,
                        activation='relu',
                        highway_bias=HighwayBias(nonlinear_transform_bias=0.0,
                                                 transform_gate_bias=0.0)))

        self.word_emb = gluon.nn.HybridSequential()
        with self.word_emb.name_scope():
            self.word_emb.add(
                gluon.nn.Embedding(input_dim=opt.word_corpus,
                                   output_dim=opt.word_emb_dim))
            self.word_emb.add(gluon.nn.Dropout(rate=opt.word_emb_dropout))
        self.char_emb = gluon.nn.HybridSequential()
        with self.char_emb.name_scope():
            self.char_emb.add(
                gluon.nn.Embedding(input_dim=opt.character_corpus,
                                   output_dim=opt.char_emb_dim,
                                   weight_initializer=Normal(sigma=0.1)))
            self.char_emb.add(gluon.nn.Dropout(rate=opt.char_emb_dropout))

        with self.name_scope():
            self.emb_encoder = Encoder(
                kernel_size=opt.emb_encoder_conv_kernerl_size,
                num_filters=opt.emb_encoder_conv_channels,
                conv_layers=opt.emb_encoder_num_conv_layers,
                num_heads=opt.emb_encoder_num_head,
                num_blocks=opt.emb_encoder_num_block)

            self.project = gluon.nn.Dense(units=opt.emb_encoder_conv_channels,
                                          flatten=False,
                                          use_bias=False,
                                          weight_initializer=Xavier())

        with self.name_scope():
            self.co_attention = CoAttention()

        with self.name_scope():
            self.model_encoder = Encoder(
                kernel_size=opt.model_encoder_conv_kernel_size,
                num_filters=opt.model_encoder_conv_channels,
                conv_layers=opt.model_encoder_conv_layers,
                num_heads=opt.model_encoder_num_head,
                num_blocks=opt.model_encoder_num_block)

        with self.name_scope():
            self.predict_begin = gluon.nn.Dense(
                units=1,
                use_bias=True,
                flatten=False,
                weight_initializer=Xavier(rnd_type='uniform',
                                          factor_type='in',
                                          magnitude=1),
                bias_initializer=Uniform(1.0 /
                                         opt.model_encoder_conv_channels))
            self.predict_end = gluon.nn.Dense(
                units=1,
                use_bias=True,
                flatten=False,
                weight_initializer=Xavier(rnd_type='uniform',
                                          factor_type='in',
                                          magnitude=1),
                bias_initializer=Uniform(1.0 /
                                         opt.model_encoder_conv_channels))
Example #5
def net_initialize(net,
                   model_ctx,
                   initializer: (str, Initializer, dict,
                                 list) = mx.init.Xavier(),
                   select=None,
                   logger=logging,
                   verbose=False,
                   force_reinit=False):
    """
    Initialize network parameters.

    Parameters
    ----------
    net : gluon.Block
        The network whose parameters are to be initialized.
    model_ctx : mx.cpu() or mx.gpu()
        Context on which the parameters are initialized.
    initializer : str, Initializer, dict, list or tuple
        A single initializer, a mapping from parameter selector to initializer,
        or a sequence of initializers / (selector, initializer) pairs.
    select : str, optional
        Regex passed to net.collect_params to select the parameters to initialize.
    logger : logging.Logger or module, default logging
    verbose : bool, default False
        Whether to verbosely print out details on initialization.
    force_reinit : bool, default False
        Whether to force re-initialization if parameter is already initialized.
    Notes
    ------
    Developers who modify this function should also update the related function in glue.

    Examples
    --------
    >>> import mxnet as mx
    >>> from mxnet import gluon
    >>> emb = gluon.nn.Embedding(2, 3)
    >>> net_initialize(emb, mx.cpu())
    >>> emb.weight.data()
    <BLANKLINE>
    [[0.10694504 0.2034123  0.4714563 ]
     [0.7542485  0.2251432  0.7842196 ]]
    <NDArray 2x3 @cpu(0)>
    >>> emb1 = gluon.nn.Embedding(2, 3)
    >>> net_initialize(emb1, mx.cpu(), initializer=mx.init.Xavier())
    >>> emb1.weight.data()
    <BLANKLINE>
    [[ 0.09833419  0.76079047 -0.16726398]
     [ 0.27071452  0.319638   -0.25330698]]
    <NDArray 2x3 @cpu(0)>
    >>> class EmbNet(gluon.nn.HybridBlock):
    ...     def __init__(self, prefix=None, params=None):
    ...         super(EmbNet, self).__init__(prefix, params)
    ...         with self.name_scope():
    ...             self.emb = gluon.nn.Embedding(2, 3)
    ...             self.linear = gluon.nn.Dense(4)
    ...     def hybrid_forward(self, F, x, *args, **kwargs):
    ...         return self.linear(self.emb(x))
    >>> net = EmbNet()
    >>> from longling.ML.DL import BLOCK_EMBEDDING
    >>> net_initialize(net, mx.cpu(), initializer={BLOCK_EMBEDDING: "xaiver", ".*embedding": "uniform"})
    >>> net(mx.nd.array([0, 1]))
    <BLANKLINE>
    [[ 0.03268543 -0.00860071  0.04774952  0.00056277]
     [-0.00648303 -0.03121923 -0.04578817 -0.08059631]]
    <NDArray 2x4 @cpu(0)>
    >>> net1 = EmbNet()
    >>> net_initialize(net1, mx.cpu(), initializer=["xaiver", "uniform"], select=[BLOCK_EMBEDDING, ".*embedding"])
    >>> net1(mx.nd.array([0, 1]))  # doctest: +ELLIPSIS
    <BLANKLINE>
    [[-0.0896... -0.0179... -0.0156... -0.0136...]
     [ 0.0033...  0.0255...  0.0111...  0.0446...]]
    <NDArray 2x4 @cpu(0)>
    >>> net_initialize(net1, mx.cpu(), initializer=[(BLOCK_EMBEDDING, "xaiver"), (".*embedding", "uniform")],
    ...     force_reinit=True)
    >>> net1(mx.nd.array([0, 1]))  # doctest: +ELLIPSIS
    <BLANKLINE>
    [[ 0.0153...  0.0266... -0.0466...  0.0291...]
     [-0.0362...  0.0063...  0.0227... -0.0212...]]
    <NDArray 2x4 @cpu(0)>
    """
    if isinstance(initializer, str):
        initializer = {
            "xaiver": Xavier(),
            "uniform": Uniform(),
            "normal": Normal()
        }[initializer]
    elif isinstance(initializer, dict):
        for _select, _initializer in initializer.items():
            net_initialize(net,
                           model_ctx=model_ctx,
                           initializer=_initializer,
                           select=_select,
                           logger=logger,
                           verbose=verbose,
                           force_reinit=force_reinit)
        return
    elif isinstance(initializer, (list, tuple)):
        if select is not None:
            assert len(select) == len(initializer)
            for _select, _initializer in zip(select, initializer):
                net_initialize(net,
                               model_ctx=model_ctx,
                               initializer=_initializer,
                               select=_select,
                               logger=logger,
                               verbose=verbose,
                               force_reinit=force_reinit)
        else:
            for _select, _initializer in initializer:
                net_initialize(net,
                               model_ctx=model_ctx,
                               initializer=_initializer,
                               select=_select,
                               logger=logger,
                               verbose=verbose,
                               force_reinit=force_reinit)
        return
    elif initializer is None or isinstance(initializer, Initializer):
        pass
    else:
        raise TypeError(
            "initializer should be a str, dict, list, tuple or Initializer, "
            "but got %s" % type(initializer))

    logger.info("initializer: %s, select: %s, ctx: %s" %
                (initializer, select, model_ctx))
    net.collect_params(select).initialize(initializer,
                                          ctx=model_ctx,
                                          verbose=verbose,
                                          force_reinit=force_reinit)
Example #6
    def __init__(self, **kwargs):
        super(QANet, self).__init__(**kwargs)
        with self.name_scope():
            self.flatten = gluon.nn.Flatten()
            self.dropout = gluon.nn.Dropout(LAYERS_DROPOUT)
            self.char_conv = gluon.nn.Conv1D(
                channels=EMB_ENCODER_CONV_CHANNELS,
                kernel_size=5,
                activation='relu',
                weight_initializer=MSRAPrelu(),
                use_bias=True,
                padding=5 // 2
            )

        self.highway = gluon.nn.HybridSequential()
        with self.highway.name_scope():
            self.highway.add(
                gluon.nn.Dense(
                    units=EMB_ENCODER_CONV_CHANNELS,
                    flatten=False,
                    use_bias=False,
                    weight_initializer=Xavier()
                )
            )
            self.highway.add(
                Highway(
                    input_size=EMB_ENCODER_CONV_CHANNELS,
                    num_layers=NUM_HIGHWAY_LAYERS
                )
            )

        self.word_emb = gluon.nn.HybridSequential()
        with self.word_emb.name_scope():
            self.word_emb.add(
                gluon.nn.Embedding(
                    input_dim=CORPUS_WORDS,
                    output_dim=DIM_WORD_EMBED
                )
            )
            self.word_emb.add(
                gluon.nn.Dropout(rate=WORD_EMBEDDING_DROPOUT)
            )
        self.char_emb = gluon.nn.HybridSequential()
        with self.char_emb.name_scope():
            self.char_emb.add(
                gluon.nn.Embedding(
                    input_dim=CORPUS_CHARACTERS,
                    output_dim=DIM_CHAR_EMBED,
                    weight_initializer=Normal(sigma=0.1)
                )
            )
            self.char_emb.add(
                gluon.nn.Dropout(rate=CHAR_EMBEDDING_DROPOUT)
            )

        with self.name_scope():
            self.emb_encoder = Encoder(
                kernel_size=EMB_ENCODER_CONV_KERNEL_SIZE,
                num_filters=EMB_ENCODER_CONV_CHANNELS,
                conv_layers=EMB_ENCODER_NUM_CONV_LAYERS,
                num_heads=EMB_ENCODER_NUM_HEAD,
                num_blocks=EMB_ENCODER_NUM_BLOCK
            )

            self.project = gluon.nn.Dense(
                units=EMB_ENCODER_CONV_CHANNELS,
                flatten=False,
                use_bias=False,
                weight_initializer=Xavier()
            )

        with self.name_scope():
            self.co_attention = CoAttention()

        with self.name_scope():
            self.model_encoder = Encoder(
                kernel_size=MODEL_ENCODER_CONV_KERNEL_SIZE,
                num_filters=MODEL_ENCODER_CONV_CHANNELS,
                conv_layers=MODEL_ENCODER_NUM_CONV_LAYERS,
                num_heads=MODEL_ENCODER_NUM_HEAD,
                num_blocks=MODEL_ENCODER_NUM_BLOCK
            )

        with self.name_scope():
            self.predict_begin = gluon.nn.Dense(
                units=1,
                use_bias=True,
                flatten=False,
                weight_initializer=Xavier(
                    rnd_type='uniform', factor_type='in', magnitude=1),
                bias_initializer=Uniform(1.0/MODEL_ENCODER_CONV_CHANNELS)
            )
            self.predict_end = gluon.nn.Dense(
                units=1,
                use_bias=True,
                flatten=False,
                weight_initializer=Xavier(
                    rnd_type='uniform', factor_type='in', magnitude=1),
                bias_initializer=Uniform(1.0/MODEL_ENCODER_CONV_CHANNELS)
            )
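In both QANet variants, the answer-span heads use Xavier initialization with rnd_type='uniform', factor_type='in', magnitude=1 for the weights and a Uniform initializer scaled by 1/channels for the bias. A standalone sketch of the same initializer combination on a plain Dense layer (the 128-channel width is an arbitrary stand-in, not a value from the code above):

# Minimal standalone sketch, not part of QANet; assumes MXNet 1.x with the Gluon API.
import mxnet as mx
from mxnet import gluon
from mxnet.initializer import Uniform, Xavier

dense = gluon.nn.Dense(units=1,
                       use_bias=True,
                       flatten=False,
                       weight_initializer=Xavier(rnd_type='uniform',
                                                 factor_type='in',
                                                 magnitude=1),
                       bias_initializer=Uniform(1.0 / 128))  # 128 stands in for the channel count
dense.initialize(ctx=mx.cpu())
out = dense(mx.nd.random.uniform(shape=(2, 4, 128)))  # (batch, seq_len, channels) -> (2, 4, 1)
print(out.shape)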