def __init__(self, vocab_size=32128, d_model=768, d_kv=64, d_ff=3072, num_layers=12,
             num_heads=12, dropout_prob=0.1, layer_norm_eps=1E-6, activation='relu',
             init_factor=1.0, layout='NT', dtype='float32'):
    super().__init__()
    assert vocab_size > 0, 'Vocab size {} is not valid.'.format(vocab_size)
    self._vocab_size = vocab_size
    self._d_model = d_model
    self._d_kv = d_kv
    self._d_ff = d_ff
    self._num_layers = num_layers
    self._num_heads = num_heads
    self._inner_dim = num_heads * d_kv
    self._activation = activation
    self._init_factor = init_factor
    self._dtype = dtype
    assert layout in ['TN', 'NT'], \
        'Invalid layout: {}. Only "TN" and "NT" are supported.'.format(layout)
    self._layout = layout
    self._time_axis = 1 if self.layout == 'NT' else 0
    # input embedding weights are shared between the encoder and decoder
    self.input_embedding_layer = nn.Embedding(
        input_dim=vocab_size,
        output_dim=d_model,
        weight_initializer=Normal(1.0 * init_factor),
        dtype=dtype)
    self.encoder = T5Encoder(
        d_model=d_model,
        d_kv=d_kv,
        d_ff=d_ff,
        num_layers=num_layers,
        num_heads=num_heads,
        dropout_prob=dropout_prob,
        layer_norm_eps=layer_norm_eps,
        activation=activation,
        init_factor=init_factor,
        layout=layout,
        dtype=dtype)
    self.decoder = T5Decoder(
        d_model=d_model,
        d_kv=d_kv,
        d_ff=d_ff,
        num_layers=num_layers,
        num_heads=num_heads,
        dropout_prob=dropout_prob,
        layer_norm_eps=layer_norm_eps,
        activation=activation,
        init_factor=init_factor,
        layout=layout,
        dtype=dtype)
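# --- Usage sketch (not part of the original constructor) ---
# A minimal, hedged example that assumes this __init__ belongs to the GluonNLP-style
# T5Model HybridBlock it was taken from, with T5Encoder/T5Decoder defined alongside it,
# and that MXNet 2.x with the numpy interface is installed. It only exercises a piece
# created above (the shared input embedding), so the shapes follow from layout='NT'.
import mxnet as mx

model = T5Model(vocab_size=32128, d_model=768, d_kv=64, d_ff=3072,
                num_layers=12, num_heads=12, layout='NT')
model.initialize(ctx=mx.cpu())
# With layout='NT' the time axis is 1, so token ids come in as (batch, seq_len);
# with layout='TN' the axes swap and the time axis is 0.
tokens = mx.np.ones((2, 8), dtype='int32')
emb = model.input_embedding_layer(tokens)   # (2, 8, 768) under 'NT'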
def __init__(self, d_model, d_kv, d_ff, num_layers=12, num_heads=12, dropout_prob=0.1,
             layer_norm_eps=1E-6, activation='relu', init_factor=1.0, layout='NT',
             dtype='float32'):
    super().__init__()
    self._d_model = d_model
    self._d_kv = d_kv
    self._d_ff = d_ff
    self._num_layers = num_layers
    self._num_heads = num_heads
    self._inner_dim = num_heads * d_kv
    self._dtype = dtype
    assert layout in ['TN', 'NT'], \
        'Invalid layout: {}. Only "TN" and "NT" are supported.'.format(layout)
    self._layout = layout
    self._time_axis = 1 if self.layout == 'NT' else 0
    self.relative_position_encoder = RelAttentionScoreCell(
        query_units=self._inner_dim,
        num_heads=num_heads,
        method='t5',
        bidirectional=False,
        embed_initializer=Normal(d_model**-0.5 * init_factor),
        layout='NTK' if layout == 'NT' else 'TNK',
        dtype=dtype)
    self.layers = nn.HybridSequential()
    for _ in range(num_layers):
        self.layers.add(
            T5Block(d_model=d_model,
                    d_kv=d_kv,
                    d_ff=d_ff,
                    is_decoder=True,
                    num_heads=num_heads,
                    dropout_prob=dropout_prob,
                    layer_norm_eps=layer_norm_eps,
                    activation=activation,
                    init_factor=init_factor,
                    layout=layout,
                    dtype=dtype))
    self.final_layer_norm = RMSNorm(
        in_channels=d_model,
        center=False,
        scale=True,
        gamma_initializer=Constant(1.0 * init_factor),
        variance_epsilon=layer_norm_eps,
        dtype=dtype)
    self.dropout = nn.Dropout(dropout_prob)
def __init__(self, d_model, d_kv, d_ff, is_decoder, num_heads=12, dropout_prob=0.1,
             layer_norm_eps=1E-6, activation='relu', init_factor=1.0, layout='NT',
             dtype='float32'):
    super().__init__()
    self._d_model = d_model
    self._d_kv = d_kv
    self._d_ff = d_ff
    self._is_decoder = is_decoder
    self._num_heads = num_heads
    self._inner_dim = self._num_heads * self._d_kv
    self._dtype = dtype
    assert layout in ['TN', 'NT'], \
        'Invalid layout: {}. Only "TN" and "NT" are supported.'.format(layout)
    self._layout = layout
    self._time_axis = 1 if self.layout == 'NT' else 0
    self.self_attn_layer_norm = RMSNorm(
        in_channels=d_model,
        center=False,
        scale=True,
        gamma_initializer=Constant(1.0 * init_factor),
        variance_epsilon=layer_norm_eps,
        dtype=dtype)
    # avoid scaling before softmax
    # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
    self.self_attn_q = nn.Dense(
        units=self._inner_dim,
        in_units=d_model,
        flatten=False,
        use_bias=False,
        weight_initializer=Normal((d_model * d_kv)**-0.5 * init_factor),
        dtype=dtype)
    self.self_attn_k = nn.Dense(
        units=self._inner_dim,
        in_units=d_model,
        flatten=False,
        use_bias=False,
        weight_initializer=Normal(d_model**-0.5 * init_factor),
        dtype=dtype)
    self.self_attn_v = nn.Dense(
        units=self._inner_dim,
        in_units=d_model,
        flatten=False,
        use_bias=False,
        weight_initializer=Normal(d_model**-0.5 * init_factor),
        dtype=dtype)
    self.self_attn = MultiHeadAttentionCell(
        query_units=self._inner_dim,
        num_heads=num_heads,
        attention_dropout=dropout_prob,
        scaled=False,
        normalized=False,
        dtype=dtype,
        layout='NTK' if layout == 'NT' else 'TNK',
        use_einsum=False)
    self.self_attn_proj = nn.Dense(
        units=d_model,
        in_units=self._inner_dim,
        flatten=False,
        use_bias=False,
        weight_initializer=Normal(self._inner_dim**-0.5 * init_factor),
        dtype=dtype)
    if is_decoder:
        self.cross_attn_layer_norm = RMSNorm(
            in_channels=d_model,
            center=False,
            scale=True,
            gamma_initializer=Constant(1.0 * init_factor),
            variance_epsilon=layer_norm_eps,
            dtype=dtype)
        # avoid scaling before softmax
        self.cross_attn_q = nn.Dense(
            units=self._inner_dim,
            in_units=d_model,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal((d_model * d_kv)**-0.5 * init_factor),
            dtype=dtype)
        self.cross_attn_k = nn.Dense(
            units=self._inner_dim,
            in_units=d_model,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal(d_model**-0.5 * init_factor),
            dtype=dtype)
        self.cross_attn_v = nn.Dense(
            units=self._inner_dim,
            in_units=d_model,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal(d_model**-0.5 * init_factor),
            dtype=dtype)
        self.cross_attn = MultiHeadAttentionCell(
            query_units=self._inner_dim,
            num_heads=num_heads,
            attention_dropout=dropout_prob,
            scaled=False,
            normalized=False,
            dtype=dtype,
            layout='NTK' if layout == 'NT' else 'TNK',
            use_einsum=False)
        self.cross_attn_proj = nn.Dense(
            units=d_model,
            in_units=self._inner_dim,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal(self._inner_dim**-0.5 * init_factor),
            dtype=dtype)
    assert activation in ['relu', 'gated-gelu'], \
        '{} is not supported. Please choose from "relu" and "gated-gelu"'.format(activation)
    # the weight_initializer here is equivalent to Normal(in_units ** -0.5 * init_factor)
    self.ffn = PositionwiseFFN(
        units=d_model,
        hidden_size=d_ff,
        use_bias=False,
        activation_dropout=dropout_prob,
        dropout=dropout_prob,
        weight_initializer=Xavier('gaussian', 'in', np.sqrt(init_factor)),
        activation='relu' if activation == 'relu' else 'gelu(tanh)',
        use_gated_activation=False if activation == 'relu' else True,
        normalization='rms_norm',
        layer_norm_eps=layer_norm_eps,
        pre_norm=True,
        dtype=dtype,
        center=False,
        scale=True,
        gamma_initializer=Constant(1.0 * init_factor))
    self.dropout = nn.Dropout(dropout_prob)
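# --- Construction sketch (not part of the original block) ---
# Hedged illustration of how the 'activation' switch above is wired: 'relu' keeps a
# plain ReLU feed-forward block, while 'gated-gelu' selects the gated gelu(tanh)
# variant of PositionwiseFFN. Both calls assume the full T5Block class this
# __init__ was taken from.
encoder_block = T5Block(d_model=512, d_kv=64, d_ff=2048, is_decoder=False,
                        num_heads=8, activation='relu', layout='NT')
decoder_block = T5Block(d_model=512, d_kv=64, d_ff=2048, is_decoder=True,
                        num_heads=8, activation='gated-gelu', layout='NT')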
def __init__(self, **kwargs):
    super(QANet, self).__init__(**kwargs)
    with self.name_scope():
        self.flatten = gluon.nn.Flatten()
        self.dropout = gluon.nn.Dropout(opt.layers_dropout)
        self.char_conv = ConvolutionalEncoder(
            embed_size=opt.char_emb_dim,
            num_filters=opt.char_conv_filters,
            ngram_filter_sizes=opt.char_conv_ngrams,
            conv_layer_activation=None,
            num_highway=0)
    self.highway = gluon.nn.HybridSequential()
    with self.highway.name_scope():
        self.highway.add(
            gluon.nn.Dense(units=opt.emb_encoder_conv_channels,
                           flatten=False,
                           use_bias=False,
                           weight_initializer=Xavier()))
        self.highway.add(
            Highway(input_size=opt.emb_encoder_conv_channels,
                    num_layers=opt.highway_layers,
                    activation='relu',
                    highway_bias=HighwayBias(nonlinear_transform_bias=0.0,
                                             transform_gate_bias=0.0)))
    self.word_emb = gluon.nn.HybridSequential()
    with self.word_emb.name_scope():
        self.word_emb.add(
            gluon.nn.Embedding(input_dim=opt.word_corpus,
                               output_dim=opt.word_emb_dim))
        self.word_emb.add(gluon.nn.Dropout(rate=opt.word_emb_dropout))
    self.char_emb = gluon.nn.HybridSequential()
    with self.char_emb.name_scope():
        self.char_emb.add(
            gluon.nn.Embedding(input_dim=opt.character_corpus,
                               output_dim=opt.char_emb_dim,
                               weight_initializer=Normal(sigma=0.1)))
        self.char_emb.add(gluon.nn.Dropout(rate=opt.char_emb_dropout))
    with self.name_scope():
        self.emb_encoder = Encoder(
            kernel_size=opt.emb_encoder_conv_kernerl_size,
            num_filters=opt.emb_encoder_conv_channels,
            conv_layers=opt.emb_encoder_num_conv_layers,
            num_heads=opt.emb_encoder_num_head,
            num_blocks=opt.emb_encoder_num_block)
        self.project = gluon.nn.Dense(units=opt.emb_encoder_conv_channels,
                                      flatten=False,
                                      use_bias=False,
                                      weight_initializer=Xavier())
    with self.name_scope():
        self.co_attention = CoAttention()
    with self.name_scope():
        self.model_encoder = Encoder(
            kernel_size=opt.model_encoder_conv_kernel_size,
            num_filters=opt.model_encoder_conv_channels,
            conv_layers=opt.model_encoder_conv_layers,
            num_heads=opt.model_encoder_num_head,
            num_blocks=opt.model_encoder_num_block)
    with self.name_scope():
        self.predict_begin = gluon.nn.Dense(
            units=1,
            use_bias=True,
            flatten=False,
            weight_initializer=Xavier(rnd_type='uniform',
                                      factor_type='in',
                                      magnitude=1),
            bias_initializer=Uniform(1.0 / opt.model_encoder_conv_channels))
        self.predict_end = gluon.nn.Dense(
            units=1,
            use_bias=True,
            flatten=False,
            weight_initializer=Xavier(rnd_type='uniform',
                                      factor_type='in',
                                      magnitude=1),
            bias_initializer=Uniform(1.0 / opt.model_encoder_conv_channels))
def net_initialize(net, model_ctx, initializer: (str, Initializer, dict, list) = mx.init.Xavier(),
                   select=None, logger=logging, verbose=False, force_reinit=False):
    """
    Initialize the network parameters.

    Parameters
    ----------
    net
        The gluon block whose parameters will be initialized.
    model_ctx: mx.cpu or mx.gpu
    initializer: str, Initializer, dict, list or tuple
    select
        Pattern passed to ``collect_params`` to pick which parameters to initialize.
    logger
    verbose : bool, default False
        Whether to verbosely print out details on initialization.
    force_reinit : bool, default False
        Whether to force re-initialization if parameter is already initialized.

    Notes
    ------
    Developers who modify this function should simultaneously modify the related function in glue.

    Examples
    --------
    >>> import mxnet as mx
    >>> from mxnet import gluon
    >>> emb = gluon.nn.Embedding(2, 3)
    >>> net_initialize(emb, mx.cpu())
    >>> emb.weight.data()
    <BLANKLINE>
    [[0.10694504 0.2034123  0.4714563 ]
     [0.7542485  0.2251432  0.7842196 ]]
    <NDArray 2x3 @cpu(0)>
    >>> emb1 = gluon.nn.Embedding(2, 3)
    >>> net_initialize(emb1, mx.cpu(), initializer=mx.init.Xavier())
    >>> emb1.weight.data()
    <BLANKLINE>
    [[ 0.09833419  0.76079047 -0.16726398]
     [ 0.27071452  0.319638   -0.25330698]]
    <NDArray 2x3 @cpu(0)>
    >>> class EmbNet(gluon.nn.HybridBlock):
    ...     def __init__(self, prefix=None, params=None):
    ...         super(EmbNet, self).__init__(prefix, params)
    ...         with self.name_scope():
    ...             self.emb = gluon.nn.Embedding(2, 3)
    ...             self.linear = gluon.nn.Dense(4)
    ...     def hybrid_forward(self, F, x, *args, **kwargs):
    ...         return self.linear(self.emb(x))
    >>> net = EmbNet()
    >>> from longling.ML.DL import BLOCK_EMBEDDING
    >>> net_initialize(net, mx.cpu(), initializer={BLOCK_EMBEDDING: "xaiver", ".*embedding": "uniform"})
    >>> net(mx.nd.array([0, 1]))
    <BLANKLINE>
    [[ 0.03268543 -0.00860071  0.04774952  0.00056277]
     [-0.00648303 -0.03121923 -0.04578817 -0.08059631]]
    <NDArray 2x4 @cpu(0)>
    >>> net1 = EmbNet()
    >>> net_initialize(net1, mx.cpu(), initializer=["xaiver", "uniform"], select=[BLOCK_EMBEDDING, ".*embedding"])
    >>> net1(mx.nd.array([0, 1]))  # doctest: +ELLIPSIS
    <BLANKLINE>
    [[-0.0896... -0.0179... -0.0156... -0.0136...]
     [ 0.0033...  0.0255...  0.0111...  0.0446...]]
    <NDArray 2x4 @cpu(0)>
    >>> net_initialize(net1, mx.cpu(), initializer=[(BLOCK_EMBEDDING, "xaiver"), (".*embedding", "uniform")],
    ...                force_reinit=True)
    >>> net1(mx.nd.array([0, 1]))  # doctest: +ELLIPSIS
    <BLANKLINE>
    [[ 0.0153...  0.0266... -0.0466...  0.0291...]
     [-0.0362...  0.0063...  0.0227... -0.0212...]]
    <NDArray 2x4 @cpu(0)>
    """
    if isinstance(initializer, str):
        # map a string alias to a concrete Initializer ("xaiver" is the historical key)
        initializer = {
            "xaiver": Xavier(),
            "uniform": Uniform(),
            "normal": Normal()
        }[initializer]
    elif isinstance(initializer, dict):
        # {select_pattern: initializer} pairs: recurse once per pattern
        for _select, _initializer in initializer.items():
            net_initialize(net, model_ctx=model_ctx, initializer=_initializer, select=_select,
                           logger=logger, verbose=verbose, force_reinit=force_reinit)
        return
    elif isinstance(initializer, (list, tuple)):
        if select is not None:
            # parallel sequences of patterns and initializers
            assert len(select) == len(initializer)
            for _select, _initializer in zip(select, initializer):
                net_initialize(net, model_ctx=model_ctx, initializer=_initializer, select=_select,
                               logger=logger, verbose=verbose, force_reinit=force_reinit)
        else:
            # sequence of (select_pattern, initializer) pairs
            for _select, _initializer in initializer:
                net_initialize(net, model_ctx=model_ctx, initializer=_initializer, select=_select,
                               logger=logger, verbose=verbose, force_reinit=force_reinit)
        return
    elif initializer is None or isinstance(initializer, Initializer):
        pass
    else:
        raise TypeError(
            "initializer should be str, Initializer, dict, list or tuple, now is %s" % type(initializer))
    logger.info("initializer: %s, select: %s, ctx: %s" % (initializer, select, model_ctx))
    net.collect_params(select).initialize(initializer, ctx=model_ctx,
                                          verbose=verbose, force_reinit=force_reinit)
def __init__(self, **kwargs):
    super(QANet, self).__init__(**kwargs)
    with self.name_scope():
        self.flatten = gluon.nn.Flatten()
        self.dropout = gluon.nn.Dropout(LAYERS_DROPOUT)
        self.char_conv = gluon.nn.Conv1D(
            channels=EMB_ENCODER_CONV_CHANNELS,
            kernel_size=5,
            activation='relu',
            weight_initializer=MSRAPrelu(),
            use_bias=True,
            padding=5 // 2)
    self.highway = gluon.nn.HybridSequential()
    with self.highway.name_scope():
        self.highway.add(
            gluon.nn.Dense(units=EMB_ENCODER_CONV_CHANNELS,
                           flatten=False,
                           use_bias=False,
                           weight_initializer=Xavier()))
        self.highway.add(
            Highway(input_size=EMB_ENCODER_CONV_CHANNELS,
                    num_layers=NUM_HIGHWAY_LAYERS))
    self.word_emb = gluon.nn.HybridSequential()
    with self.word_emb.name_scope():
        self.word_emb.add(
            gluon.nn.Embedding(input_dim=CORPUS_WORDS,
                               output_dim=DIM_WORD_EMBED))
        self.word_emb.add(gluon.nn.Dropout(rate=WORD_EMBEDDING_DROPOUT))
    self.char_emb = gluon.nn.HybridSequential()
    with self.char_emb.name_scope():
        self.char_emb.add(
            gluon.nn.Embedding(input_dim=CORPUS_CHARACTERS,
                               output_dim=DIM_CHAR_EMBED,
                               weight_initializer=Normal(sigma=0.1)))
        self.char_emb.add(gluon.nn.Dropout(rate=CHAR_EMBEDDING_DROPOUT))
    with self.name_scope():
        self.emb_encoder = Encoder(
            kernel_size=EMB_ENCODER_CONV_KERNEL_SIZE,
            num_filters=EMB_ENCODER_CONV_CHANNELS,
            conv_layers=EMB_ENCODER_NUM_CONV_LAYERS,
            num_heads=EMB_ENCODER_NUM_HEAD,
            num_blocks=EMB_ENCODER_NUM_BLOCK)
        self.project = gluon.nn.Dense(
            units=EMB_ENCODER_CONV_CHANNELS,
            flatten=False,
            use_bias=False,
            weight_initializer=Xavier())
    with self.name_scope():
        self.co_attention = CoAttention()
    with self.name_scope():
        self.model_encoder = Encoder(
            kernel_size=MODEL_ENCODER_CONV_KERNEL_SIZE,
            num_filters=MODEL_ENCODER_CONV_CHANNELS,
            conv_layers=MODEL_ENCODER_NUM_CONV_LAYERS,
            num_heads=MODEL_ENCODER_NUM_HEAD,
            num_blocks=MODEL_ENCODER_NUM_BLOCK)
    with self.name_scope():
        self.predict_begin = gluon.nn.Dense(
            units=1,
            use_bias=True,
            flatten=False,
            weight_initializer=Xavier(rnd_type='uniform',
                                      factor_type='in',
                                      magnitude=1),
            bias_initializer=Uniform(1.0 / MODEL_ENCODER_CONV_CHANNELS))
        self.predict_end = gluon.nn.Dense(
            units=1,
            use_bias=True,
            flatten=False,
            weight_initializer=Xavier(rnd_type='uniform',
                                      factor_type='in',
                                      magnitude=1),
            bias_initializer=Uniform(1.0 / MODEL_ENCODER_CONV_CHANNELS))
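# --- Shape sketch for the character convolution (not part of the original class) ---
# Standalone, hedged check of the char_conv choice above: kernel_size=5 with
# padding=5//2=2 keeps the character-sequence length unchanged, so only the channel
# dimension changes. The sizes below are illustrative stand-ins for
# EMB_ENCODER_CONV_CHANNELS and DIM_CHAR_EMBED.
import mxnet as mx
from mxnet import gluon
from mxnet.initializer import MSRAPrelu

char_conv = gluon.nn.Conv1D(channels=128, kernel_size=5, padding=5 // 2,
                            activation='relu', use_bias=True,
                            weight_initializer=MSRAPrelu())
char_conv.initialize()
x = mx.nd.random.normal(shape=(2, 64, 16))   # (batch, char_emb_dim, char_seq_len), layout NCW
print(char_conv(x).shape)                    # (2, 128, 16): sequence length preserved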