def prepare_encoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_pad_idx,
                    src_max_len,
                    dropout=0.,
                    pos_pad_idx=0,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the encoder stacks.
    """
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=src_pad_idx,
        param_attr=fluid.initializer.Normal(0., 1.))
    src_pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        padding_idx=pos_pad_idx,
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
    enc_input = src_word_emb + src_pos_enc

    # FIXME(guosheng): Decouple the program desc with batch_size.
    enc_input = layers.reshape(
        x=enc_input, shape=[batch_size, -1, src_emb_dim])
    return layers.dropout(
        enc_input, dropout_prob=dropout,
        is_test=False) if dropout else enc_input
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
    """
    Scaled Dot-Product Attention
    """

    # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.
    # The current implementation of softmax_op only supports 2D tensors,
    # so it cannot be used directly here. As for reshape_op, the shape of
    # `product` inferred at compile time is not the actual run-time shape,
    # so it cannot be used to set the attribute of reshape_op either.
    # Hence the softmax is defined here as a temporary workaround.
    def __softmax(x, eps=1e-9):
        exp_out = layers.exp(x=x)
        sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
        return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

    scaled_q = layers.scale(x=q, scale=d_model**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
    if dropout_rate:
        weights = layers.dropout(
            weights, dropout_prob=dropout_rate, is_test=False)
    out = layers.matmul(weights, v)
    return out
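# Note: the custom __softmax above broadcasts the last-axis sums back over the
# scores via elementwise_div(axis=0). A minimal NumPy sketch of the same
# computation (illustration only, not part of the model code; it uses
# keepdims=True instead of axis=0 broadcasting):
import numpy as np

def softmax_last_axis(x):
    exp_out = np.exp(x)                            # like layers.exp
    sum_out = exp_out.sum(axis=-1, keepdims=True)  # like layers.reduce_sum
    return exp_out / sum_out                       # like layers.elementwise_div

scores = np.random.rand(2, 4, 5)                   # e.g. [batch, query_len, key_len]
weights = softmax_last_axis(scores)
assert np.allclose(weights.sum(axis=-1), 1.0)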
def call(self, step_input, states):
    new_lstm_states = []
    for i in range(self.num_layers):
        out, new_lstm_state = self.lstm_cells[i](step_input, states[i])
        step_input = layers.dropout(
            out, self.dropout_prob) if self.dropout_prob else out
        new_lstm_states.append(new_lstm_state)
    return step_input, new_lstm_states
def dropout(input):
    """ dropout """
    dropout_rate = 0.5
    return layers.dropout(
        input,
        dropout_prob=dropout_rate,
        dropout_implementation="upscale_in_train",
        is_test=False)
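# Note on "upscale_in_train": kept activations are rescaled by 1 / (1 - p) at
# training time, so inference is an identity mapping; the default
# "downgrade_in_infer" instead scales the output by (1 - p) at inference.
# A rough NumPy sketch of the training-time behaviour (illustration only; the
# real kernel differs in details such as seeding and mask generation):
import numpy as np

def dropout_upscale_in_train(x, p, training=True):
    if not training or p == 0.0:
        return x                              # inference: identity mapping
    mask = (np.random.rand(*x.shape) >= p)    # keep each unit with prob 1 - p
    return x * mask / (1.0 - p)               # rescale so the expectation matches x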
def _gen_dec_input(self, trg_word, trg_pos, trg_slf_attn_bias,
                   trg_src_words_attn_bias, trg_src_sents_attn_bias,
                   graph_attn_bias):
    emb_out = fluid.layers.embedding(
        input=trg_word,
        size=[self.voc_size, self._emb_size],
        padding_idx=self._padding_idx,  # set embedding of pad to 0
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._word_emb_name, initializer=self._param_initializer),
        is_sparse=False)
    emb_out = layers.scale(x=emb_out, scale=self._emb_size**0.5)

    position_emb_out = fluid.layers.embedding(
        input=trg_pos,
        size=[self._max_position_seq_len, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._dec_word_pos_emb_name, trainable=False))
    position_emb_out.stop_gradient = True
    emb_out = emb_out + position_emb_out

    emb_out = layers.dropout(
        emb_out,
        dropout_prob=self._prepostprocess_dropout,
        dropout_implementation="upscale_in_train",
        is_test=False) if self._prepostprocess_dropout else emb_out

    if self._dtype == "float16":
        emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
        if trg_slf_attn_bias is not None:
            trg_slf_attn_bias = fluid.layers.cast(
                x=trg_slf_attn_bias, dtype=self._dtype)
        if trg_src_words_attn_bias is not None:
            trg_src_words_attn_bias = fluid.layers.cast(
                x=trg_src_words_attn_bias, dtype=self._dtype)
        if trg_src_sents_attn_bias is not None:
            trg_src_sents_attn_bias = fluid.layers.cast(
                x=trg_src_sents_attn_bias, dtype=self._dtype)
        if graph_attn_bias is not None:
            graph_attn_bias = fluid.layers.cast(
                x=graph_attn_bias, dtype=self._dtype)

    res = namedtuple('results', [
        'emb_out', 'trg_slf_attn_bias', 'trg_src_words_attn_bias',
        'trg_src_sents_attn_bias', 'graph_attn_bias'
    ])
    return res(
        emb_out=emb_out,
        trg_slf_attn_bias=trg_slf_attn_bias,
        trg_src_words_attn_bias=trg_src_words_attn_bias,
        trg_src_sents_attn_bias=trg_src_sents_attn_bias,
        graph_attn_bias=graph_attn_bias)
def get_single_direction_output(rnn_input,
                                encode_hidden,
                                unit_list,
                                mask=None,
                                direc_index=0):
    rnn = StaticRNN()
    #print(rnn_input.shape)
    with rnn.step():
        step_input = rnn.step_input(rnn_input)

        if mask:
            step_mask = rnn.step_input(mask)

        for i in range(num_layers):
            if init_hidden:
                pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
            else:
                pre_hidden = rnn.memory(
                    batch_ref=rnn_input,
                    shape=[-1, hidden_size],
                    ref_batch_dim_idx=1)
            encode_h = encode_hidden[i]
            pre_encode_hidden = layers.concat([pre_hidden, encode_h], axis=1)
            new_hidden = unit_list[i](step_input, pre_encode_hidden)

            if mask:
                new_hidden = layers.elementwise_mul(
                    new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                        pre_hidden, (step_mask - 1), axis=0)
            rnn.update_memory(pre_hidden, new_hidden)
            rnn.step_output(new_hidden)

            step_input = new_hidden
            if dropout_prob is not None and dropout_prob > 0.0:
                step_input = layers.dropout(
                    step_input, dropout_prob=dropout_prob)

        rnn.step_output(step_input)

    rnn_out = rnn()

    last_hidden_array = []
    all_hidden_array = []  # added to collect the hidden states of all layers
    rnn_output = rnn_out[-1]
    for i in range(num_layers):
        last_hidden = rnn_out[i]
        all_hidden_array.append(last_hidden)
        last_hidden = last_hidden[-1]
        last_hidden_array.append(last_hidden)

    all_hidden_array = layers.concat(all_hidden_array, axis=0)
    all_hidden_array = layers.reshape(
        all_hidden_array,
        shape=[num_layers, input.shape[0], -1, hidden_size])
    last_hidden_output = layers.concat(last_hidden_array, axis=0)
    last_hidden_output = layers.reshape(
        last_hidden_output, shape=[num_layers, -1, hidden_size])

    return rnn_output, last_hidden_output, all_hidden_array
def __init__(self, cfg, name=None):
    super(ErnieSiameseNet, self).__init__(cfg, name=None)
    self.triplet_margin = cfg.pop("triplet_margin", 1.0)
    logging.info("triplet_margin: {}".format(self.triplet_margin))
    prob = cfg.get('classifier_dropout_prob', cfg['hidden_dropout_prob'])
    logging.info("emb dropout: {}".format(prob))
    self.dropout = lambda i: L.dropout(
        i,
        dropout_prob=prob,
        dropout_implementation="upscale_in_train",
    ) if self.training else i
def __init__(self, cfg, name=None):
    super(PositionwiseFeedForwardLayer, self).__init__()
    initializer = F.initializer.TruncatedNormal(
        scale=cfg['initializer_range'])
    d_model = cfg['hidden_size']
    d_ffn = cfg.get('intermediate_size', 4 * d_model)
    assert cfg['hidden_act'] in ['relu', 'gelu']
    self.i = _build_linear(
        d_model,
        d_ffn,
        append_name(name, 'fc_0'),
        initializer,
        act=cfg['hidden_act'])
    self.o = _build_linear(d_ffn, d_model, append_name(name, 'fc_1'),
                           initializer)
    prob = cfg.get('intermediate_dropout_prob', 0.)
    self.dropout = lambda i: L.dropout(
        i,
        dropout_prob=prob,
        dropout_implementation="upscale_in_train",
    ) if self.training else i
def __call__(self, input):
    if self.dropout_prob > 0.0:
        return layers.dropout(
            input,
            dropout_prob=self.dropout_prob,
            is_test=self.is_test,
            seed=self.seed,
            name=self.name,
            dropout_implementation='upscale_in_train')
    else:
        return input
def dropout(input, test_mode, args):
    if args.dropout and (not test_mode):
        return layers.dropout(
            input,
            dropout_prob=args.dropout,
            dropout_implementation="upscale_in_train",
            seed=args.random_seed,
            is_test=False)
    else:
        return input
def dropout(input, args):
    """Dropout function"""
    if args.drop_rate:
        return layers.dropout(
            input,
            dropout_prob=args.drop_rate,
            seed=args.random_seed,
            is_test=False)
    else:
        return input
def build_model(self):
    node_features = self.graph_wrapper.node_feat["feat"]

    output = self.gcn(gw=self.graph_wrapper,
                      feature=node_features,
                      hidden_size=self.hidden_size,
                      activation="relu",
                      norm=self.graph_wrapper.node_feat["norm"],
                      name="gcn_layer_1")
    output1 = output
    output = self.gcn(gw=self.graph_wrapper,
                      feature=output,
                      hidden_size=self.hidden_size,
                      activation="relu",
                      norm=self.graph_wrapper.node_feat["norm"],
                      name="gcn_layer_2")
    output2 = output
    output = self.gcn(gw=self.graph_wrapper,
                      feature=output,
                      hidden_size=self.hidden_size,
                      activation="relu",
                      norm=self.graph_wrapper.node_feat["norm"],
                      name="gcn_layer_3")

    output = L.concat(input=[output1, output2, output], axis=-1)

    output, ratio_length = sag_pool(gw=self.graph_wrapper,
                                    feature=output,
                                    ratio=self.pooling_ratio,
                                    graph_id=self.graph_id,
                                    dataset=self.args.dataset_name,
                                    name="sag_pool_1")
    output = L.lod_reset(output, self.graph_wrapper.graph_lod)
    cat1 = L.sequence_pool(output, "sum")
    ratio_length = L.cast(ratio_length, dtype="float32")
    cat1 = L.elementwise_div(cat1, ratio_length, axis=-1)
    cat2 = L.sequence_pool(output, "max")
    output = L.concat(input=[cat2, cat1], axis=-1)

    output = L.fc(output, size=self.hidden_size, act="relu")
    output = L.dropout(output, dropout_prob=self.dropout_ratio)
    output = L.fc(output, size=self.hidden_size // 2, act="relu")
    output = L.fc(output,
                  size=self.num_classes,
                  act=None,
                  param_attr=fluid.ParamAttr(name="final_fc"))

    self.labels = L.cast(self.labels, dtype="float32")
    loss = L.sigmoid_cross_entropy_with_logits(x=output, label=self.labels)
    self.loss = L.mean(loss)
    pred = L.sigmoid(output)
    self.pred = L.argmax(x=pred, axis=-1)
    correct = L.equal(self.pred, self.labels_1dim)
    correct = L.cast(correct, dtype="int32")
    self.correct = L.reduce_sum(correct)
def rnn_decoder(gru_unit, cue_gru_unit, input, input_size, hidden_size,
                num_layers, memory, memory_mask, knowledge, output_size,
                init_hidden=None, mask=None, dropout=0.0, batch_first=True,
                name="decoder"):
    """ rnn decoder """
    input_emb = get_embedding(input, input_size, output_size)
    if batch_first:
        input_emb = layers.transpose(input_emb, perm=[1, 0, 2])
        if mask:
            trans_mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input_emb)
        step_mask = None

        if mask:
            step_mask = rnn.step_input(trans_mask)

        # split pre_hidden
        pre_hidden_list = []

        pre_hidden = rnn.memory(init=init_hidden)
        real_out, last_hidden = \
            decoder_step(gru_unit, cue_gru_unit, step_in, pre_hidden,
                         input_size, hidden_size, memory, memory_mask,
                         knowledge, mask=step_mask)

        rnn.update_memory(pre_hidden, last_hidden)

        step_in = layers.squeeze(real_out, axes=[1])
        rnn.step_output(step_in)

    rnnout = rnn()
    rnnout = layers.transpose(rnnout, perm=[1, 0, 2])
    rnnout = layers.elementwise_mul(rnnout, mask, axis=0)

    output_in_size = hidden_size + hidden_size
    rnnout = layers.dropout(rnnout, dropout_prob=dropout)
    rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
    rnnout = fc(rnnout, hidden_size, output_size, name='dec_out_fc2')
    softmax_out = layers.softmax(rnnout)

    return softmax_out
def __init__(self, cfg, name=None):
    super(ErnieBlock, self).__init__()
    d_model = cfg['hidden_size']
    initializer = F.initializer.TruncatedNormal(
        scale=cfg['initializer_range'])

    self.attn = AttentionLayer(cfg, name=append_name(name, 'multi_head_att'))
    self.ln1 = _build_ln(d_model, name=append_name(name, 'post_att'))
    self.ffn = PositionwiseFeedForwardLayer(cfg, name=append_name(name, 'ffn'))
    self.ln2 = _build_ln(d_model, name=append_name(name, 'post_ffn'))
    prob = cfg.get('intermediate_dropout_prob', cfg['hidden_dropout_prob'])
    self.dropout = lambda i: L.dropout(
        i,
        dropout_prob=prob,
        dropout_implementation="upscale_in_train",
    ) if self.training else i
def call(self, step_input, states):
    new_states = []
    for i in range(self.num_layers):
        out, new_state = self.lstm_cells[i](step_input, states[i])
        step_input = layers.dropout(
            out,
            self.dropout_prob,
            dropout_implementation="upscale_in_train"
        ) if self.dropout_prob > 0 else out
        new_states.append(new_state)
    return step_input, new_states
def forward(self, src_word, src_pos, src_slf_attn_bias):
    word_emb = self.word_embedder(src_word)
    word_emb = layers.scale(x=word_emb, scale=self.emb_dim**0.5)
    pos_enc = self.pos_encoder(src_pos)
    pos_enc.stop_gradient = True
    emb = word_emb + pos_enc
    enc_input = layers.dropout(
        emb, dropout_prob=self.emb_dropout,
        is_test=False) if self.emb_dropout else emb
    enc_output = self.encoder(enc_input, src_slf_attn_bias)
    return enc_output
def forward(self, seq, mask=None):
    h = self.conv1d(seq)
    g, h = h[:, :, :self.num_filters], h[:, :, self.num_filters:]
    if self.dropout_rate:
        g = layers.dropout(
            g,
            dropout_prob=self.dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=not self.training)
    g = layers.sigmoid(g)
    seq = g * seq + (1 - g) * h
    if mask is not None:
        seq = seq * mask
    return seq
def forward(self, step_input, states):
    new_states = []
    for i, lstm_cell in enumerate(self.lstm_cells):
        out, new_state = lstm_cell(step_input, states[i])
        step_input = layers.dropout(
            out,
            self.dropout_prob,
            dropout_implementation='upscale_in_train'
        ) if self.dropout_prob > 0 else out
        new_states.append(new_state)
    return step_input, new_states
def forward(self, x, speaker_embed=None):
    """
    Encode text sequence.

    Args:
        x (Variable): shape(B, T_enc), dtype: int64. The input text indices.
            T_enc means the timesteps of the encoder input x.
        speaker_embed (Variable, optional): shape(B, C_sp), dtype float32,
            speaker embeddings. This arg is not None only when the model is
            a multispeaker model.
    Returns:
        keys (Variable): shape(B, T_enc, C_emb), dtype float32, the encoded
            representation for keys, where C_emb means the text embedding size.
        values (Variable): shape(B, T_enc, C_emb), dtype float32, the encoded
            representation for values.
    """
    x = self.embed(x)
    x = F.dropout(
        x, self.dropout, dropout_implementation="upscale_in_train")
    x = F.transpose(x, [0, 2, 1])

    if self.n_speakers > 1 and speaker_embed is not None:
        speaker_embed = F.dropout(
            speaker_embed,
            self.dropout,
            dropout_implementation="upscale_in_train")
        x = F.elementwise_add(x, self.sp_proj1(speaker_embed), axis=0)

    input_embed = x
    for layer in self.convolutions:
        if isinstance(layer, Conv1DGLU):
            x = layer(x, speaker_embed)
        else:
            # layer is a Conv1D with (1,) filter wrapped by WeightNormWrapper
            x = layer(x)

    if self.n_speakers > 1 and speaker_embed is not None:
        x = F.elementwise_add(x, self.sp_proj2(speaker_embed), axis=0)

    keys = x  # (B, C, T)
    values = F.scale(input_embed + x, scale=np.sqrt(0.5))
    keys = F.transpose(keys, [0, 2, 1])
    values = F.transpose(values, [0, 2, 1])
    return keys, values
def forward(self, q, k, v, lengths, speaker_embed, start_index,
            force_monotonic=False, prev_coeffs=None, window=None):
    # add position encoding as an inductive bias
    if self.has_bias:  # multi-speaker model
        omega_q = 2 * F.sigmoid(
            F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
        omega_k = 2 * self.omega_initial * F.sigmoid(
            F.squeeze(self.k_pos_affine(speaker_embed), axes=[-1]))
    else:  # single-speaker case
        batch_size = q.shape[0]
        omega_q = F.ones((batch_size, ), dtype="float32")
        omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default
    q += self.position_encoding_weight * positional_encoding(
        q, start_index, omega_q)
    k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

    q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
    activations = F.matmul(q, k, transpose_y=True)
    activations /= np.sqrt(self.attention_dim)

    if self.training:
        # mask the <pad> parts from the encoder
        mask = F.sequence_mask(lengths, dtype="float32")
        attn_bias = F.scale(1. - mask, -1000)
        activations += F.unsqueeze(attn_bias, [1])
    elif force_monotonic:
        assert window is not None
        backward_step, forward_step = window
        T_enc = k.shape[1]
        batch_size, T_dec, _ = q.shape

        # actually T_dec = 1 here
        alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
            if prev_coeffs is None \
            else F.argmax(prev_coeffs, axis=-1)
        backward = F.sequence_mask(
            alpha - backward_step, maxlen=T_enc, dtype="bool")
        forward = F.sequence_mask(
            alpha + forward_step, maxlen=T_enc, dtype="bool")
        mask = F.cast(F.logical_xor(backward, forward), "float32")
        # print("mask's shape:", mask.shape)
        attn_bias = F.scale(1. - mask, -1000)
        activations += attn_bias

    # softmax
    coefficients = F.softmax(activations, axis=-1)
    # context vector
    coefficients = F.dropout(
        coefficients, 1. - self.keep_prob,
        dropout_implementation='upscale_in_train')
    contexts = F.matmul(coefficients, v)
    # context normalization
    enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
    contexts *= F.sqrt(enc_lengths)
    # out affine
    contexts = self.out_affine(contexts)
    return contexts, coefficients
def forward(self, features):
    src_ids, sent_ids = features
    dtype = 'float16' if self.hparam['fp16'] else 'float32'
    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)),
                        dtype)  # assume pad id == 0
    #input_mask = L.unsqueeze(input_mask, axes=[2])
    d_shape = L.shape(src_ids)
    seqlen = d_shape[1]
    batch_size = d_shape[0]
    pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
    pos_ids = L.expand(pos_ids, [batch_size, 1])
    pos_ids = L.unsqueeze(pos_ids, axes=[2])
    pos_ids = L.cast(pos_ids, 'int64')
    pos_ids.stop_gradient = True
    input_mask.stop_gradient = True
    # task_ids are not used at the moment
    task_ids = L.zeros_like(src_ids) + self.hparam.task_id
    task_ids.stop_gradient = True

    bert = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        task_ids=task_ids,
        input_mask=input_mask,
        config=self.hparam,
        use_fp16=self.hparam['fp16'])

    cls_feats = bert.get_pooled_output()
    cls_feats = L.dropout(
        x=cls_feats,
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train")
    logits = L.fc(
        input=cls_feats,
        size=self.hparam['num_label'],
        param_attr=F.ParamAttr(
            name="cls_out_w",
            initializer=F.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=F.ParamAttr(
            name="cls_out_b", initializer=F.initializer.Constant(0.)))

    propeller.summary.histogram('pred', logits)

    if self.mode is propeller.RunMode.PREDICT:
        probs = L.softmax(logits)
        return probs
    else:
        return logits
def dropout(input, test_mode, args):
    dropout1 = 0.1
    test_mode = False
    random_seed = 123
    if dropout1 and (not test_mode):
        return layers.dropout(
            input,
            dropout_prob=dropout1,
            dropout_implementation="upscale_in_train",
            seed=random_seed,
            is_test=False)
    else:
        return input
def graph_scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                       dropout_rate, graph_attn_bias,
                                       pos_win):
    """
    Graph Scaled Dot-Product Attention
    :param q: (batch_size, n_heads, query_len, dim_per_head)
    :param k: (batch_size, n_heads, key_s_len, dim_per_head)
    :param v: (batch_size, n_heads, key_s_len, dim_per_head)
    :param attn_bias: (batch_size, n_head, query_len, key_s_len)
    :param graph_attn_bias: (batch_size, n_head, key_s_len, key_s_len)
    :return:
        proj_out: [batch, n_heads, query_len, dim_per_head]
        weights: [batch, n_heads, query_len, key_s_len]
    """
    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    product = layers.matmul(
        x=scaled_q, y=k,
        transpose_y=True)  # (batch_size, n_heads, tgt_len, n_block)

    if attn_bias:
        product += attn_bias  # (batch_size, n_heads, tgt_len, n_block)

    if graph_attn_bias:
        # re-weight the attention score with gaussian weights
        gaussian_w = __compute_graph_bias(scaled_q, graph_attn_bias, pos_win)
        product += gaussian_w  # [batch, n_heads, query_len, key_s_len]

    weights = layers.softmax(product)  # [batch, n_heads, query_len, key_s_len]
    if dropout_rate:
        weights = layers.dropout(
            weights,
            dropout_prob=dropout_rate,
            seed=dropout_seed,
            dropout_implementation="upscale_in_train",
            is_test=False)

    out = layers.matmul(weights, v)  # [batch, n_heads, query_len, dim_per_head]

    # Project back to the model size.
    combine_out = __combine_heads_sent(out)  # (batch_size, query_len, emb_dim)
    proj_out = layers.fc(input=combine_out,
                         size=d_model,
                         num_flatten_dims=2,
                         param_attr=fluid.ParamAttr(
                             name=name + '_sen_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_sen_fc.b_0')

    return proj_out, weights
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    if attn_bias:
        product += attn_bias
    weights = layers.softmax(product)
    if dropout_rate:
        weights = layers.dropout(
            weights,
            dropout_prob=dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=False)
    out = layers.matmul(weights, v)
    return out
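# A minimal usage sketch for the function above under the usual Transformer
# shapes; the tensor names, the 8-head / 64-dim sizes and the data-layer setup
# are assumptions for illustration only.
import paddle.fluid as fluid
import paddle.fluid.layers as layers

# per-example shape [n_heads, seq_len, d_key]; layers.data prepends the batch dim
q = layers.data(name='q', shape=[8, 32, 64], dtype='float32')
k = layers.data(name='k', shape=[8, 32, 64], dtype='float32')
v = layers.data(name='v', shape=[8, 32, 64], dtype='float32')
attn_bias = layers.data(name='attn_bias', shape=[8, 32, 32], dtype='float32')

ctx = scaled_dot_product_attention(q, k, v, attn_bias,
                                   d_key=64, dropout_rate=0.1)
# ctx: [batch_size, 8, 32, 64], one context vector per query position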
def forward(self, x):
    """
    forward
    :param x:
    :return:
    """
    hidden = self._i2h(x)
    if self._dropout_rate:
        hidden = layers.dropout(
            hidden, dropout_prob=self._dropout_rate, is_test=False)
    out = self._h2o(hidden)
    return out
def call(self, step_input, states):
    lstm_states = states
    new_lstm_states = []
    step_input = layers.concat([step_input, self.latent_z], 1)
    for i in range(self.num_layers):
        out, lstm_state = self.lstm_cells[i](step_input, lstm_states[i])
        step_input = layers.dropout(
            out,
            self.dropout_prob,
            dropout_implementation="upscale_in_train"
        ) if self.dropout_prob > 0 else out
        new_lstm_states.append(lstm_state)
    return step_input, new_lstm_states
def _build_encoder(self):
    self.enc_input = layers.dropout(
        self.src_emb,
        dropout_prob=self.enc_dropout_in,
        dropout_implementation="upscale_in_train")

    enc_cell = EncoderCell(self.num_layers, self.hidden_size,
                           self.param_attr_initializer,
                           self.param_attr_scale, self.enc_dropout_out)
    enc_output, enc_final_state = rnn(
        cell=enc_cell,
        inputs=self.enc_input,
        sequence_length=self.src_sequence_length)

    return enc_output, enc_final_state
def __call__(self, input, hidden):
    assert len(hidden) == self.num_layers
    new_hidden = []
    for cell, hid in zip(self.cells, hidden):
        input, new_hid = cell(input, hid)
        new_hidden += [new_hid]
        if self.dropout > 0:
            input = layers.dropout(
                input,
                dropout_prob=self.dropout,
                dropout_implementation='upscale_in_train')
    output = new_hidden[-1]
    return output, new_hidden
def prepare_encoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_phone,
                    src_phone_mask,
                    phone_vocab_size,
                    src_emb_dim,
                    src_max_len,
                    beta=0.0,
                    dropout_rate=0.,
                    bos_idx=0,
                    phone_pad_idx=-1,
                    word_emb_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the encoder stacks.
    """
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=bos_idx,  # set embedding of bos to 0
        param_attr=fluid.ParamAttr(
            name=word_emb_param_name,
            initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)

    # shape [batch_size, max_seq_len, max_phone_len, dim]
    src_phone_emb = layers.embedding(
        src_phone,
        size=[phone_vocab_size, src_emb_dim],
        padding_idx=phone_pad_idx,  # set embedding of phone_pad_idx to 0
        param_attr=fluid.ParamAttr(
            name=phone_emb_param_name,
            initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
    sum_phone_emb = layers.reduce_sum(src_phone_emb, dim=2)
    float_mask = layers.cast(src_phone_mask, dtype='float32')
    sum_mask = layers.reduce_sum(float_mask, dim=2) + 1e-9
    mean_phone_emb = layers.elementwise_div(sum_phone_emb, sum_mask, axis=0)

    src_pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_names[0], trainable=False))
    src_pos_enc.stop_gradient = True

    enc_input = (1 - beta) * src_word_emb + beta * mean_phone_emb + src_pos_enc
    return layers.dropout(
        enc_input,
        dropout_prob=dropout_rate,
        seed=dropout_seed,
        is_test=False) if dropout_rate else enc_input
def __init__(self, cfg, name=None):
    super(ErnieModelForTokenClassification, self).__init__(cfg, name=name)

    initializer = F.initializer.TruncatedNormal(
        scale=cfg['initializer_range'])
    self.classifier = _build_linear(cfg['hidden_size'], cfg['num_labels'],
                                    append_name(name, 'cls'), initializer)

    prob = cfg.get('classifier_dropout_prob', cfg['hidden_dropout_prob'])
    self.dropout = lambda i: L.dropout(
        i,
        dropout_prob=prob,
        dropout_implementation="upscale_in_train",
    ) if self.training else i
def __scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
    """Scaled Dot-Product Attention"""
    product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5)
    if attn_bias:
        product += attn_bias
    weights = layers.softmax(product, use_cudnn=True)
    if dropout_rate:
        weights = layers.dropout(
            weights,
            dropout_prob=dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=False)
    out = layers.matmul(weights, v)
    return out
def embedding(self, t1, t2, mask=None):
    pv = self.pe(position_id(t1))
    t1 = self.ce(t1)
    t2 = self.we(self.we_p(t2))
    t = t1 + t2 + pv
    if self.dropout_rate:
        t = layers.dropout(
            t,
            dropout_prob=self.dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=not self.training)
    if mask is not None:
        t = t * mask
    return t
def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
    """
    Add residual connection, layer normalization and dropout to the out
    tensor optionally according to the value of process_cmd.

    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.initializer.Constant(1.),
                bias_attr=fluid.initializer.Constant(0.))
        elif cmd == "d":  # add dropout
            if dropout:
                out = layers.dropout(out, dropout_prob=dropout, is_test=False)
    return out
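# A hedged usage sketch of the helper above in the usual Transformer
# "post-process" order: dropout, then residual add, then layer norm
# (process_cmd="dan"); the tensor names are illustrative only.
# enc_input: sub-layer input, attn_out: sub-layer (e.g. self-attention) output
post_out = pre_post_process_layer(enc_input, attn_out, "dan", dropout=0.1)
# the pre-process variant typically passes process_cmd="n" with prev_out=None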