class BertModelLayer(Layer): """ bert """ def __init__(self, config, return_pooled_out=True, use_fp16=False): super(BertModelLayer, self).__init__() self._emb_size = config['hidden_size'] self._n_layer = config['num_hidden_layers'] self._n_head = config['num_attention_heads'] self._voc_size = config['vocab_size'] self._max_position_seq_len = config['max_position_embeddings'] self._sent_types = config['type_vocab_size'] self._hidden_act = config['hidden_act'] self._prepostprocess_dropout = config['hidden_dropout_prob'] self._attention_dropout = config['attention_probs_dropout_prob'] self.return_pooled_out = return_pooled_out self._word_emb_name = "word_embedding" self._pos_emb_name = "pos_embedding" self._sent_emb_name = "sent_embedding" self._dtype = "float16" if use_fp16 else "float32" self._param_initializer = fluid.initializer.TruncatedNormal( scale=config['initializer_range']) self._src_emb = Embedding(size=[self._voc_size, self._emb_size], param_attr=fluid.ParamAttr( name=self._word_emb_name, initializer=self._param_initializer), dtype=self._dtype) self._pos_emb = Embedding( size=[self._max_position_seq_len, self._emb_size], param_attr=fluid.ParamAttr(name=self._pos_emb_name, initializer=self._param_initializer), dtype=self._dtype) self._sent_emb = Embedding(size=[self._sent_types, self._emb_size], param_attr=fluid.ParamAttr( name=self._sent_emb_name, initializer=self._param_initializer), dtype=self._dtype) self.pooled_fc = Linear(input_dim=self._emb_size, output_dim=self._emb_size, param_attr=fluid.ParamAttr( name="pooled_fc.w_0", initializer=self._param_initializer), bias_attr="pooled_fc.b_0", act="tanh") self.pre_process_layer = PrePostProcessLayer( "nd", self._emb_size, self._prepostprocess_dropout, "") self._encoder = EncoderLayer( hidden_act=self._hidden_act, n_layer=self._n_layer, n_head=self._n_head, d_key=self._emb_size // self._n_head, d_value=self._emb_size // self._n_head, d_model=self._emb_size, d_inner_hid=self._emb_size * 4, prepostprocess_dropout=self._prepostprocess_dropout, attention_dropout=self._attention_dropout, relu_dropout=0, preprocess_cmd="", postprocess_cmd="dan", param_initializer=self._param_initializer) def emb_names(self): return self._src_emb.parameters() + self._pos_emb.parameters( ) + self._sent_emb.parameters() def forward(self, src_ids, position_ids, sentence_ids, input_mask): """ forward """ src_emb = self._src_emb(src_ids) pos_emb = self._pos_emb(position_ids) sent_emb = self._sent_emb(sentence_ids) emb_out = src_emb + pos_emb emb_out = emb_out + sent_emb emb_out = self.pre_process_layer(emb_out) self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) n_head_self_attn_mask.stop_gradient = True enc_outputs = self._encoder(emb_out, n_head_self_attn_mask) if not self.return_pooled_out: return enc_outputs next_sent_feats = [] for enc_output in enc_outputs: next_sent_feat = fluid.layers.slice(input=enc_output, axes=[1], starts=[0], ends=[1]) next_sent_feat = self.pooled_fc(next_sent_feat) next_sent_feat = fluid.layers.reshape(next_sent_feat, shape=[-1, self._emb_size]) next_sent_feats.append(next_sent_feat) return enc_outputs, next_sent_feats
class BertModelLayer(Layer):
    def __init__(self,
                 emb_size=128,
                 hidden_size=768,
                 n_layer=12,
                 voc_size=30522,
                 max_position_seq_len=512,
                 sent_types=2,
                 return_pooled_out=True,
                 initializer_range=1.0,
                 conv_type="conv_bn",
                 search_layer=False,
                 use_fp16=False,
                 use_fixed_gumbel=False,
                 gumbel_alphas=None):
        super(BertModelLayer, self).__init__()

        self._emb_size = emb_size
        self._hidden_size = hidden_size
        self._n_layer = n_layer
        self._voc_size = voc_size
        self._max_position_seq_len = max_position_seq_len
        self._sent_types = sent_types
        self.return_pooled_out = return_pooled_out
        self.use_fixed_gumbel = use_fixed_gumbel

        self._word_emb_name = "s_word_embedding"
        self._pos_emb_name = "s_pos_embedding"
        self._sent_emb_name = "s_sent_embedding"
        self._dtype = "float16" if use_fp16 else "float32"
        self._conv_type = conv_type
        self._search_layer = search_layer

        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=initializer_range)

        self._src_emb = Embedding(
            size=[self._voc_size, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name,
                initializer=self._param_initializer),
            dtype=self._dtype)
        self._pos_emb = Embedding(
            size=[self._max_position_seq_len, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name,
                initializer=self._param_initializer),
            dtype=self._dtype)
        self._sent_emb = Embedding(
            size=[self._sent_types, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name,
                initializer=self._param_initializer),
            dtype=self._dtype)

        # Factorized embedding: project emb_size up to hidden_size.
        self._emb_fac = Linear(
            input_dim=self._emb_size,
            output_dim=self._hidden_size,
            param_attr=fluid.ParamAttr(name="s_emb_factorization"))

        self._encoder = EncoderLayer(
            n_layer=self._n_layer,
            hidden_size=self._hidden_size,
            search_layer=self._search_layer,
            use_fixed_gumbel=self.use_fixed_gumbel,
            gumbel_alphas=gumbel_alphas)

    def emb_names(self):
        return self._src_emb.parameters() + self._pos_emb.parameters() + \
            self._sent_emb.parameters()

    def max_flops(self):
        return self._encoder.max_flops

    def max_model_size(self):
        return self._encoder.max_model_size

    def arch_parameters(self):
        return [self._encoder.alphas]  #, self._encoder.k]

    def forward(self, src_ids, position_ids, sentence_ids, flops=[],
                model_size=[]):
        """ forward """
        # Split the packed [sentence A; sentence B] batch into two dense,
        # left-aligned id matrices, one per sentence type (padding id is 0).
        ids = np.squeeze(src_ids.numpy())
        sids = np.squeeze(sentence_ids.numpy())
        batchsize = ids.shape[0]

        ids_0 = ids[((sids == 0) & (ids != 0))]
        seqlen_0 = ((sids == 0) & (ids != 0)).astype(np.int64).sum(1)
        y_0 = np.concatenate([np.arange(s) for s in seqlen_0])
        x_0 = np.concatenate(
            [np.ones([s], dtype=np.int64) * i for i, s in enumerate(seqlen_0)])
        ids0 = np.zeros([batchsize, seqlen_0.max()], dtype=np.int64)
        ids0[(x_0, y_0)] = ids_0

        ids_1 = ids[(sids == 1) & (ids != 0)]
        seqlen_1 = ((sids == 1) & (ids != 0)).astype(np.int64).sum(1)
        y_1 = np.concatenate([np.arange(s) for s in seqlen_1])
        x_1 = np.concatenate(
            [np.ones([s], dtype=np.int64) * i for i, s in enumerate(seqlen_1)])
        ids1 = np.zeros([batchsize, seqlen_1.max()], dtype=np.int64)
        ids1[(x_1, y_1)] = ids_1

        # Pad both sentences to a common maximum length.
        msl = max(seqlen_0.max(), seqlen_1.max())
        ids0 = np.pad(ids0, [[0, 0], [0, msl - seqlen_0.max()]],
                      mode='constant')
        ids1 = np.pad(ids1, [[0, 0], [0, msl - seqlen_1.max()]],
                      mode='constant')

        ids0 = fluid.dygraph.to_variable(ids0)
        ids1 = fluid.dygraph.to_variable(ids1)

        src_emb_0 = self._src_emb(ids0)
        src_emb_1 = self._src_emb(ids1)
        emb_out_0 = self._emb_fac(src_emb_0)
        emb_out_1 = self._emb_fac(src_emb_1)
        # (bs, seq_len, 768)

        # The original passed an undefined `emb_out`; feed both sentence
        # embeddings to the encoder, matching the fine-tuning variant below.
        enc_outputs = self._encoder(
            emb_out_0, emb_out_1, flops=flops, model_size=model_size)
        return enc_outputs
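# --- Usage sketch (not part of the original source) ---
# Illustrative only: assumes the search-space EncoderLayer defined elsewhere in
# this module, token/sentence ids passed as [batch, seq_len] int64 tensors with
# 0 as the padding id, and sentence_ids equal to 0 for sentence A and 1 for
# sentence B. All sizes below are made up for the example.
if __name__ == "__main__":
    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        model = BertModelLayer(emb_size=128, hidden_size=768, n_layer=8)
        batch, seq_len = 4, 64
        ids = np.random.randint(1, 30522, [batch, seq_len]).astype('int64')
        sids = np.zeros([batch, seq_len], dtype='int64')
        sids[:, seq_len // 2:] = 1  # second half of each row is sentence B
        src_ids = fluid.dygraph.to_variable(ids)
        sentence_ids = fluid.dygraph.to_variable(sids)
        position_ids = fluid.dygraph.to_variable(
            np.tile(np.arange(seq_len)[None, :], [batch, 1]).astype('int64'))
        enc_outputs = model(src_ids, position_ids, sentence_ids,
                            flops=[], model_size=[])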
class BertModelLayer(Layer):
    def __init__(self,
                 num_labels,
                 emb_size=128,
                 hidden_size=768,
                 n_layer=12,
                 voc_size=30522,
                 max_position_seq_len=512,
                 sent_types=2,
                 return_pooled_out=True,
                 initializer_range=1.0,
                 conv_type="conv_bn",
                 search_layer=False,
                 use_fp16=False,
                 use_fixed_gumbel=False,
                 gumbel_alphas=None):
        super(BertModelLayer, self).__init__()

        self._emb_size = emb_size
        self._hidden_size = hidden_size
        self._n_layer = n_layer
        self._voc_size = voc_size
        self._max_position_seq_len = max_position_seq_len
        self._sent_types = sent_types
        self.return_pooled_out = return_pooled_out
        self.use_fixed_gumbel = use_fixed_gumbel

        self._word_emb_name = "s_word_embedding"
        self._pos_emb_name = "s_pos_embedding"
        self._sent_emb_name = "s_sent_embedding"
        self._dtype = "float16" if use_fp16 else "float32"
        self._conv_type = conv_type
        self._search_layer = search_layer

        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=initializer_range)

        self._src_emb = Embedding(
            size=[self._voc_size, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name,
                initializer=self._param_initializer),
            dtype=self._dtype)
        self._pos_emb = Embedding(
            size=[self._max_position_seq_len, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name,
                initializer=self._param_initializer),
            dtype=self._dtype)
        self._sent_emb = Embedding(
            size=[self._sent_types, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name,
                initializer=self._param_initializer),
            dtype=self._dtype)

        # Factorized embedding: project emb_size up to hidden_size.
        self._emb_fac = Linear(
            input_dim=self._emb_size,
            output_dim=self._hidden_size,
            param_attr=fluid.ParamAttr(name="s_emb_factorization"))

        self._encoder = EncoderLayer(
            num_labels=num_labels,
            n_layer=self._n_layer,
            hidden_size=self._hidden_size,
            search_layer=self._search_layer,
            use_fixed_gumbel=self.use_fixed_gumbel,
            gumbel_alphas=gumbel_alphas)

    def emb_names(self):
        return self._src_emb.parameters() + self._pos_emb.parameters() + \
            self._sent_emb.parameters()

    def max_flops(self):
        return self._encoder.max_flops

    def max_model_size(self):
        return self._encoder.max_model_size

    def arch_parameters(self):
        return [self._encoder.alphas]  #, self._encoder.k]

    def forward(self, data_ids, epoch):
        """ forward """
        # data_ids[5] and data_ids[6] carry the pre-split sentence-0 and
        # sentence-1 token ids produced by the data reader.
        ids0 = data_ids[5]
        ids1 = data_ids[6]

        src_emb_0 = self._src_emb(ids0)
        src_emb_1 = self._src_emb(ids1)
        emb_out_0 = self._emb_fac(src_emb_0)
        emb_out_1 = self._emb_fac(src_emb_1)
        # (bs, seq_len, hidden_size)

        enc_outputs = self._encoder(emb_out_0, emb_out_1, epoch)
        return enc_outputs
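# --- Usage sketch (not part of the original source) ---
# Illustrative only: assumes data_ids is the batch tuple produced by the data
# reader paired with this model, with indices 5 and 6 holding the pre-split
# sentence-0 and sentence-1 token ids as [batch, seq_len] int64 tensors. The
# unused slots are stubbed with None since this forward does not read them.
if __name__ == "__main__":
    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        model = BertModelLayer(num_labels=2, emb_size=128, hidden_size=768,
                               n_layer=8, use_fixed_gumbel=True)
        batch, seq_len = 4, 64
        ids0 = fluid.dygraph.to_variable(
            np.random.randint(1, 30522, [batch, seq_len]).astype('int64'))
        ids1 = fluid.dygraph.to_variable(
            np.random.randint(1, 30522, [batch, seq_len]).astype('int64'))
        data_ids = [None, None, None, None, None, ids0, ids1]
        enc_outputs = model(data_ids, epoch=0)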