def build(self, inputs, scope_name=""): mask_pos = inputs["reader"]["mask_pos"] if self._is_training: mask_label = inputs["reader"]["mask_label"] max_position = inputs["reader"]["batchsize_x_seqlen"] - 1 mask_pos = fluid.layers.elementwise_min(mask_pos, max_position) mask_pos.stop_gradient = True word_emb = inputs["backbone"]["embedding_table"] enc_out = inputs["backbone"]["encoder_outputs"] emb_size = word_emb.shape[-1] _param_initializer = fluid.initializer.TruncatedNormal( scale=self._initializer_range) reshaped_emb_out = fluid.layers.reshape(x=enc_out, shape=[-1, emb_size]) # extract masked tokens' feature mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) # transform: fc mask_trans_feat = fluid.layers.fc( input=mask_feat, size=emb_size, act=self._hidden_act, param_attr=fluid.ParamAttr(name=scope_name + 'mask_lm_trans_fc.w_0', initializer=_param_initializer), bias_attr=fluid.ParamAttr(name=scope_name + 'mask_lm_trans_fc.b_0')) # transform: layer norm mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name=scope_name + 'mask_lm_trans') mask_lm_out_bias_attr = fluid.ParamAttr( name=scope_name + "mask_lm_out_fc.b_0", initializer=fluid.initializer.Constant(value=0.0)) fc_out = fluid.layers.matmul(x=mask_trans_feat, y=word_emb, transpose_y=True) fc_out += fluid.layers.create_parameter(shape=[self._vocab_size], dtype='float32', attr=mask_lm_out_bias_attr, is_bias=True) if self._is_training: inputs = fluid.layers.softmax(fc_out) mask_lm_loss = fluid.layers.cross_entropy(input=inputs, label=mask_label) loss = fluid.layers.mean(mask_lm_loss) return {'loss': loss} else: return {'logits': fc_out}
def build(self, inputs, scope_name=""): src_ids = inputs['token_ids'] pos_ids = inputs['position_ids'] sent_ids = inputs['segment_ids'] input_mask = inputs['input_mask'] task_ids = inputs['task_ids'] input_buffer = {} output_buffer = {} input_buffer['base'] = [ src_ids, pos_ids, sent_ids, input_mask, task_ids ] output_buffer['base'] = {} if self._learning_strategy == 'pairwise' and self._phase == 'train': src_ids = inputs['token_ids_neg'] pos_ids = inputs['position_ids_neg'] sent_ids = inputs['segment_ids_neg'] input_mask = inputs['input_mask_neg'] task_ids = inputs['task_ids_neg'] input_buffer['neg'] = [ src_ids, pos_ids, sent_ids, input_mask, task_ids ] output_buffer['neg'] = {} for key, (src_ids, pos_ids, sent_ids, input_mask, task_ids) in input_buffer.items(): # padding id in vocabulary must be set to 0 emb_out = fluid.embedding( input=src_ids, size=[self._voc_size, self._emb_size], dtype=self._emb_dtype, param_attr=fluid.ParamAttr( name=scope_name + self._word_emb_name, initializer=self._param_initializer), is_sparse=False) # fluid.global_scope().find_var('backbone-word_embedding').get_tensor() embedding_table = fluid.default_main_program().global_block().var( scope_name + self._word_emb_name) position_emb_out = fluid.embedding( input=pos_ids, size=[self._max_position_seq_len, self._emb_size], dtype=self._emb_dtype, param_attr=fluid.ParamAttr( name=scope_name + self._pos_emb_name, initializer=self._param_initializer)) sent_emb_out = fluid.embedding( sent_ids, size=[self._sent_types, self._emb_size], dtype=self._emb_dtype, param_attr=fluid.ParamAttr( name=scope_name + self._sent_emb_name, initializer=self._param_initializer)) emb_out = emb_out + position_emb_out emb_out = emb_out + sent_emb_out task_emb_out = fluid.embedding( task_ids, size=[self._task_types, self._emb_size], dtype=self._emb_dtype, param_attr=fluid.ParamAttr( name=scope_name + self._task_emb_name, initializer=self._param_initializer)) emb_out = emb_out + task_emb_out emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name=scope_name + 'pre_encoder') self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) n_head_self_attn_mask.stop_gradient = True enc_out = encoder( enc_input=emb_out, attn_bias=n_head_self_attn_mask, n_layer=self._n_layer, n_head=self._n_head, d_key=self._emb_size // self._n_head, d_value=self._emb_size // self._n_head, d_model=self._emb_size, d_inner_hid=self._emb_size * 4, prepostprocess_dropout=self._prepostprocess_dropout, attention_dropout=self._attention_dropout, relu_dropout=0, hidden_act=self._hidden_act, preprocess_cmd="", postprocess_cmd="dan", param_initializer=self._param_initializer, name=scope_name + 'encoder') next_sent_feat = fluid.layers.slice(input=enc_out, axes=[1], starts=[0], ends=[1]) next_sent_feat = fluid.layers.reshape( next_sent_feat, [-1, next_sent_feat.shape[-1]]) next_sent_feat = fluid.layers.fc( input=next_sent_feat, size=self._emb_size, act="tanh", param_attr=fluid.ParamAttr( name=scope_name + "pooled_fc.w_0", initializer=self._param_initializer), bias_attr=scope_name + "pooled_fc.b_0") output_buffer[key]['word_embedding'] = emb_out output_buffer[key]['encoder_outputs'] = enc_out output_buffer[key]['sentence_embedding'] = next_sent_feat output_buffer[key]['sentence_pair_embedding'] = next_sent_feat ret = {} ret['embedding_table'] = 
embedding_table ret['word_embedding'] = output_buffer['base']['word_embedding'] ret['encoder_outputs'] = output_buffer['base']['encoder_outputs'] ret['sentence_embedding'] = output_buffer['base']['sentence_embedding'] ret['sentence_pair_embedding'] = output_buffer['base'][ 'sentence_pair_embedding'] if self._learning_strategy == 'pairwise' and self._phase == 'train': ret['word_embedding_neg'] = output_buffer['neg']['word_embedding'] ret['encoder_outputs_neg'] = output_buffer['neg'][ 'encoder_outputs'] ret['sentence_embedding_neg'] = output_buffer['neg'][ 'sentence_embedding'] ret['sentence_pair_embedding_neg'] = output_buffer['neg'][ 'sentence_pair_embedding'] return ret
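# --- Hypothetical illustration (not part of the original module) ---
# A minimal numpy sketch of the additive attention bias built in the backbone
# above: matmul(input_mask, input_mask^T) marks real-token pairs with 1, and
# scale(..., scale=10000.0, bias=-1.0, bias_after_scale=False) computes
# (pair_mask - 1) * 10000, i.e. 0 for attended pairs and -10000 wherever a
# padding token is involved; the bias is then replicated across heads.
# The function name and example values below are illustrative only.
def _demo_attn_bias(n_head=2):
    import numpy as np

    # one sample with two real tokens and one padding token: [batch, seqlen, 1]
    input_mask = np.array([[[1.0], [1.0], [0.0]]], dtype="float32")

    # pairwise mask: [batch, seqlen, seqlen]
    pair_mask = np.matmul(input_mask, input_mask.transpose(0, 2, 1))

    # additive bias: 0 for real-token pairs, -10000.0 otherwise
    attn_bias = (pair_mask - 1.0) * 10000.0

    # replicate per attention head: [batch, n_head, seqlen, seqlen]
    return np.stack([attn_bias] * n_head, axis=1)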