def _build_model(self, src_ids, position_ids, sentence_ids, self_attn_mask):
    # padding id in vocabulary must be set to 0
    emb_out = fluid.layers.embedding(
        input=src_ids,
        size=[self._voc_size, self._emb_size],
        dtype=self._dtype,
        param_attr=fluid.ParamAttr(
            name=self._word_emb_name, initializer=self._param_initializer),
        is_sparse=False)
    position_emb_out = fluid.layers.embedding(
        input=position_ids,
        size=[self._max_position_seq_len, self._emb_size],
        dtype=self._dtype,
        param_attr=fluid.ParamAttr(
            name=self._pos_emb_name, initializer=self._param_initializer))
    sent_emb_out = fluid.layers.embedding(
        sentence_ids,
        size=[self._sent_types, self._emb_size],
        dtype=self._dtype,
        param_attr=fluid.ParamAttr(
            name=self._sent_emb_name, initializer=self._param_initializer))

    emb_out = emb_out + position_emb_out
    emb_out = emb_out + sent_emb_out

    emb_out = pre_process_layer(
        emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

    if self._dtype == "float16":
        self_attn_mask = fluid.layers.cast(x=self_attn_mask, dtype=self._dtype)

    n_head_self_attn_mask = fluid.layers.stack(
        x=[self_attn_mask] * self._n_head, axis=1)
    n_head_self_attn_mask.stop_gradient = True

    self._enc_out = encoder(
        enc_input=emb_out,
        attn_bias=n_head_self_attn_mask,
        n_layer=self._n_layer,
        n_head=self._n_head,
        d_key=self._emb_size // self._n_head,
        d_value=self._emb_size // self._n_head,
        d_model=self._emb_size,
        d_inner_hid=self._emb_size * 4,
        prepostprocess_dropout=self._prepostprocess_dropout,
        attention_dropout=self._attention_dropout,
        relu_dropout=0,
        hidden_act=self._hidden_act,
        preprocess_cmd="",
        postprocess_cmd="dan",
        param_initializer=self._param_initializer,
        name='encoder')
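# A minimal sketch (not from this file; `_make_self_attn_mask` is a hypothetical
# helper name) of how the `self_attn_mask` argument above is typically prepared
# from a padding mask of shape [batch_size, seq_len, 1], mirroring the mask
# construction used by the other _build_model variants in this section: the
# matmul yields [batch_size, seq_len, seq_len], and the scale maps 1 -> 0 and
# 0 -> -10000 so padded positions are suppressed by the attention softmax.
def _make_self_attn_mask(input_mask):
    self_attn_mask = fluid.layers.matmul(
        x=input_mask, y=input_mask, transpose_y=True)
    self_attn_mask = fluid.layers.scale(
        x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
    return self_attn_mask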
def get_pretraining_output(self, mask_label, mask_pos, labels):
    """Get the loss & accuracy for pretraining"""
    mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

    # extract the first token feature in each sentence
    next_sent_feat = self.get_pooled_output()
    reshaped_emb_out = fluid.layers.reshape(
        x=self._enc_out, shape=[-1, self._emb_size])
    # extract masked tokens' feature
    mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)

    # transform: fc
    mask_trans_feat = fluid.layers.fc(
        input=mask_feat,
        size=self._emb_size,
        act=self._hidden_act,
        param_attr=fluid.ParamAttr(
            name='mask_lm_trans_fc.w_0',
            initializer=self._param_initializer),
        bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
    # transform: layer norm
    mask_trans_feat = pre_process_layer(
        mask_trans_feat, 'n', name='mask_lm_trans')

    mask_lm_out_bias_attr = fluid.ParamAttr(
        name="mask_lm_out_fc.b_0",
        initializer=fluid.initializer.Constant(value=0.0))
    if self._weight_sharing:
        fc_out = fluid.layers.matmul(
            x=mask_trans_feat,
            y=fluid.default_main_program().global_block().var(
                self._word_emb_name),
            transpose_y=True)
        fc_out += fluid.layers.create_parameter(
            shape=[self._voc_size],
            dtype=self._dtype,
            attr=mask_lm_out_bias_attr,
            is_bias=True)
    else:
        fc_out = fluid.layers.fc(
            input=mask_trans_feat,
            size=self._voc_size,
            param_attr=fluid.ParamAttr(
                name="mask_lm_out_fc.w_0",
                initializer=self._param_initializer),
            bias_attr=mask_lm_out_bias_attr)

    mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
        logits=fc_out, label=mask_label)
    mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)

    next_sent_fc_out = fluid.layers.fc(
        input=next_sent_feat,
        size=2,
        param_attr=fluid.ParamAttr(
            name="next_sent_fc.w_0", initializer=self._param_initializer),
        bias_attr="next_sent_fc.b_0")

    next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
        logits=next_sent_fc_out, label=labels, return_softmax=True)

    next_sent_acc = fluid.layers.accuracy(
        input=next_sent_softmax, label=labels)
    mean_next_sent_loss = fluid.layers.mean(next_sent_loss)

    loss = mean_next_sent_loss + mean_mask_lm_loss
    return next_sent_acc, mean_mask_lm_loss, loss
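# A hedged note on mask_pos: the encoder output is reshaped to
# [batch_size * seq_len, emb_size] before the gather above, so mask_pos is
# expected to hold flattened indices. A minimal data-side sketch
# (`_flatten_mask_positions` is a hypothetical helper, not part of this model):
def _flatten_mask_positions(mask_positions, example_index, max_seq_len):
    # e.g. positions [3, 7] of example 2 with max_seq_len 128 -> [259, 263]
    return [example_index * max_seq_len + pos for pos in mask_positions]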
def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
                 input_mask):
    # padding id in vocabulary must be set to 0
    emb_out = fluid.layers.embedding(
        input=src_ids,
        size=[self._voc_size, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._word_emb_name, initializer=self._param_initializer),
        is_sparse=False)

    if self._dtype == core.VarDesc.VarType.FP16:
        emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
        input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)

    emb_out = pre_process_layer(
        emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

    for i in range(self._n_layer):
        prev_out = emb_out
        if (i == 0) and ("CACHE_CNN" in os.environ):
            emb_out = self.cache_cnn(src_ids)
        else:
            # [batch, seq_len, emb] -> [batch, 1, seq_len, emb], so the NHWC
            # conv sees H=1, W=seq_len, C=emb
            emb_out = fluid.layers.unsqueeze(emb_out, [1])
            emb_out = fluid.layers.conv2d(
                emb_out,
                num_filters=self._emb_size,
                filter_size=(1, 3),
                act=self._hidden_act,
                data_format="NHWC",
                padding="SAME",
                param_attr=fluid.ParamAttr(
                    name='conv_%s_fc.w_0' % i,
                    initializer=self._param_initializer),
                bias_attr='conv_%s_fc.b_0' % i)
            # back to [batch, seq_len, emb]
            emb_out = fluid.layers.squeeze(emb_out, [1])
        # residual connection, dropout and layer norm over the block output
        emb_out = pre_post_process_layer(
            prev_out,
            emb_out,
            "dan",
            self._prepostprocess_dropout,
            name="conv_post_%s" % i)

    self._enc_out = emb_out
    if self._dtype == core.VarDesc.VarType.FP16:
        self._enc_out = fluid.layers.cast(
            x=self._enc_out, dtype=self._emb_dtype)
def _gen_input(self,
               emb_ids=None,
               input_mask=None,
               image_input=None,
               emb_obj_ids=None,
               input_type=None):
    assert input_mask is not None, "input_mask should not be none"
    assert input_type is not None, "input_type should not be none"

    self_attn_mask = input_mask
    self_attn_mask = fluid.layers.scale(
        x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False)
    n_head_self_attn_mask = fluid.layers.stack(
        x=[self_attn_mask] * self._n_head, axis=1)
    n_head_self_attn_mask.stop_gradient = True

    emb_feature, _v_seq_len, _o_seq_len = None, None, None

    if emb_ids is not None:
        # text part
        emb_out = None
        for emb_name, emb_id in emb_ids.items():
            if emb_name == "sent_embedding":
                continue  # don't use sentence embedding
            emb = fluid.layers.embedding(
                input=emb_id,
                size=[self._emb_vocab_size[emb_name], self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=emb_name, initializer=self._param_initializer))
            emb_out = emb_out + emb if emb_out else emb
        if self.text_adv_delta is not None:
            emb_out = emb_out + self.text_adv_delta
        emb_out = pre_process_layer(
            emb_out, 'nd', self._prepostprocess_dropout, name="pre_encoder")

    if image_input is not None:
        # visual part
        if self.image_adv_delta is not None:
            emb_v_in = image_input[self._image_emb_name]
            emb_v_in = emb_v_in + self.image_adv_delta
        else:
            emb_v_in = image_input[self._image_emb_name]
        image_embeddings = fluid.layers.fc(
            emb_v_in,  # [batch_size, 37, 2048]
            self._emb_size,
            param_attr=fluid.ParamAttr(
                name="image_emb.w_0", initializer=self._param_initializer),
            bias_attr="image_emb.b_0",
            num_flatten_dims=2)
        loc_emb_out = fluid.layers.fc(
            image_input[self._loc_emb_name],  # [batch_size, 37, 5]
            self._emb_size,
            param_attr=fluid.ParamAttr(
                name="image_loc.w_0", initializer=self._param_initializer),
            bias_attr="image_loc.b_0",
            num_flatten_dims=2)
        emb_v_out = image_embeddings + loc_emb_out
        emb_v_out = pre_process_layer(
            emb_v_out, 'nd', self._prepostprocess_dropout,
            name='v_pre_encoder')
        _v_seq_len = layers.shape(emb_v_out)[1]

    if emb_obj_ids is not None:
        # object part
        emb_obj_out = None
        for emb_obj_name, emb_obj_id in emb_obj_ids.items():
            if emb_obj_name == "sent_embedding":
                continue  # don't use sentence embedding in roberta
            emb_obj = fluid.layers.embedding(
                input=emb_obj_id,
                size=[self._emb_vocab_size[emb_obj_name], self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=emb_obj_name, initializer=self._param_initializer))
            emb_obj_out = emb_obj_out + emb_obj if emb_obj_out else emb_obj
        emb_obj_out = pre_process_layer(
            emb_obj_out, 'nd', self._prepostprocess_dropout,
            name="pre_encoder")
        _o_seq_len = layers.shape(emb_obj_out)[1]

    if input_type == 'vol':
        assert (emb_ids is not None and image_input is not None
                and emb_obj_ids is not None), "the input is invalid"
        emb_feature = fluid.layers.concat(
            [emb_v_out, emb_obj_out, emb_out], axis=1)
    elif input_type == 'vl':
        assert (emb_ids is not None and image_input is not None
                and emb_obj_ids is None), "the input is invalid"
        emb_feature = fluid.layers.concat([emb_v_out, emb_out], axis=1)
    elif input_type == 'l':
        assert (emb_ids is not None and image_input is None
                and emb_obj_ids is None), "the input is invalid"
        emb_feature = emb_out
    elif input_type == 'vo':
        assert (emb_ids is None and image_input is not None
                and emb_obj_ids is not None), "the input is invalid"
        emb_feature = fluid.layers.concat([emb_v_out, emb_obj_out], axis=1)
    else:
        raise ValueError("The input type is invalid")

    return [emb_feature, n_head_self_attn_mask, _v_seq_len, _o_seq_len]
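# A quick standalone check (numpy only, not part of the model) of what the scale
# op in _gen_input computes: with bias_after_scale=False the result is
# scale * (x + bias) = 1e4 * (mask - 1), so a visible position (mask == 1) maps
# to 0 and a padded position (mask == 0) maps to -1e4, which effectively removes
# padded keys once the attention softmax is applied.
import numpy as np

_mask = np.array([1.0, 1.0, 0.0])
_attn_bias = 1e4 * (_mask - 1.0)  # -> [0., 0., -10000.]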
def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
                 input_mask):
    # padding id in vocabulary must be set to 0
    emb_out = fluid.layers.embedding(
        input=src_ids,
        size=[self._voc_size, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._word_emb_name,
            initializer=self._param_initializer,
            regularizer=fluid.regularizer.L2Decay(1e-3)),
        is_sparse=False)
    position_emb_out = fluid.layers.embedding(
        input=position_ids,
        size=[self._max_position_seq_len, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._pos_emb_name, initializer=self._param_initializer))
    sent_emb_out = fluid.layers.embedding(
        sentence_ids,
        size=[self._sent_types, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._sent_emb_name, initializer=self._param_initializer))

    emb_out = emb_out + position_emb_out
    emb_out = emb_out + sent_emb_out

    if self._use_task_id:
        task_emb_out = fluid.layers.embedding(
            task_ids,
            size=[self._task_types, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._task_emb_name,
                initializer=self._param_initializer))
        emb_out = emb_out + task_emb_out

    emb_out = pre_process_layer(
        emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

    if self._dtype == "float16":
        emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
        input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)

    self_attn_mask = fluid.layers.matmul(
        x=input_mask, y=input_mask, transpose_y=True)
    self_attn_mask = fluid.layers.scale(
        x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
    n_head_self_attn_mask = fluid.layers.stack(
        x=[self_attn_mask] * self._n_head, axis=1)
    n_head_self_attn_mask.stop_gradient = True

    self._enc_out = encoder(
        enc_input=emb_out,
        attn_bias=n_head_self_attn_mask,
        n_layer=self._n_layer,
        n_head=self._n_head,
        d_key=self._emb_size // self._n_head,
        d_value=self._emb_size // self._n_head,
        d_model=self._emb_size,
        d_inner_hid=self._emb_size * 4,
        prepostprocess_dropout=self._prepostprocess_dropout,
        attention_dropout=self._attention_dropout,
        relu_dropout=0,
        hidden_act=self._hidden_act,
        preprocess_cmd="",
        postprocess_cmd="dan",
        param_initializer=self._param_initializer,
        name='encoder')
def get_pretraining_output(self, mask_label, mask_pos):
    """Get the loss & fc_out for training"""
    mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

    reshaped_emb_out = fluid.layers.reshape(
        x=self._enc_out, shape=[-1, self._emb_size])
    # extract masked tokens' feature
    mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)

    # transform: fc
    mask_trans_feat = fluid.layers.fc(
        input=mask_feat,
        size=self._emb_size,
        act=self._hidden_act,
        param_attr=fluid.ParamAttr(
            name='mask_lm_trans_fc.w_0',
            initializer=self._param_initializer),
        bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
    # transform: layer norm
    mask_trans_feat = pre_process_layer(
        mask_trans_feat, 'n', name='mask_lm_trans')

    mask_lm_out_bias_attr = fluid.ParamAttr(
        name="mask_lm_out_fc.b_0",
        initializer=fluid.initializer.Constant(value=0.0))
    if self._weight_sharing:
        fc_out = fluid.layers.matmul(
            x=mask_trans_feat,
            y=fluid.default_main_program().global_block().var(
                self._word_emb_name),
            transpose_y=True)
        fc_out += fluid.layers.create_parameter(
            shape=[self._voc_size],
            dtype=self._dtype,
            attr=mask_lm_out_bias_attr,
            is_bias=True)
    else:
        fc_out = fluid.layers.fc(
            input=mask_trans_feat,
            size=self._voc_size,
            param_attr=fluid.ParamAttr(
                name="mask_lm_out_fc.w_0",
                initializer=self._param_initializer),
            bias_attr=mask_lm_out_bias_attr)

    # generate soft labels for the cross-entropy loss
    one_hot_labels = fluid.layers.one_hot(
        input=mask_label, depth=self._voc_size)
    entity_indicator = fluid.layers.fill_constant_batch_size_like(
        input=mask_label,
        shape=[-1, (self._voc_size - self._n_relation)],
        dtype='int64',
        value=0)
    relation_indicator = fluid.layers.fill_constant_batch_size_like(
        input=mask_label,
        shape=[-1, self._n_relation],
        dtype='int64',
        value=1)
    is_relation = fluid.layers.concat(
        input=[entity_indicator, relation_indicator], axis=-1)
    soft_labels = (one_hot_labels * self._soft_label
                   + (1.0 - one_hot_labels - is_relation)
                   * ((1.0 - self._soft_label)
                      / (self._voc_size - 1 - self._n_relation)))
    soft_labels.stop_gradient = True

    mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
        logits=fc_out, label=soft_labels, soft_label=True)
    mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)

    return mean_mask_lm_loss, fc_out
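# A worked toy example (standalone numpy sketch with assumed numbers) of the
# soft-label construction above: with voc_size=10, n_relation=2 (the last two
# ids being relation tokens), soft_label=0.9, and a masked position whose gold
# label is entity id 3, the gold class keeps 0.9, relation classes get 0, and
# the remaining 7 entity classes share the leftover 0.1 equally.
import numpy as np

V, R, s, gold = 10, 2, 0.9, 3
one_hot = np.eye(V)[gold]
is_relation = np.array([0.0] * (V - R) + [1.0] * R)
soft = one_hot * s + (1.0 - one_hot - is_relation) * ((1.0 - s) / (V - 1 - R))
# soft[3] == 0.9, soft[8:] == 0.0, other entries == 0.1 / 7, soft.sum() == 1.0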
def _build_model(self, emb=None):
    # padding id in vocabulary must be set to 0
    if emb is None:
        if self.topo is None or self.topo.mp.size == 1:
            emb_out = fluid.layers.embedding(
                input=self.src_ids,
                size=[self._voc_size, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=self._word_emb_name,
                    initializer=self._param_initializer),
                is_sparse=False)
        else:
            self._word_emb_name = self._word_emb_name + '_' + str(
                self.topo.mp.rank)
            src_ids = fluid.layers.squeeze(self.src_ids, [-1])
            emb_out = paddle.distributed.split(
                src_ids,
                size=(self._voc_size, self._emb_size),
                operation='embedding',
                weight_attr=fluid.ParamAttr(
                    name=self._word_emb_name,
                    initializer=self._param_initializer),
                num_partitions=self.topo.mp.size)
    else:
        emb.stop_gradient = True
        emb_out = fluid.layers.gather_nd(emb, self.src_ids)
        emb_out.stop_gradient = False

    self.position_emb_out = fluid.layers.embedding(
        input=self.position_ids,
        size=[self._max_position_seq_len, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._pos_emb_name, initializer=self._param_initializer))
    self.sent_emb_out = fluid.layers.embedding(
        self.sentence_ids,
        size=[self._sent_types, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._sent_emb_name, initializer=self._param_initializer))

    # NOTE: the task embedding is intentionally not added for now.
    # self.task_emb_out = fluid.layers.embedding(
    #     self.task_ids,
    #     size=[self._task_types, self._emb_size],
    #     dtype=self._emb_dtype,
    #     param_attr=fluid.ParamAttr(
    #         name=self._task_emb_name,
    #         initializer=self._param_initializer))

    sum_emb = emb_out + self.position_emb_out
    sum_emb = sum_emb + self.sent_emb_out
    # sum_emb = sum_emb + self.task_emb_out

    # for albert the pre_encoder_cmd should be 'n'; for bert it should be 'nd'
    sum_emb = pre_process_layer(
        sum_emb,
        self.config['pre_encoder_cmd'],
        self._prepostprocess_dropout,
        name='pre_encoder',
        epsilon=self.config['epsilon'])

    if self.config['emb_mapping_in']:
        sum_emb = fluid.layers.fc(
            input=sum_emb,
            num_flatten_dims=2,
            size=self._hidden_size,
            param_attr=fluid.ParamAttr(
                name='emb_hidden_mapping',
                initializer=self._param_initializer),
            bias_attr='emb_hidden_mapping_bias')

    self_attn_mask = fluid.layers.matmul(
        x=self.input_mask, y=self.input_mask, transpose_y=True)
    self_attn_mask = fluid.layers.scale(
        x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
    n_head_self_attn_mask = fluid.layers.stack(
        x=[self_attn_mask] * self._n_head, axis=1)
    n_head_self_attn_mask.stop_gradient = True

    self._enc_out, self._checkpoints = encoder(
        enc_input=sum_emb,
        attn_bias=n_head_self_attn_mask,
        n_layer=self._n_layer,
        n_head=self._n_head,
        d_key=self._hidden_size // self._n_head,
        d_value=self._hidden_size // self._n_head,
        d_model=self._hidden_size,
        d_inner_hid=self._hidden_size * 4,
        prepostprocess_dropout=self._prepostprocess_dropout,
        attention_dropout=self._attention_dropout,
        relu_dropout=0,
        hidden_act=self._hidden_act,
        preprocess_cmd=self.config['preprocess_cmd'],
        postprocess_cmd=self.config['postprocess_cmd'],
        param_initializer=self._param_initializer,
        name='encoder',
        param_share=self._param_share,
        epsilon=self.config['epsilon'],
        n_layer_per_block=self.config['n_layer_per_block'],
        topo=self.topo,
        preln=self.preln)
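# A hedged note on the model-parallel branch above: paddle.distributed.split with
# operation='embedding' shards the embedding table along the vocabulary dimension,
# so each of the num_partitions ranks holds voc_size / topo.mp.size rows and the
# partial lookups are combined across ranks. Toy arithmetic with assumed numbers:
# voc_size = 30000 and topo.mp.size = 4 gives 7500 embedding rows per rank.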