def _build_model(self, src_ids, position_ids, sentence_ids, self_attn_mask):
    # padding id in vocabulary must be set to 0
    emb_out = fluid.layers.embedding(
        input=src_ids,
        size=[self._voc_size, self._emb_size],
        dtype=self._dtype,
        param_attr=fluid.ParamAttr(
            name=self._word_emb_name, initializer=self._param_initializer),
        is_sparse=False)
    position_emb_out = fluid.layers.embedding(
        input=position_ids,
        size=[self._max_position_seq_len, self._emb_size],
        dtype=self._dtype,
        param_attr=fluid.ParamAttr(
            name=self._pos_emb_name, initializer=self._param_initializer))
    sent_emb_out = fluid.layers.embedding(
        sentence_ids,
        size=[self._sent_types, self._emb_size],
        dtype=self._dtype,
        param_attr=fluid.ParamAttr(
            name=self._sent_emb_name, initializer=self._param_initializer))

    emb_out = emb_out + position_emb_out
    emb_out = emb_out + sent_emb_out

    emb_out = pre_process_layer(
        emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

    if self._dtype == "float16":
        self_attn_mask = fluid.layers.cast(x=self_attn_mask, dtype=self._dtype)

    n_head_self_attn_mask = fluid.layers.stack(
        x=[self_attn_mask] * self._n_head, axis=1)
    n_head_self_attn_mask.stop_gradient = True

    self._enc_out = encoder(
        enc_input=emb_out,
        attn_bias=n_head_self_attn_mask,
        n_layer=self._n_layer,
        n_head=self._n_head,
        d_key=self._emb_size // self._n_head,
        d_value=self._emb_size // self._n_head,
        d_model=self._emb_size,
        d_inner_hid=self._emb_size * 4,
        prepostprocess_dropout=self._prepostprocess_dropout,
        attention_dropout=self._attention_dropout,
        relu_dropout=0,
        hidden_act=self._hidden_act,
        preprocess_cmd="",
        postprocess_cmd="dan",
        param_initializer=self._param_initializer,
        name='encoder')
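# A minimal NumPy sketch (not part of the model code) of the additive
# attention bias that `_build_model` above expects in `self_attn_mask`.
# Starting from a padding mask of shape [batch, seq_len, 1], the pairwise
# product marks pairs where both tokens are real, and (x - 1) * 10000 turns
# that into a bias of 0 for valid pairs and -10000 for pairs that touch
# padding (the same scale/bias used in the variants further below). The
# helper name `make_additive_attn_bias` is hypothetical.
import numpy as np

def make_additive_attn_bias(input_mask):
    # input_mask: float array [batch, seq_len, 1], 1.0 = real token, 0.0 = pad
    pairwise = np.matmul(input_mask, input_mask.transpose(0, 2, 1))  # [b, s, s]
    return (pairwise - 1.0) * 10000.0  # 0 where both valid, -10000 otherwise

mask = np.array([[[1.0], [1.0], [0.0]]])  # one sequence, last slot padded
bias = make_additive_attn_bias(mask)
print(bias[0])  # rows/columns touching the padded position get -10000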
def encode(self,
           emb_ids=None,
           input_mask=None,
           image_input=None,
           emb_obj_ids=None,
           gather_idx=None):
    """unimo encoder"""
    # determine which modalities are present:
    # v = visual features, o = object tokens, l = language tokens
    if emb_ids is not None and image_input is not None and emb_obj_ids is not None:
        input_type = 'vol'
    elif emb_ids is not None and image_input is not None:
        input_type = 'vl'
    elif emb_ids is not None:
        input_type = 'l'
    elif image_input is not None and emb_obj_ids is not None:
        input_type = 'vo'
    else:
        raise ValueError('input feature error')

    emb_feature, n_head_self_attn_mask, _v_seq_len, _o_seq_len = self._gen_input(
        emb_ids=emb_ids,
        input_mask=input_mask,
        image_input=image_input,
        emb_obj_ids=emb_obj_ids,
        input_type=input_type)

    enc_out = encoder(
        enc_input=emb_feature,
        attn_bias=n_head_self_attn_mask,
        n_layer=self._n_layer,
        n_head=self._n_head,
        d_key=self._emb_size // self._n_head,
        d_value=self._emb_size // self._n_head,
        d_model=self._emb_size,
        d_inner_hid=self._emb_size * 4,
        prepostprocess_dropout=self._prepostprocess_dropout,
        attention_dropout=self._attention_dropout,
        relu_dropout=0,
        hidden_act=self._hidden_act,
        preprocess_cmd="",
        postprocess_cmd="dan",
        param_initializer=self._param_initializer,
        name='encoder',
        caches=self.caches,
        gather_idx=gather_idx)

    if input_type == 'vol':
        assert _v_seq_len is not None and _o_seq_len is not None, "the input is invalid"
        # split the concatenated [visual; object; language] sequence
        _vol_seq_len = layers.shape(enc_out)[1]
        enc_v_out = fluid.layers.slice(
            input=enc_out, axes=[1], starts=[0], ends=[_v_seq_len])
        enc_o_out = fluid.layers.slice(
            input=enc_out, axes=[1], starts=[_v_seq_len],
            ends=[_v_seq_len + _o_seq_len])
        enc_l_out = fluid.layers.slice(
            input=enc_out, axes=[1], starts=[_v_seq_len + _o_seq_len],
            ends=[_vol_seq_len])
        enc_vol_out = enc_out
        return enc_vol_out, enc_v_out, enc_l_out
    elif input_type == 'vl':
        assert _v_seq_len is not None and _o_seq_len is None, "the input is invalid"
        _vl_seq_len = layers.shape(enc_out)[1]
        enc_v_out = fluid.layers.slice(
            input=enc_out, axes=[1], starts=[0], ends=[_v_seq_len])
        enc_l_out = fluid.layers.slice(
            input=enc_out, axes=[1], starts=[_v_seq_len], ends=[_vl_seq_len])
        enc_vl_out = enc_out
        return enc_vl_out, enc_v_out, enc_l_out
    elif input_type == 'vo':
        assert _v_seq_len is not None and _o_seq_len is not None, "the input is invalid"
        enc_v_out = fluid.layers.slice(
            input=enc_out, axes=[1], starts=[0], ends=[_v_seq_len])
        return enc_v_out
    elif input_type == 'l':
        assert _v_seq_len is None and _o_seq_len is None, "the input is invalid"
        enc_l_out = enc_out
        return enc_l_out
    else:
        raise ValueError("The input type is invalid")
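# A minimal sketch (NumPy, not Paddle) of how `encode` above splits the
# encoder output back into per-modality spans for the 'vol' case: the encoder
# input is the concatenation [visual ; object ; language] along the sequence
# axis, so fixed offsets recover each part. The lengths below are made up for
# illustration only.
import numpy as np

batch, hidden = 2, 8
v_len, o_len, l_len = 3, 2, 5
enc_out = np.random.rand(batch, v_len + o_len + l_len, hidden)

enc_v_out = enc_out[:, :v_len, :]               # visual tokens
enc_o_out = enc_out[:, v_len:v_len + o_len, :]  # object tokens
enc_l_out = enc_out[:, v_len + o_len:, :]       # language tokens
assert enc_l_out.shape[1] == l_len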
def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
                 input_mask):
    # padding id in vocabulary must be set to 0
    emb_out = fluid.layers.embedding(
        input=src_ids,
        size=[self._voc_size, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._word_emb_name,
            initializer=self._param_initializer,
            regularizer=fluid.regularizer.L2Decay(1e-3)),
        is_sparse=False)
    position_emb_out = fluid.layers.embedding(
        input=position_ids,
        size=[self._max_position_seq_len, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._pos_emb_name, initializer=self._param_initializer))
    sent_emb_out = fluid.layers.embedding(
        sentence_ids,
        size=[self._sent_types, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._sent_emb_name, initializer=self._param_initializer))

    emb_out = emb_out + position_emb_out
    emb_out = emb_out + sent_emb_out

    if self._use_task_id:
        task_emb_out = fluid.layers.embedding(
            task_ids,
            size=[self._task_types, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._task_emb_name,
                initializer=self._param_initializer))
        emb_out = emb_out + task_emb_out

    emb_out = pre_process_layer(
        emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

    if self._dtype == "float16":
        emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
        input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)

    # additive attention bias: (pairwise_mask - 1) * 10000 gives 0 for valid
    # token pairs and -10000 for pairs involving padding
    self_attn_mask = fluid.layers.matmul(
        x=input_mask, y=input_mask, transpose_y=True)
    self_attn_mask = fluid.layers.scale(
        x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
    n_head_self_attn_mask = fluid.layers.stack(
        x=[self_attn_mask] * self._n_head, axis=1)
    n_head_self_attn_mask.stop_gradient = True

    self._enc_out = encoder(
        enc_input=emb_out,
        attn_bias=n_head_self_attn_mask,
        n_layer=self._n_layer,
        n_head=self._n_head,
        d_key=self._emb_size // self._n_head,
        d_value=self._emb_size // self._n_head,
        d_model=self._emb_size,
        d_inner_hid=self._emb_size * 4,
        prepostprocess_dropout=self._prepostprocess_dropout,
        attention_dropout=self._attention_dropout,
        relu_dropout=0,
        hidden_act=self._hidden_act,
        preprocess_cmd="",
        postprocess_cmd="dan",
        param_initializer=self._param_initializer,
        name='encoder')
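# A hedged sketch of the inputs this `_build_model` variant is typically fed.
# It follows the common ERNIE-style data layout; treat the exact shapes as an
# assumption about this repository rather than a guarantee: every id tensor is
# int64 of shape [batch, seq_len, 1] so it can go straight into
# fluid.layers.embedding, and input_mask is a float tensor with the same
# leading shape, 1.0 for real tokens and 0.0 for padding.
import numpy as np

batch, seq_len = 2, 6
src_ids      = np.random.randint(1, 100, size=(batch, seq_len, 1), dtype='int64')
position_ids = np.tile(np.arange(seq_len, dtype='int64').reshape(1, seq_len, 1),
                       (batch, 1, 1))
sentence_ids = np.zeros((batch, seq_len, 1), dtype='int64')
task_ids     = np.zeros((batch, seq_len, 1), dtype='int64')
input_mask   = np.ones((batch, seq_len, 1), dtype='float32')
input_mask[:, -2:, :] = 0.0  # mark the last two positions as padding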
def _build_model(self, emb=None):
    # padding id in vocabulary must be set to 0
    if emb is None:
        if self.topo is None or self.topo.mp.size == 1:
            # no model parallelism: ordinary embedding lookup
            emb_out = fluid.layers.embedding(
                input=self.src_ids,
                size=[self._voc_size, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=self._word_emb_name,
                    initializer=self._param_initializer),
                is_sparse=False)
        else:
            # model parallelism: shard the word embedding table over mp ranks
            self._word_emb_name = self._word_emb_name + '_' + str(
                self.topo.mp.rank)
            src_ids = fluid.layers.squeeze(self.src_ids, [-1])
            emb_out = paddle.distributed.split(
                src_ids,
                size=(self._voc_size, self._emb_size),
                operation='embedding',
                weight_attr=fluid.ParamAttr(
                    name=self._word_emb_name,
                    initializer=self._param_initializer),
                num_partitions=self.topo.mp.size)
    else:
        # look up token vectors from a pre-computed, frozen embedding table
        emb.stop_gradient = True
        emb_out = fluid.layers.gather_nd(emb, self.src_ids)
        emb_out.stop_gradient = False

    self.position_emb_out = fluid.layers.embedding(
        input=self.position_ids,
        size=[self._max_position_seq_len, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._pos_emb_name, initializer=self._param_initializer))
    self.sent_emb_out = fluid.layers.embedding(
        self.sentence_ids,
        size=[self._sent_types, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._sent_emb_name, initializer=self._param_initializer))
    """
    self.task_emb_out = fluid.layers.embedding(
        self.task_ids,
        size=[self._task_types, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(
            name=self._task_emb_name,
            initializer=self._param_initializer))
    """

    sum_emb = emb_out + self.position_emb_out
    sum_emb = sum_emb + self.sent_emb_out
    # task embeddings are currently not added
    # sum_emb = sum_emb + task_emb_out

    # pre_encoder_cmd: 'n' for ALBERT-style (norm only), 'nd' for BERT-style
    # (norm + dropout)
    sum_emb = pre_process_layer(
        sum_emb,
        self.config['pre_encoder_cmd'],
        self._prepostprocess_dropout,
        name='pre_encoder',
        epsilon=self.config['epsilon'])

    if self.config['emb_mapping_in']:
        # project the embedding size up to the hidden size
        sum_emb = fluid.layers.fc(
            input=sum_emb,
            num_flatten_dims=2,
            size=self._hidden_size,
            param_attr=fluid.ParamAttr(
                name='emb_hidden_mapping',
                initializer=self._param_initializer),
            bias_attr='emb_hidden_mapping_bias')

    self_attn_mask = fluid.layers.matmul(
        x=self.input_mask, y=self.input_mask, transpose_y=True)
    self_attn_mask = fluid.layers.scale(
        x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
    n_head_self_attn_mask = fluid.layers.stack(
        x=[self_attn_mask] * self._n_head, axis=1)
    n_head_self_attn_mask.stop_gradient = True

    self._enc_out, self._checkpoints = encoder(
        enc_input=sum_emb,
        attn_bias=n_head_self_attn_mask,
        n_layer=self._n_layer,
        n_head=self._n_head,
        d_key=self._hidden_size // self._n_head,
        d_value=self._hidden_size // self._n_head,
        d_model=self._hidden_size,
        d_inner_hid=self._hidden_size * 4,
        prepostprocess_dropout=self._prepostprocess_dropout,
        attention_dropout=self._attention_dropout,
        relu_dropout=0,
        hidden_act=self._hidden_act,
        preprocess_cmd=self.config['preprocess_cmd'],
        postprocess_cmd=self.config['postprocess_cmd'],
        param_initializer=self._param_initializer,
        name='encoder',
        param_share=self._param_share,
        epsilon=self.config['epsilon'],
        n_layer_per_block=self.config['n_layer_per_block'],
        topo=self.topo,
        preln=self.preln)
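# A minimal NumPy sketch (an illustration, not the Paddle implementation) of
# the vocabulary-parallel embedding that `paddle.distributed.split` with
# operation='embedding' performs above: each model-parallel rank holds a
# contiguous slab of the embedding table, looks up only the ids that fall in
# its slab (contributing zeros elsewhere), and the per-rank partial results
# are summed across ranks; the collective all-reduce is simulated here with a
# plain sum over the per-rank outputs.
import numpy as np

vocab, emb_size, num_parts = 8, 4, 2
full_table = np.random.rand(vocab, emb_size)
src_ids = np.array([1, 5, 7])

partials = []
rows_per_part = vocab // num_parts
for rank in range(num_parts):
    lo, hi = rank * rows_per_part, (rank + 1) * rows_per_part
    shard = full_table[lo:hi]             # this rank's slice of the table
    local = src_ids - lo                  # shift ids into shard coordinates
    in_range = (src_ids >= lo) & (src_ids < hi)
    out = np.where(in_range[:, None],
                   shard[np.clip(local, 0, rows_per_part - 1)],
                   0.0)
    partials.append(out)

combined = np.sum(partials, axis=0)       # stands in for the all-reduce
assert np.allclose(combined, full_table[src_ids])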