Example #1
File: ernie.py Project: zhanzq/LARK
    def _build_model(self, src_ids, position_ids, sentence_ids,
                     self_attn_mask):
        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(name=self._word_emb_name,
                                       initializer=self._param_initializer),
            is_sparse=False)
        position_emb_out = fluid.layers.embedding(
            input=position_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(name=self._pos_emb_name,
                                       initializer=self._param_initializer))

        sent_emb_out = fluid.layers.embedding(
            sentence_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(name=self._sent_emb_name,
                                       initializer=self._param_initializer))

        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out

        emb_out = pre_process_layer(emb_out,
                                    'nd',
                                    self._prepostprocess_dropout,
                                    name='pre_encoder')

        if self._dtype == "float16":
            self_attn_mask = fluid.layers.cast(x=self_attn_mask,
                                               dtype=self._dtype)

        n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] *
                                                   self._n_head,
                                                   axis=1)
        n_head_self_attn_mask.stop_gradient = True

        self._enc_out = encoder(
            enc_input=emb_out,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd="",
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name='encoder')
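Example #1 expects self_attn_mask to be built by the caller. For reference, the following is a minimal NumPy sketch (not project code) of how such an additive attention bias is commonly derived from a 0/1 padding mask, mirroring the matmul/scale/stack pattern that Examples #4, #5 and #7 apply in-graph; the batch size, sequence length and head count are illustrative assumptions.

import numpy as np

n_head = 12
# 0/1 padding mask of shape [batch, seq_len, 1]; 1 = real token, 0 = padding
input_mask = np.array([[[1.], [1.], [1.], [0.]],
                       [[1.], [1.], [0.], [0.]]], dtype="float32")

# pairwise mask [batch, seq_len, seq_len]: 1 only where both positions are real
self_attn_mask = np.matmul(input_mask, input_mask.transpose(0, 2, 1))

# scale=1e4, bias=-1.0, bias_after_scale=False  ->  (mask - 1) * 1e4:
# real-token pairs get 0, padded pairs get -1e4, which softmax pushes to ~0
attn_bias = (self_attn_mask - 1.0) * 1e4

# replicate the bias for every attention head: [batch, n_head, seq_len, seq_len]
n_head_self_attn_mask = np.stack([attn_bias] * n_head, axis=1)
print(n_head_self_attn_mask.shape)  # (2, 12, 4, 4)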
Example #2
    def get_pretraining_output(self, mask_label, mask_pos, labels):
        """Get the loss & accuracy for pretraining"""

        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

        # extract the first token feature in each sentence
        next_sent_feat = self.get_pooled_output()
        reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size])
        # extract masked tokens' feature
        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)

        # transform: fc
        mask_trans_feat = fluid.layers.fc(input=mask_feat,
                                          size=self._emb_size,
                                          act=self._hidden_act,
                                          param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0',
                                                                     initializer=self._param_initializer),
                                          bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
        # transform: layer norm
        mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans')

        mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0",
                                                initializer=fluid.initializer.Constant(value=0.0))
        if self._weight_sharing:
            fc_out = fluid.layers.matmul(x=mask_trans_feat,
                                         y=fluid.default_main_program().global_block().var(self._word_emb_name),
                                         transpose_y=True)
            fc_out += fluid.layers.create_parameter(shape=[self._voc_size],
                                                    dtype=self._dtype,
                                                    attr=mask_lm_out_bias_attr,
                                                    is_bias=True)

        else:
            fc_out = fluid.layers.fc(input=mask_trans_feat,
                                     size=self._voc_size,
                                     param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0",
                                                                initializer=self._param_initializer),
                                     bias_attr=mask_lm_out_bias_attr)

        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label)
        mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)

        next_sent_fc_out = fluid.layers.fc(input=next_sent_feat,
                                           size=2,
                                           param_attr=fluid.ParamAttr(name="next_sent_fc.w_0",
                                                                      initializer=self._param_initializer),
                                           bias_attr="next_sent_fc.b_0")

        next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out,
                                                                                    label=labels,
                                                                                    return_softmax=True)

        next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels)

        mean_next_sent_loss = fluid.layers.mean(next_sent_loss)

        loss = mean_next_sent_loss + mean_mask_lm_loss
        return next_sent_acc, mean_mask_lm_loss, loss
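The reshape + gather step above only works if mask_pos holds indices into the flattened [batch * seq_len, emb_size] tensor, i.e. batch_idx * seq_len + token_idx. A small NumPy sketch of that indexing (sizes are illustrative, not project code):

import numpy as np

batch_size, seq_len, emb_size = 2, 5, 3
enc_out = np.arange(batch_size * seq_len * emb_size,
                    dtype="float32").reshape(batch_size, seq_len, emb_size)

# same effect as fluid.layers.reshape(x=enc_out, shape=[-1, emb_size])
reshaped_emb_out = enc_out.reshape(-1, emb_size)            # [10, 3]

# masked tokens at (batch 0, position 2) and (batch 1, position 4)
mask_pos = np.array([0 * seq_len + 2, 1 * seq_len + 4], dtype="int32")
mask_feat = reshaped_emb_out[mask_pos]                      # gather -> [2, 3]

assert np.allclose(mask_feat[0], enc_out[0, 2])
assert np.allclose(mask_feat[1], enc_out[1, 4])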
Example #3
    def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
                     input_mask):
        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(name=self._word_emb_name,
                                       initializer=self._param_initializer),
            is_sparse=False)

        # cast embeddings and the input mask to fp16 when running in half precision
        if self._dtype == core.VarDesc.VarType.FP16:
            emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)

        # shape as [-1, 1, max_seq_len, embedding]
        emb_out = pre_process_layer(emb_out,
                                    'nd',
                                    self._prepostprocess_dropout,
                                    name='pre_encoder')
        for i in range(self._n_layer):
            prev_out = emb_out
            if (i == 0) and ("CACHE_CNN" in os.environ):
                emb_out = self.cache_cnn(src_ids)
            else:
                emb_out = fluid.layers.unsqueeze(emb_out, 1)
                emb_out = fluid.layers.conv2d(
                    emb_out,
                    num_filters=self._emb_size,
                    filter_size=(1, 3),
                    act=self._hidden_act,
                    data_format="NHWC",
                    padding="SAME",
                    param_attr=fluid.ParamAttr(
                        name='conv_%s_fc.w_0' % i,
                        initializer=self._param_initializer),
                    bias_attr='conv_%s_fc.b_0' % i)
                emb_out = fluid.layers.squeeze(emb_out, [1])

            emb_out = pre_post_process_layer(prev_out,
                                             emb_out,
                                             "dan",
                                             self._prepostprocess_dropout,
                                             name="conv_post_%s" % i)

        self._enc_out = emb_out

        if self._dtype == core.VarDesc.VarType.FP16:
            self._enc_out = fluid.layers.cast(x=self._enc_out,
                                              dtype=self._emb_dtype)
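The command strings passed to pre_process_layer and pre_post_process_layer above ('nd', 'dan') follow the usual convention in this family of transformer code, where each letter selects one step: 'a' residual add, 'n' layer norm, 'd' dropout. Assuming that convention, here is a minimal NumPy sketch of what the 'dan' post-processing does to each convolution output; the helper names and sizes are illustrative.

import numpy as np

def layer_norm(x, eps=1e-5):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(var + eps)

def dropout(x, rate):
    if rate == 0.0:
        return x
    keep = (np.random.rand(*x.shape) >= rate).astype(x.dtype)
    return x * keep / (1.0 - rate)            # inverted dropout

def post_process(prev_out, out, cmd, dropout_rate):
    # 'd' = dropout, 'a' = add the residual, 'n' = layer normalization
    for c in cmd:
        if c == 'd':
            out = dropout(out, dropout_rate)
        elif c == 'a':
            out = out + prev_out
        elif c == 'n':
            out = layer_norm(out)
    return out

prev_out = np.random.randn(2, 4, 8).astype("float32")
conv_out = np.random.randn(2, 4, 8).astype("float32")
out = post_process(prev_out, conv_out, "dan", dropout_rate=0.1)
print(out.shape)  # (2, 4, 8)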
Example #4
    def _gen_input(self,
                   emb_ids=None,
                   input_mask=None,
                   image_input=None,
                   emb_obj_ids=None,
                   input_type=None):
        assert input_mask is not None, "input_mask should not be None"
        assert input_type is not None, "input_type should not be None"

        self_attn_mask = input_mask
        self_attn_mask = fluid.layers.scale(x=self_attn_mask,
                                            scale=1e4,
                                            bias=-1.0,
                                            bias_after_scale=False)
        n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] *
                                                   self._n_head,
                                                   axis=1)
        n_head_self_attn_mask.stop_gradient = True
        emb_feature, _v_seq_len, _o_seq_len = None, None, None

        if emb_ids is not None:
            emb_out = None
            # text part
            for emb_name, emb_id in emb_ids.items():
                if emb_name == "sent_embedding":
                    continue  # don't use sentence embedding
                emb = fluid.layers.embedding(
                    input=emb_id,
                    size=[self._emb_vocab_size[emb_name], self._emb_size],
                    dtype=self._emb_dtype,
                    param_attr=fluid.ParamAttr(
                        name=emb_name, initializer=self._param_initializer))
                # accumulate the embedding lookups (word embedding, position embedding, ...)
                emb_out = emb if emb_out is None else emb_out + emb

            if self.text_adv_delta is not None:
                emb_out = emb_out + self.text_adv_delta

            emb_out = pre_process_layer(emb_out,
                                        'nd',
                                        self._prepostprocess_dropout,
                                        name="pre_encoder")

        if image_input is not None:
            # visual part
            if self.image_adv_delta is not None:
                emb_v_in = image_input[self._image_emb_name]
                emb_v_in = emb_v_in + self.image_adv_delta
            else:
                emb_v_in = image_input[self._image_emb_name]

            image_embeddings = fluid.layers.fc(
                emb_v_in,  # [batch_size, 37, 2048]
                self._emb_size,
                param_attr=fluid.ParamAttr(
                    name="image_emb.w_0", initializer=self._param_initializer),
                bias_attr="image_emb.b_0",
                num_flatten_dims=2)

            loc_emb_out = fluid.layers.fc(
                image_input[self._loc_emb_name],  # [batch_size, 37, 5]
                self._emb_size,
                param_attr=fluid.ParamAttr(
                    name="image_loc.w_0", initializer=self._param_initializer),
                bias_attr="image_loc.b_0",
                num_flatten_dims=2)

            emb_v_out = image_embeddings + loc_emb_out
            emb_v_out = pre_process_layer(emb_v_out,
                                          'nd',
                                          self._prepostprocess_dropout,
                                          name='v_pre_encoder')

            _v_seq_len = layers.shape(emb_v_out)[1]

        if emb_obj_ids is not None:
            emb_obj_out = None
            # text part
            for emb_obj_name, emb_obj_id in emb_obj_ids.items():
                if emb_obj_name == "sent_embedding":
                    continue  # don't use sentence embedding in roberta
                emb_obj = fluid.layers.embedding(
                    input=emb_obj_id,
                    size=[self._emb_vocab_size[emb_obj_name], self._emb_size],
                    dtype=self._emb_dtype,
                    param_attr=fluid.ParamAttr(
                        name=emb_obj_name,
                        initializer=self._param_initializer))
                emb_obj_out = emb_obj if emb_obj_out is None else emb_obj_out + emb_obj

            emb_obj_out = pre_process_layer(emb_obj_out,
                                            'nd',
                                            self._prepostprocess_dropout,
                                            name="pre_encoder")
            _o_seq_len = layers.shape(emb_obj_out)[1]

        if input_type == 'vol':
            assert emb_ids is not None and image_input is not None and emb_obj_ids is not None, "the input is invalid"
            emb_feature = fluid.layers.concat(
                [emb_v_out, emb_obj_out, emb_out], axis=1)
        elif input_type == 'vl':
            assert emb_ids is not None and image_input is not None and emb_obj_ids is None, "the input is invalid"
            emb_feature = fluid.layers.concat([emb_v_out, emb_out], axis=1)
        elif input_type == 'l':
            assert emb_ids is not None and image_input is None and emb_obj_ids is None, "the input is invalid"
            emb_feature = emb_out
        elif input_type == 'vo':
            assert emb_ids is None and image_input is not None and emb_obj_ids is not None, "the input is invalid"
            emb_feature = fluid.layers.concat([emb_v_out, emb_obj_out], axis=1)
        else:
            raise ValueError("The input type is invalid")

        return [emb_feature, n_head_self_attn_mask, _v_seq_len, _o_seq_len]
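For the 'vol' branch above, the joint sequence is the concatenation [visual, object, language] along the time axis, so the input_mask handed to _gen_input must already cover the combined length. A short NumPy sketch with illustrative sizes (not project code):

import numpy as np

emb_size = 8
v_len, o_len, l_len = 37, 10, 16           # image regions / object tokens / text tokens
emb_v_out = np.zeros((2, v_len, emb_size), dtype="float32")
emb_obj_out = np.zeros((2, o_len, emb_size), dtype="float32")
emb_out = np.zeros((2, l_len, emb_size), dtype="float32")

emb_feature = np.concatenate([emb_v_out, emb_obj_out, emb_out], axis=1)
print(emb_feature.shape)                   # (2, 63, 8)

total_len = v_len + o_len + l_len
input_mask = np.ones((2, total_len, total_len), dtype="float32")
attn_bias = (input_mask - 1.0) * 1e4       # same scale/bias trick as in the method
print(attn_bias.shape)                     # (2, 63, 63)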
Example #5
    def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
                     input_mask):
        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name,
                initializer=self._param_initializer,
                regularizer=fluid.regularizer.L2Decay(1e-3)),
            is_sparse=False)

        position_emb_out = fluid.layers.embedding(
            input=position_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(name=self._pos_emb_name,
                                       initializer=self._param_initializer))

        sent_emb_out = fluid.layers.embedding(
            sentence_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(name=self._sent_emb_name,
                                       initializer=self._param_initializer))

        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out

        if self._use_task_id:
            task_emb_out = fluid.layers.embedding(
                task_ids,
                size=[self._task_types, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=self._task_emb_name,
                    initializer=self._param_initializer))

            emb_out = emb_out + task_emb_out

        emb_out = pre_process_layer(emb_out,
                                    'nd',
                                    self._prepostprocess_dropout,
                                    name='pre_encoder')

        if self._dtype == "float16":
            emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
        self_attn_mask = fluid.layers.matmul(x=input_mask,
                                             y=input_mask,
                                             transpose_y=True)

        self_attn_mask = fluid.layers.scale(x=self_attn_mask,
                                            scale=10000.0,
                                            bias=-1.0,
                                            bias_after_scale=False)
        n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] *
                                                   self._n_head,
                                                   axis=1)
        n_head_self_attn_mask.stop_gradient = True

        self._enc_out = encoder(
            enc_input=emb_out,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd="",
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name='encoder')
Example #6
    def get_pretraining_output(self, mask_label, mask_pos):
        """Get the loss & fc_out for training"""
        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

        reshaped_emb_out = fluid.layers.reshape(
            x=self._enc_out, shape=[-1, self._emb_size])
        # extract masked tokens' feature
        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)

        # transform: fc
        mask_trans_feat = fluid.layers.fc(
            input=mask_feat,
            size=self._emb_size,
            act=self._hidden_act,
            param_attr=fluid.ParamAttr(
                name='mask_lm_trans_fc.w_0',
                initializer=self._param_initializer),
            bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
        # transform: layer norm
        mask_trans_feat = pre_process_layer(
            mask_trans_feat, 'n', name='mask_lm_trans')

        mask_lm_out_bias_attr = fluid.ParamAttr(
            name="mask_lm_out_fc.b_0",
            initializer=fluid.initializer.Constant(value=0.0))
        if self._weight_sharing:
            fc_out = fluid.layers.matmul(
                x=mask_trans_feat,
                y=fluid.default_main_program().global_block().var(
                    self._word_emb_name),
                transpose_y=True)
            fc_out += fluid.layers.create_parameter(
                shape=[self._voc_size],
                dtype=self._dtype,
                attr=mask_lm_out_bias_attr,
                is_bias=True)
        else:
            fc_out = fluid.layers.fc(input=mask_trans_feat,
                                     size=self._voc_size,
                                     param_attr=fluid.ParamAttr(
                                         name="mask_lm_out_fc.w_0",
                                         initializer=self._param_initializer),
                                     bias_attr=mask_lm_out_bias_attr)
        # generate soft labels for the cross-entropy loss
        one_hot_labels = fluid.layers.one_hot(
            input=mask_label, depth=self._voc_size)
        entity_indicator = fluid.layers.fill_constant_batch_size_like(
            input=mask_label,
            shape=[-1, (self._voc_size - self._n_relation)],
            dtype='int64',
            value=0)
        relation_indicator = fluid.layers.fill_constant_batch_size_like(
            input=mask_label,
            shape=[-1, self._n_relation],
            dtype='int64',
            value=1)
        is_relation = fluid.layers.concat(
            input=[entity_indicator, relation_indicator], axis=-1)
        soft_labels = one_hot_labels * self._soft_label \
                      + (1.0 - one_hot_labels - is_relation) \
                      * ((1.0 - self._soft_label) / (self._voc_size - 1 - self._n_relation))
        soft_labels.stop_gradient = True

        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
            logits=fc_out, label=soft_labels, soft_label=True)
        mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)

        return mean_mask_lm_loss, fc_out
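A small worked NumPy example of the soft-label construction above, with illustrative sizes (voc_size=6, n_relation=2, soft_label=0.9): the target id keeps most of the probability mass, the leftover mass is spread over the other entity ids, relation slots receive nothing when the target is an entity, and every row still sums to 1.

import numpy as np

voc_size, n_relation, soft = 6, 2, 0.9
mask_label = np.array([1, 4])              # one entity id, one relation id

one_hot = np.eye(voc_size, dtype="float32")[mask_label]             # [2, 6]
# 0 for the (voc_size - n_relation) entity slots, 1 for the relation slots
is_relation = np.concatenate([np.zeros((2, voc_size - n_relation)),
                              np.ones((2, n_relation))], axis=-1)

soft_labels = one_hot * soft + (1.0 - one_hot - is_relation) \
              * ((1.0 - soft) / (voc_size - 1 - n_relation))
print(soft_labels.round(3))
print(soft_labels.sum(axis=-1))            # [1. 1.]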
Example #7
    def _build_model(self, emb=None):
        # padding id in vocabulary must be set to 0
        if emb is None:
            if self.topo is None or self.topo.mp.size == 1:
                emb_out = fluid.layers.embedding(
                    input=self.src_ids,
                    size=[self._voc_size, self._emb_size],
                    dtype=self._emb_dtype,
                    param_attr=fluid.ParamAttr(
                        name=self._word_emb_name,
                        initializer=self._param_initializer),
                    is_sparse=False)
            else:
                self._word_emb_name = self._word_emb_name + '_' + str(
                    self.topo.mp.rank)
                src_ids = fluid.layers.squeeze(self.src_ids, [-1])
                emb_out = paddle.distributed.split(
                    src_ids,
                    size=(self._voc_size, self._emb_size),
                    operation='embedding',
                    weight_attr=fluid.ParamAttr(
                        name=self._word_emb_name,
                        initializer=self._param_initializer),
                    num_partitions=self.topo.mp.size)
        else:
            emb.stop_gradient = True
            emb_out = fluid.layers.gather_nd(emb, self.src_ids)
            emb_out.stop_gradient = False

        self.position_emb_out = fluid.layers.embedding(
            input=self.position_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(name=self._pos_emb_name,
                                       initializer=self._param_initializer))

        self.sent_emb_out = fluid.layers.embedding(
            self.sentence_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(name=self._sent_emb_name,
                                       initializer=self._param_initializer))
        """
        self.task_emb_out = fluid.layers.embedding(
            self.task_ids,
            size=[self._task_types, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._task_emb_name, initializer=self._param_initializer))
        """
        sum_emb = emb_out + self.position_emb_out
        sum_emb = sum_emb + self.sent_emb_out
        # NOTE (debug): the task embedding is intentionally not added here
        # emb_out = emb_out + task_emb_out

        # for ALBERT this should be 'n'
        # for BERT this should be 'nd'
        sum_emb = pre_process_layer(sum_emb,
                                    self.config['pre_encoder_cmd'],
                                    self._prepostprocess_dropout,
                                    name='pre_encoder',
                                    epsilon=self.config['epsilon'])

        if self.config['emb_mapping_in']:
            sum_emb = fluid.layers.fc(input=sum_emb,
                                      num_flatten_dims=2,
                                      size=self._hidden_size,
                                      param_attr=fluid.ParamAttr(
                                          name='emb_hidden_mapping',
                                          initializer=self._param_initializer),
                                      bias_attr='emb_hidden_mapping_bias')

        self_attn_mask = fluid.layers.matmul(x=self.input_mask,
                                             y=self.input_mask,
                                             transpose_y=True)

        self_attn_mask = fluid.layers.scale(x=self_attn_mask,
                                            scale=10000.0,
                                            bias=-1.0,
                                            bias_after_scale=False)
        n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] *
                                                   self._n_head,
                                                   axis=1)
        n_head_self_attn_mask.stop_gradient = True

        self._enc_out, self._checkpoints = encoder(
            enc_input=sum_emb,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._hidden_size // self._n_head,
            d_value=self._hidden_size // self._n_head,
            d_model=self._hidden_size,
            d_inner_hid=self._hidden_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd=self.config['preprocess_cmd'],
            postprocess_cmd=self.config['postprocess_cmd'],
            param_initializer=self._param_initializer,
            name='encoder',
            param_share=self._param_share,
            epsilon=self.config['epsilon'],
            n_layer_per_block=self.config['n_layer_per_block'],
            topo=self.topo,
            preln=self.preln)
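In the emb is not None branch of Example #7, a precomputed embedding table is indexed with gather_nd instead of an embedding layer. A NumPy sketch of what that lookup returns when src_ids has shape [batch, seq_len, 1] (sizes are illustrative, not project code):

import numpy as np

voc_size, emb_size = 10, 4
emb = np.arange(voc_size * emb_size, dtype="float32").reshape(voc_size, emb_size)

src_ids = np.array([[[2], [0], [7]],
                    [[5], [5], [1]]])        # [batch=2, seq_len=3, 1]

# NumPy equivalent of fluid.layers.gather_nd(emb, src_ids):
# the trailing index dimension of size 1 selects rows of the table
emb_out = emb[src_ids[..., 0]]               # [2, 3, 4]

assert np.allclose(emb_out[0, 2], emb[7])
print(emb_out.shape)  # (2, 3, 4)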