Example #1
def prepare_encoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_pad_idx,
                    src_max_len,
                    dropout=0.,
                    pos_pad_idx=0,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].

    This module is used at the bottom of the encoder stacks.
    """
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=src_pad_idx,
        param_attr=fluid.initializer.Normal(0., 1.))
    src_pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        padding_idx=pos_pad_idx,
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
    enc_input = src_word_emb + src_pos_enc

    # FIXME(guosheng): Decouple the program desc with batch_size.
    enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim])
    return layers.dropout(
        enc_input, dropout_prob=dropout,
        is_test=False) if dropout else enc_input
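A minimal NumPy sketch of what this preparation step produces, a word-embedding lookup plus a fixed position encoding summed into a [batch_size, seq_len, d_model] tensor (all dimensions below are illustrative, not from the original model):

import numpy as np

# Toy dimensions, purely for illustration.
batch_size, seq_len, vocab_size, d_model = 2, 5, 100, 8

word_emb_table = np.random.normal(0., 1., (vocab_size, d_model))
pos_enc_table = np.random.uniform(-1., 1., (seq_len, d_model))  # stand-in for the fixed position encodings

src_word = np.random.randint(0, vocab_size, (batch_size, seq_len))
src_pos = np.tile(np.arange(seq_len), (batch_size, 1))

enc_input = word_emb_table[src_word] + pos_enc_table[src_pos]   # [batch_size, seq_len, d_model]
assert enc_input.shape == (batch_size, seq_len, d_model)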
Example #2
    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """
        Scaled Dot-Product Attention
        """

        # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.

        # The current implementation of softmax_op only supports 2-D tensors,
        # so it cannot be used directly here. Reshaping does not help either:
        # the shape of `product` inferred at compile time is not its actual
        # run-time shape, so it cannot be used to set the reshape_op attribute.
        # A softmax is therefore defined below as a temporary workaround.

        def __softmax(x, eps=1e-9):
            exp_out = layers.exp(x=x)
            sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
            return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
        if dropout_rate:
            weights = layers.dropout(
                weights, dropout_prob=dropout_rate, is_test=False)
        out = layers.matmul(weights, v)
        return out
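For reference, a NumPy sketch of the computation this helper performs, softmax(QK^T / sqrt(d_model) + bias) V, on toy 2-D tensors (shapes chosen only for illustration):

import numpy as np

def scaled_dot_product_attention_np(q, k, v, attn_bias):
    # q, k, v: [seq_len, d_model]; attn_bias: [seq_len, seq_len]
    d_model = q.shape[-1]
    product = (q / np.sqrt(d_model)) @ k.T + attn_bias
    weights = np.exp(product - product.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)  # row-wise softmax
    return weights @ v

q = k = v = np.random.rand(4, 8)
out = scaled_dot_product_attention_np(q, k, v, np.zeros((4, 4)))
assert out.shape == (4, 8)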
Example #3
 def call(self, step_input, states):
     new_lstm_states = []
     for i in range(self.num_layers):
         out, new_lstm_state = self.lstm_cells[i](step_input, states[i])
         step_input = layers.dropout(
             out,
             self.dropout_prob, ) if self.dropout_prob else out
         new_lstm_states.append(new_lstm_state)
     return step_input, new_lstm_states
Example #4
def dropout(input):
    """
    dropout
    """
    dropout_rate = 0.5
    return layers.dropout(input,
                          dropout_prob=dropout_rate,
                          dropout_implementation="upscale_in_train",
                          is_test=False)
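The "upscale_in_train" mode used here rescales the surviving activations by 1/(1 - p) during training so that inference becomes an identity, while the default "downgrade_in_infer" mode leaves training outputs unscaled and multiplies by (1 - p) at inference. A minimal NumPy sketch of the upscale behaviour (an illustration, not the framework implementation):

import numpy as np

def dropout_upscale_in_train(x, p, training):
    if not training or p == 0.0:
        return x  # inference: identity, nothing to rescale
    mask = (np.random.rand(*x.shape) >= p).astype(x.dtype)
    return x * mask / (1.0 - p)  # keep the expected output equal to the input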
Example #5
    def _gen_dec_input(self, trg_word, trg_pos, trg_slf_attn_bias,
                       trg_src_words_attn_bias, trg_src_sents_attn_bias,
                       graph_attn_bias):
        emb_out = fluid.layers.embedding(
            input=trg_word,
            size=[self.voc_size, self._emb_size],
            padding_idx=self._padding_idx,  # set embedding of pad to 0
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(name=self._word_emb_name,
                                       initializer=self._param_initializer),
            is_sparse=False)
        emb_out = layers.scale(x=emb_out, scale=self._emb_size**0.5)

        position_emb_out = fluid.layers.embedding(
            input=trg_pos,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(name=self._dec_word_pos_emb_name,
                                       trainable=False))
        position_emb_out.stop_gradient = True

        emb_out = emb_out + position_emb_out
        emb_out = layers.dropout(
            emb_out,
            dropout_prob=self._prepostprocess_dropout,
            dropout_implementation="upscale_in_train",
            is_test=False) if self._prepostprocess_dropout else emb_out

        if self._dtype == "float16":
            emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
            if trg_slf_attn_bias is not None:
                trg_slf_attn_bias = fluid.layers.cast(x=trg_slf_attn_bias,
                                                      dtype=self._dtype)

            if trg_src_words_attn_bias is not None:
                trg_src_words_attn_bias = fluid.layers.cast(
                    x=trg_src_words_attn_bias, dtype=self._dtype)

            if trg_src_sents_attn_bias is not None:
                trg_src_sents_attn_bias = fluid.layers.cast(
                    x=trg_src_sents_attn_bias, dtype=self._dtype)

            if graph_attn_bias is not None:
                graph_attn_bias = fluid.layers.cast(x=graph_attn_bias,
                                                    dtype=self._dtype)

        res = namedtuple('results', [
            'emb_out', 'trg_slf_attn_bias', 'trg_src_words_attn_bias',
            'trg_src_sents_attn_bias', 'graph_attn_bias'
        ])

        return res(emb_out=emb_out,
                   trg_slf_attn_bias=trg_slf_attn_bias,
                   trg_src_words_attn_bias=trg_src_words_attn_bias,
                   trg_src_sents_attn_bias=trg_src_sents_attn_bias,
                   graph_attn_bias=graph_attn_bias)
Example #6
    def get_single_direction_output(rnn_input,
                                    encode_hidden,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        rnn = StaticRNN()
        #print(rnn_input.shape)
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                else:
                    pre_hidden = rnn.memory(batch_ref=rnn_input,
                                            shape=[-1, hidden_size],
                                            ref_batch_dim_idx=1)
                encode_h = encode_hidden[i]
                pre_encode_hidden = layers.concat([pre_hidden, encode_h], axis=1)
                new_hidden = unit_list[i](step_input, pre_encode_hidden)

                if mask:
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                        pre_hidden, (step_mask - 1), axis=0)
                rnn.update_memory(pre_hidden, new_hidden)

                rnn.step_output(new_hidden)

                step_input = new_hidden
                if dropout_prob is not None and dropout_prob > 0.0:
                    step_input = layers.dropout(step_input, dropout_prob=dropout_prob, )

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        all_hidden_array = []  # also collect all hidden states
        rnn_output = rnn_out[-1]

        for i in range(num_layers):
            last_hidden = rnn_out[i]
            all_hidden_array.append(last_hidden)
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)

        all_hidden_array = layers.concat(all_hidden_array, axis=0)
        all_hidden_array = layers.reshape(all_hidden_array, shape=[num_layers, input.shape[0], -1, hidden_size])
        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(last_hidden_output, shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output, all_hidden_array
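The masked update new_hidden * step_mask - pre_hidden * (step_mask - 1) used above simply selects the new hidden state at valid steps and keeps the previous one at padded steps; a quick NumPy check of that identity (toy values):

import numpy as np

new, pre = np.random.rand(3, 4), np.random.rand(3, 4)
mask = np.array([1., 0., 1.])[:, None]  # 1 = valid step, 0 = padding
assert np.allclose(new * mask - pre * (mask - 1), mask * new + (1 - mask) * pre)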
Example #7
    def __init__(self, cfg, name=None):
        super(ErnieSiameseNet, self).__init__(cfg, name=None)

        self.triplet_margin = cfg.pop("triplet_margin", 1.0)
        logging.info("triplet_margin: {}".format(self.triplet_margin))

        prob = cfg.get('classifier_dropout_prob', cfg['hidden_dropout_prob'])
        logging.info("emb dropout: {}".format(prob))

        self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i
Example #8
 def __init__(self, cfg, name=None):
     super(PositionwiseFeedForwardLayer, self).__init__()
     initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range'])
     d_model = cfg['hidden_size']
     d_ffn = cfg.get('intermediate_size', 4 * d_model)
     assert cfg['hidden_act'] in ['relu', 'gelu']
     self.i = _build_linear(d_model, d_ffn, append_name(name, 'fc_0'), initializer, act=cfg['hidden_act'])
     self.o = _build_linear(d_ffn, d_model, append_name(name, 'fc_1'), initializer)
     prob = cfg.get('intermediate_dropout_prob', 0.)
     self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i
Example #9
 def __call__(self, input):
     if self.dropout_prob > 0.0:
         return layers.dropout(input,
                               dropout_prob=self.dropout_prob,
                               is_test=self.is_test,
                               seed=self.seed,
                               name=self.name,
                               dropout_implementation='upscale_in_train')
     else:
         return input
Example #10
def dropout(input, test_mode, args):
    if args.dropout and (not test_mode):
        return layers.dropout(
            input,
            dropout_prob=args.dropout,
            dropout_implementation="upscale_in_train",
            seed=args.random_seed,
            is_test=False)
    else:
        return input
Example #11
def dropout(input, args):
    """Dropout function"""
    if args.drop_rate:
        return layers.dropout(
            input,
            dropout_prob=args.drop_rate,
            seed=args.random_seed,
            is_test=False)
    else:
        return input
Example #12
    def build_model(self):
        node_features = self.graph_wrapper.node_feat["feat"]

        output = self.gcn(gw=self.graph_wrapper,
                          feature=node_features,
                          hidden_size=self.hidden_size,
                          activation="relu",
                          norm=self.graph_wrapper.node_feat["norm"],
                          name="gcn_layer_1")
        output1 = output
        output = self.gcn(gw=self.graph_wrapper,
                          feature=output,
                          hidden_size=self.hidden_size,
                          activation="relu",
                          norm=self.graph_wrapper.node_feat["norm"],
                          name="gcn_layer_2")
        output2 = output
        output = self.gcn(gw=self.graph_wrapper,
                          feature=output,
                          hidden_size=self.hidden_size,
                          activation="relu",
                          norm=self.graph_wrapper.node_feat["norm"],
                          name="gcn_layer_3")

        output = L.concat(input=[output1, output2, output], axis=-1)

        output, ratio_length = sag_pool(gw=self.graph_wrapper,
                                        feature=output,
                                        ratio=self.pooling_ratio,
                                        graph_id=self.graph_id,
                                        dataset=self.args.dataset_name,
                                        name="sag_pool_1")
        output = L.lod_reset(output, self.graph_wrapper.graph_lod)
        cat1 = L.sequence_pool(output, "sum")
        ratio_length = L.cast(ratio_length, dtype="float32")
        cat1 = L.elementwise_div(cat1, ratio_length, axis=-1)
        cat2 = L.sequence_pool(output, "max")
        output = L.concat(input=[cat2, cat1], axis=-1)

        output = L.fc(output, size=self.hidden_size, act="relu")
        output = L.dropout(output, dropout_prob=self.dropout_ratio)
        output = L.fc(output, size=self.hidden_size // 2, act="relu")
        output = L.fc(output,
                      size=self.num_classes,
                      act=None,
                      param_attr=fluid.ParamAttr(name="final_fc"))

        self.labels = L.cast(self.labels, dtype="float32")
        loss = L.sigmoid_cross_entropy_with_logits(x=output, label=self.labels)
        self.loss = L.mean(loss)
        pred = L.sigmoid(output)
        self.pred = L.argmax(x=pred, axis=-1)
        correct = L.equal(self.pred, self.labels_1dim)
        correct = L.cast(correct, dtype="int32")
        self.correct = L.reduce_sum(correct)
Example #13
def rnn_decoder(gru_unit,
                cue_gru_unit,
                input,
                input_size,
                hidden_size,
                num_layers,
                memory,
                memory_mask,
                knowledge,
                output_size,
                init_hidden=None,
                mask=None,
                dropout=0.0,
                batch_first=True,
                name="decoder"):
    """ rnn decoder """
    input_emb = get_embedding(input, input_size, output_size)
    if batch_first:
        input_emb = layers.transpose(input_emb, perm=[1, 0, 2])
        if mask:
            trans_mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input_emb)
        step_mask = None

        if mask:
            step_mask = rnn.step_input(trans_mask)

        # split pre_hidden
        pre_hidden_list = []

        pre_hidden = rnn.memory(init=init_hidden)
        real_out, last_hidden = \
            decoder_step(gru_unit, cue_gru_unit, step_in, pre_hidden, input_size,
                         hidden_size, memory, memory_mask, knowledge, mask=step_mask)

        rnn.update_memory(pre_hidden, last_hidden)

        step_in = layers.squeeze(real_out, axes=[1])
        rnn.step_output(step_in)

    rnnout = rnn()
    rnnout = layers.transpose(rnnout, perm=[1, 0, 2])
    rnnout = layers.elementwise_mul(rnnout, mask, axis=0)

    output_in_size = hidden_size + hidden_size
    rnnout = layers.dropout(rnnout, dropout_prob=dropout)
    rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
    rnnout = fc(rnnout, hidden_size, output_size, name='dec_out_fc2')

    softmax_out = layers.softmax(rnnout)

    return softmax_out
Example #14
    def __init__(self, cfg, name=None):
        super(ErnieBlock, self).__init__()
        d_model = cfg['hidden_size']
        initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range'])

        self.attn = AttentionLayer(cfg, name = append_name(name, 'multi_head_att'))
        self.ln1 = _build_ln(d_model, name = append_name(name, 'post_att'))
        self.ffn = PositionwiseFeedForwardLayer(cfg, name = append_name(name, 'ffn'))
        self.ln2 = _build_ln(d_model, name = append_name(name, 'post_ffn'))
        prob = cfg.get('intermediate_dropout_prob', cfg['hidden_dropout_prob'])
        self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i
Example #15
 def call(self, step_input, states):
     new_states = []
     for i in range(self.num_layers):
         out, new_state = self.lstm_cells[i](step_input, states[i])
         step_input = layers.dropout(
             out,
             self.dropout_prob,
             dropout_implementation="upscale_in_train"
         ) if self.dropout_prob > 0 else out
         new_states.append(new_state)
     return step_input, new_states
Example #16
 def forward(self, src_word, src_pos, src_slf_attn_bias):
     word_emb = self.word_embedder(src_word)
     word_emb = layers.scale(x=word_emb, scale=self.emb_dim**0.5)
     pos_enc = self.pos_encoder(src_pos)
     pos_enc.stop_gradient = True
     emb = word_emb + pos_enc
     enc_input = layers.dropout(emb,
                                dropout_prob=self.emb_dropout,
                                is_test=False) if self.emb_dropout else emb
     enc_output = self.encoder(enc_input, src_slf_attn_bias)
     return enc_output
Example #17
 def forward(self, seq, mask=None):
     h = self.conv1d(seq)
     g, h = h[:, :, :self.num_filters], h[:, :, self.num_filters:]
     if self.dropout_rate:
         g = layers.dropout(g, dropout_prob=self.dropout_rate, dropout_implementation="upscale_in_train",
                            is_test=not self.training)
     g = layers.sigmoid(g)
     seq = g * seq + (1 - g) * h
     if mask is not None:
         seq = seq * mask
     return seq
Example #18
 def forward(self, step_input, states):
     new_states = []
     for i, lstm_cell in enumerate(self.lstm_cells):
         out, new_state = lstm_cell(step_input, states[i])
         step_input = layers.dropout(
             out,
             self.dropout_prob,
             dropout_implementation='upscale_in_train'
         ) if self.dropout_prob > 0 else out
         new_states.append(new_state)
     return step_input, new_states
Example #19
    def forward(self, x, speaker_embed=None):
        """
        Encode text sequence.
        
        Args:
            x (Variable): shape(B, T_enc), dtype: int64. The input text indices. T_enc means the number of time steps of the input text x.
            speaker_embed (Variable, optional): shape(B, C_sp), dtype float32, speaker embeddings. This arg is not None only when the model is a multispeaker model.

        Returns:
            keys (Variable), Shape(B, T_enc, C_emb), dtype float32, the encoded representation for keys, where C_emb means the text embedding size.
            values (Variable), Shape(B, T_enc, C_emb), dtype float32, the encoded representation for values.
        """
        x = self.embed(x)
        x = F.dropout(x,
                      self.dropout,
                      dropout_implementation="upscale_in_train")
        x = F.transpose(x, [0, 2, 1])

        if self.n_speakers > 1 and speaker_embed is not None:
            speaker_embed = F.dropout(
                speaker_embed,
                self.dropout,
                dropout_implementation="upscale_in_train")
            x = F.elementwise_add(x, self.sp_proj1(speaker_embed), axis=0)

        input_embed = x
        for layer in self.convolutions:
            if isinstance(layer, Conv1DGLU):
                x = layer(x, speaker_embed)
            else:
                # layer is a Conv1D with (1,) filter wrapped by WeightNormWrapper
                x = layer(x)

        if self.n_speakers > 1 and speaker_embed is not None:
            x = F.elementwise_add(x, self.sp_proj2(speaker_embed), axis=0)

        keys = x  # (B, C, T)
        values = F.scale(input_embed + x, scale=np.sqrt(0.5))
        keys = F.transpose(keys, [0, 2, 1])
        values = F.transpose(values, [0, 2, 1])
        return keys, values
Example #20
    def forward(self, q, k, v, lengths, speaker_embed, start_index, 
                force_monotonic=False, prev_coeffs=None, window=None):
        # add position encoding as an inductive bias 
        if self.has_bias: # multi-speaker model
            omega_q = 2 * F.sigmoid(
                F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
            omega_k = 2 * self.omega_initial * F.sigmoid(F.squeeze(
                self.k_pos_affine(speaker_embed), axes=[-1]))
        else: # single-speaker case
            batch_size = q.shape[0]
            omega_q = F.ones((batch_size, ), dtype="float32")
            omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default
        q += self.position_encoding_weight * positional_encoding(q, start_index, omega_q)
        k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

        q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
        activations = F.matmul(q, k, transpose_y=True)
        activations /= np.sqrt(self.attention_dim)

        if self.training:
            # mask the <pad> parts from the encoder
            mask = F.sequence_mask(lengths, dtype="float32")
            attn_bias = F.scale(1. - mask, -1000)
            activations += F.unsqueeze(attn_bias, [1])
        elif force_monotonic:
            assert window is not None
            backward_step, forward_step = window
            T_enc = k.shape[1]
            batch_size, T_dec, _ = q.shape

            # actually T_dec = 1 here
            alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
                   if prev_coeffs is None \
                   else F.argmax(prev_coeffs, axis=-1)
            backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool")
            forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool")
            mask = F.cast(F.logical_xor(backward, forward), "float32")
            # print("mask's shape:", mask.shape)
            attn_bias = F.scale(1. - mask, -1000)
            activations += attn_bias

        # softmax
        coefficients = F.softmax(activations, axis=-1)
        # context vector
        coefficients = F.dropout(coefficients, 1. - self.keep_prob,
                                 dropout_implementation='upscale_in_train')
        contexts = F.matmul(coefficients, v)
        # context normalization
        enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
        contexts *= F.sqrt(enc_lengths)
        # out affine
        contexts = self.out_affine(contexts)
        return contexts, coefficients
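In the force_monotonic branch above, two sequence_mask calls XOR-ed together confine attention to a window around the previously attended position before being turned into a large negative bias. A NumPy sketch of how that window mask comes out (toy values, for illustration only):

import numpy as np

def window_mask(alpha, backward_step, forward_step, T_enc):
    # alpha: [batch] previous attention argmax; the result is 1 inside
    # [alpha - backward_step, alpha + forward_step) and 0 elsewhere.
    pos = np.arange(T_enc)[None, :]
    backward = pos < (alpha[:, None] - backward_step)
    forward = pos < (alpha[:, None] + forward_step)
    return np.logical_xor(backward, forward).astype("float32")

mask = window_mask(np.array([3, 7]), backward_step=1, forward_step=4, T_enc=10)
# rows are 1 only at positions 2..6 and 6..9 respectively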
Example #21
    def forward(self, features):
        src_ids, sent_ids = features
        dtype = 'float16' if self.hparam['fp16'] else 'float32'
        zero = L.fill_constant([1], dtype='int64', value=0)
        input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), dtype) # assume pad id == 0
        #input_mask = L.unsqueeze(input_mask, axes=[2])
        d_shape = L.shape(src_ids)
        seqlen = d_shape[1]
        batch_size = d_shape[0]
        pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
        pos_ids = L.expand(pos_ids, [batch_size, 1])
        pos_ids = L.unsqueeze(pos_ids, axes=[2])
        pos_ids = L.cast(pos_ids, 'int64')
        pos_ids.stop_gradient = True
        input_mask.stop_gradient = True
        task_ids = L.zeros_like(src_ids) + self.hparam.task_id  # task ids are not used at the moment
        task_ids.stop_gradient = True

        bert = ErnieModel(
            src_ids=src_ids,
            position_ids=pos_ids,
            sentence_ids=sent_ids,
            task_ids=task_ids,
            input_mask=input_mask,
            config=self.hparam,
            use_fp16=self.hparam['fp16']
        )

        cls_feats = bert.get_pooled_output()

        cls_feats = L.dropout(
            x=cls_feats,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train"
        )

        logits = L.fc(
            input=cls_feats,
            size=self.hparam['num_label'],
            param_attr=F.ParamAttr(
                name="cls_out_w",
                initializer=F.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=F.ParamAttr(
                name="cls_out_b", initializer=F.initializer.Constant(0.))
        )

        propeller.summary.histogram('pred', logits)

        if self.mode is propeller.RunMode.PREDICT:
            probs = L.softmax(logits)
            return probs
        else:
            return logits
Example #22
def dropout(input, test_mode, args):
    dropout1 = 0.1
    test_mode = False
    random_seed = 123
    if dropout1 and (not test_mode):
        return layers.dropout(input,
                              dropout_prob=dropout1,
                              dropout_implementation="upscale_in_train",
                              seed=random_seed,
                              is_test=False)
    else:
        return input
Example #23
    def graph_scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                           dropout_rate, graph_attn_bias,
                                           pos_win):
        """
        Graph Scaled Dot-Product Attention
        :param q: (batch_size, n_heads, query_len, dim_per_head)
        :param k: (batch_size, n_heads, key_s_len, dim_per_head)
        :param v: (batch_size, n_heads, key_s_len, dim_per_head)
        :param attn_bias: (batch_size, n_head, query_len, key_s_len)
        :param graph_attn_bias: (batch_size, n_head, key_s_len, key_s_len)
        :return:
            proj_out: [batch, n_heads, query_len, dim_per_head]
            weights: [batch, n_heads, query_len, key_s_len]
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(
            x=scaled_q, y=k,
            transpose_y=True)  # (batch_size, n_heads, tgt_len, n_block)
        if attn_bias:
            product += attn_bias  # (batch_size, n_heads, tgt_len, n_block)

        if graph_attn_bias:
            # re-weight the attention score with gaussian weights
            gaussian_w = __compute_graph_bias(scaled_q, graph_attn_bias,
                                              pos_win)
            product += gaussian_w  # [batch, n_heads, query_len, key_s_len]

        weights = layers.softmax(
            product)  # [batch, n_heads, query_len, key_s_len]

        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     seed=dropout_seed,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)

        out = layers.matmul(weights,
                            v)  # [batch, n_heads, query_len, dim_per_head]

        # Project back to the model size.
        combine_out = __combine_heads_sent(
            out)  # (batch_size, query_len, emb_dim)
        proj_out = layers.fc(input=combine_out,
                             size=d_model,
                             num_flatten_dims=2,
                             param_attr=fluid.ParamAttr(
                                 name=name + '_sen_fc.w_0',
                                 initializer=param_initializer),
                             bias_attr=name + '_sen_fc.b_0')

        return proj_out, weights
Example #24
 def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
     scaled_q = layers.scale(x=q, scale=d_key**-0.5)
     product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
     if attn_bias:
         product += attn_bias
     weights = layers.softmax(product)
     if dropout_rate:
         weights = layers.dropout(weights,
                                  dropout_prob=dropout_rate,
                                  dropout_implementation="upscale_in_train",
                                  is_test=False)
     out = layers.matmul(weights, v)
     return out
Example #25
 def forward(self, x):
     """
     forward
     :param x:
     :return:
     """
     hidden = self._i2h(x)
     if self._dropout_rate:
         hidden = layers.dropout(hidden,
                                 dropout_prob=self._dropout_rate,
                                 is_test=False)
     out = self._h2o(hidden)
     return out
Example #26
 def call(self, step_input, states):
     lstm_states = states
     new_lstm_states = []
     step_input = layers.concat([step_input, self.latent_z], 1)
     for i in range(self.num_layers):
         out, lstm_state = self.lstm_cells[i](step_input, lstm_states[i])
         step_input = layers.dropout(
             out,
             self.dropout_prob,
             dropout_implementation="upscale_in_train"
         ) if self.dropout_prob > 0 else out
         new_lstm_states.append(lstm_state)
     return step_input, new_lstm_states
Example #27
 def _build_encoder(self):
     self.enc_input = layers.dropout(
         self.src_emb,
         dropout_prob=self.enc_dropout_in,
         dropout_implementation="upscale_in_train")
     enc_cell = EncoderCell(self.num_layers, self.hidden_size,
                            self.param_attr_initializer,
                            self.param_attr_scale, self.enc_dropout_out)
     enc_output, enc_final_state = rnn(
         cell=enc_cell,
         inputs=self.enc_input,
         sequence_length=self.src_sequence_length)
     return enc_output, enc_final_state
Example #28
 def __call__(self, input, hidden):
     assert len(hidden) == self.num_layers
     new_hidden = []
     for cell, hid in zip(self.cells, hidden):
         input, new_hid = cell(input, hid)
         new_hidden += [new_hid]
         if self.dropout > 0:
             input = layers.dropout(
                 input,
                 dropout_prob=self.dropout,
                 dropout_implementation='upscale_in_train')
     output = new_hidden[-1]
     return output, new_hidden
Example #29
def prepare_encoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_phone,
                    src_phone_mask,
                    phone_vocab_size,
                    src_emb_dim,
                    src_max_len,
                    beta=0.0,
                    dropout_rate=0.,
                    bos_idx=0,
                    phone_pad_idx=-1,
                    word_emb_param_name=None,
                    phone_emb_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the encoder stacks.
    """
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=bos_idx,  # set embedding of bos to 0
        param_attr=fluid.ParamAttr(
            name=word_emb_param_name,
            initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)

    # shape [batch_size, max_seq_len, max_phone_len, dim]
    src_phone_emb = layers.embedding(
        src_phone,
        size=[phone_vocab_size, src_emb_dim],
        padding_idx=phone_pad_idx,  # set embedding of phone_pad_idx to 0
        param_attr=fluid.ParamAttr(
            name=phone_emb_param_name,
            initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
    sum_phone_emb = layers.reduce_sum(src_phone_emb, dim=2)
    float_mask = layers.cast(src_phone_mask, dtype='float32')
    sum_mask = layers.reduce_sum(float_mask, dim=2) + 1e-9
    mean_phone_emb = layers.elementwise_div(sum_phone_emb, sum_mask, axis=0)

    src_pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_names[0], trainable=False))
    src_pos_enc.stop_gradient = True
    enc_input = (
        1 - beta) * src_word_emb + beta * mean_phone_emb + src_pos_enc
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate, seed=dropout_seed,
        is_test=False) if dropout_rate else enc_input
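The phone branch above computes a masked mean: per-word phone embeddings are summed over the phone axis and divided by the number of valid phones, with a small epsilon to avoid division by zero. A NumPy sketch of the same reduction (shapes are illustrative):

import numpy as np

src_phone_emb = np.random.rand(2, 5, 3, 8)           # [batch, seq_len, max_phone_len, emb_dim]
src_phone_mask = np.random.randint(0, 2, (2, 5, 3))  # 1 for real phones, 0 for padding

sum_phone_emb = src_phone_emb.sum(axis=2)                        # [batch, seq_len, emb_dim]
sum_mask = src_phone_mask.astype("float32").sum(axis=2) + 1e-9   # [batch, seq_len]
mean_phone_emb = sum_phone_emb / sum_mask[..., None]             # [batch, seq_len, emb_dim]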
Example #30
    def __init__(self, cfg, name=None):
        super(ErnieModelForTokenClassification, self).__init__(cfg, name=name)

        initializer = F.initializer.TruncatedNormal(
            scale=cfg['initializer_range'])
        self.classifier = _build_linear(cfg['hidden_size'], cfg['num_labels'],
                                        append_name(name, 'cls'), initializer)

        prob = cfg.get('classifier_dropout_prob', cfg['hidden_dropout_prob'])
        self.dropout = lambda i: L.dropout(
            i,
            dropout_prob=prob,
            dropout_implementation="upscale_in_train",
        ) if self.training else i
Example #31
 def __scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                    dropout_rate):
     """Scaled Dot-Product Attention"""
     product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5)
     if attn_bias:
         product += attn_bias
     weights = layers.softmax(product, use_cudnn=True)
     if dropout_rate:
         weights = layers.dropout(weights,
                                  dropout_prob=dropout_rate,
                                  dropout_implementation="upscale_in_train",
                                  is_test=False)
     out = layers.matmul(weights, v)
     return out
Example #32
 def embedding(self, t1, t2, mask=None):
     pv = self.pe(position_id(t1))
     t1 = self.ce(t1)
     t2 = self.we(self.we_p(t2))
     t = t1 + t2 + pv
     if self.dropout_rate:
         t = layers.dropout(
             t,
             dropout_prob=self.dropout_rate,
             dropout_implementation="upscale_in_train",
             is_test=not self.training)
     if mask is not None:
         t = t * mask
     return t
Example #33
def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
    """
    Add residual connection, layer normalization and dropout to the out tensor
    optionally according to the value of process_cmd.

    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.initializer.Constant(1.),
                bias_attr=fluid.initializer.Constant(0.))
        elif cmd == "d":  # add dropout
            if dropout:
                out = layers.dropout(out, dropout_prob=dropout, is_test=False)
    return out
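In Transformer-style code this helper is usually bound twice, once as a pre-process step (e.g. layer normalization only) and once as a post-process step (e.g. dropout followed by the residual add). A toy NumPy version of the cmd-string dispatch, with the learnable layer-norm scale and bias omitted, purely to illustrate the control flow:

import numpy as np

def pre_post_process_np(prev_out, out, process_cmd, dropout=0.0, training=True):
    for cmd in process_cmd:
        if cmd == "a" and prev_out is not None:  # residual connection
            out = out + prev_out
        elif cmd == "n":  # layer normalization over the last axis
            mean = out.mean(axis=-1, keepdims=True)
            std = out.std(axis=-1, keepdims=True) + 1e-6
            out = (out - mean) / std
        elif cmd == "d" and dropout and training:  # dropout: mask without rescaling,
            mask = np.random.rand(*out.shape) >= dropout  # matching the default mode used above
            out = out * mask
    return out

x = np.random.rand(2, 4, 8)
y = pre_post_process_np(x, np.random.rand(2, 4, 8), "dan", dropout=0.1)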