Example No. 1
def test_softmax(self):
    program = Program()
    with program_guard(program):
        data = layers.data(name='data', shape=[10], dtype='float32')
        hid = layers.fc(input=data, size=20)
        self.assertIsNotNone(layers.softmax(hid))
    print(str(program))
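As a quick illustrative check (plain NumPy, not part of the example above), layers.softmax exponentiates and normalizes along the last axis:

import numpy as np

x = np.array([[1.0, 2.0, 3.0]])
e = np.exp(x - x.max(axis=-1, keepdims=True))  # subtract the max for numerical stability
print((e / e.sum(axis=-1, keepdims=True)).round(4))  # -> [[0.09   0.2447 0.6652]]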
Example No. 2
def scaled_dot_product_attention(q,
                                 k,
                                 v,
                                 attn_bias,
                                 d_key,
                                 dropout_rate,
                                 is_test=False):
    """
    Scaled Dot-Product Attention
    """
    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    if attn_bias is not None:
        product += attn_bias
    weights = layers.softmax(product, use_cudnn=True)
    if dropout_rate:
        weights = layers.dropout(weights,
                                 dropout_prob=dropout_rate,
                                 dropout_implementation="upscale_in_train",
                                 is_test=is_test)
    out = layers.matmul(weights, v)
    return out
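A minimal sketch of how this helper might be wired up in a fluid 1.x static graph. The names and sizes (q/k/v placeholders, 8 heads, seq_len 32, d_key 64) are illustrative assumptions, not from the original source:

import paddle.fluid as fluid
import paddle.fluid.layers as layers

main_prog = fluid.Program()
with fluid.program_guard(main_prog):
    # [batch, n_head, seq_len, d_key] inputs, as the function expects
    q = layers.data(name='q', shape=[8, 32, 64], dtype='float32')
    k = layers.data(name='k', shape=[8, 32, 64], dtype='float32')
    v = layers.data(name='v', shape=[8, 32, 64], dtype='float32')
    out = scaled_dot_product_attention(q, k, v, attn_bias=None,
                                       d_key=64, dropout_rate=0.1)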
Example No. 3
    def forward(self, inputs, labels=None, logits_softmax=False):
        """Forward pass."""
        emb = self.embedding(inputs)

        hid_fc1 = self._hid_fc1(emb)

        gru_forward = self._gru_forward(hid_fc1)
        gru_forward_tanh = L.tanh(gru_forward)
        if self.bi_direction:
            gru_backward = self._gru_backward(hid_fc1)
            gru_backward_tanh = L.tanh(gru_backward)
            encoded_vector = L.concat(
                input=[gru_forward_tanh, gru_backward_tanh], axis=2)
            encoded_vector = L.reduce_max(encoded_vector, dim=1)
        else:
            encoded_vector = L.reduce_max(gru_forward_tanh, dim=1)

        hid_fc_2 = self._hid_fc2(encoded_vector)

        raw_logits = self._output_fc(hid_fc_2)

        # Return logits after softmax if requested
        logits = L.softmax(raw_logits) if logits_softmax else raw_logits

        # If no labels are given, return only the logits
        if labels is None:
            return logits

        if len(labels.shape) == 1:
            labels = L.reshape(labels, [-1, 1])

        # softmax_with_cross_entropy expects raw (pre-softmax) logits, so the
        # loss is computed from raw_logits regardless of logits_softmax
        loss = L.softmax_with_cross_entropy(raw_logits, labels)
        loss = L.reduce_mean(loss)
        return loss, logits
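A tiny NumPy check of why softmax_with_cross_entropy must see raw logits: applying softmax twice flattens the distribution, which would distort the loss.

import numpy as np

logits = np.array([2.0, 0.5, -1.0])
p1 = np.exp(logits) / np.exp(logits).sum()  # softmax once: [0.786 0.175 0.039]
p2 = np.exp(p1) / np.exp(p1).sum()          # softmax again: much closer to uniform
print(p1.round(3), p2.round(3))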
Example No. 4
    def forward(self, *args, **kwargs):
        """
        Args:
            logits_softmax (optional, boolean):
                if True, return logits after softmax
        Returns:
            loss (`Variable` of shape []):
                cross-entropy loss averaged over the batch;
                not returned if `labels` is not set
            logits (`Variable` of shape [batch_size, hidden_size]):
                output logits of the classifier
        """
        logits_softmax = kwargs.pop("logits_softmax", False)
        loss, logits = super(ErnieModelCustomized, self).forward(*args,
                **kwargs)
        if logits_softmax:
            logits = L.softmax(logits, use_cudnn=True)

        if loss is None:
            return logits
        else:
            return loss, logits
Example No. 5
    def compute_mog_loss(self, y, t):
        """compute the loss where output distribution is a mixture of Gaussians.

        Args:
            y (Variable): shape(B, T, C_output), dtype float32, the parameterd of the output distribution. It is the concatenation of 3 parts, the logits of every distribution, the mean of each distribution and the log standard deviation of each distribution. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.
            t (Variable): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation.

        Returns:
            Variable: shape(1, ), dtype float32, the loss.
        """
        n_mixture = self.output_dim // 3

        # context size is not taken in to account
        y = y[:, self.context_size:, :]
        t = t[:, self.context_size:]

        w, mu, log_std = F.split(y, 3, dim=2)
        # 100.0 is just a large float
        log_std = F.clip(log_std, min=self.log_scale_min, max=100.)
        inv_std = F.exp(-log_std)
        p_mixture = F.softmax(w, axis=-1)

        t = F.unsqueeze(t, axes=[-1])
        if n_mixture > 1:
            # t = F.expand_as(t, log_std)
            t = F.expand(t, [1, 1, n_mixture])

        x_std = inv_std * (t - mu)
        exponent = F.exp(-0.5 * x_std * x_std)
        pdf_x = 1.0 / math.sqrt(2.0 * math.pi) * inv_std * exponent

        pdf_x = p_mixture * pdf_x
        # pdf_x: [bs, len]
        pdf_x = F.reduce_sum(pdf_x, dim=-1)
        per_sample_loss = -F.log(pdf_x + 1e-9)

        loss = F.reduce_mean(per_sample_loss)
        return loss
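For reference, a NumPy transcription of the same mixture-of-Gaussians negative log-likelihood (illustrative only; array shapes follow the docstring above):

import math
import numpy as np

def mog_nll(t, w, mu, log_std):
    """t: (B, T); w, mu, log_std: (B, T, n_mixture)."""
    p = np.exp(w) / np.exp(w).sum(-1, keepdims=True)  # softmax over mixtures
    inv_std = np.exp(-log_std)
    x_std = inv_std * (t[..., None] - mu)
    pdf = inv_std * np.exp(-0.5 * x_std ** 2) / math.sqrt(2 * math.pi)
    return -np.log((p * pdf).sum(-1) + 1e-9).mean()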
Example No. 6
    def forward(self, inputs, labels=None, logits_softmax=False):
        """Forward pass."""
        # inputs shape = [batch_size, seq_len]

        # emb shape = [batch_size, seq_len, emb_dim]
        emb = self.embedding(inputs)

        conv_pool_res = self.textcnn(emb)

        hid_fc = self._hid_fc(conv_pool_res)

        raw_logits = self._output_fc(hid_fc)

        # Return logits after softmax if requested
        logits = L.softmax(raw_logits) if logits_softmax else raw_logits

        # If no labels are given, return only the logits
        if labels is None:
            return logits

        # Reshape labels to [-1, 1]
        if len(labels.shape) == 1:
            labels = L.reshape(labels, [-1, 1])

        # softmax_with_cross_entropy expects raw (pre-softmax) logits, so the
        # loss is computed from raw_logits regardless of logits_softmax
        loss = L.softmax_with_cross_entropy(raw_logits, labels)
        loss = L.reduce_mean(loss)
        return loss, logits
Example No. 7
def _dot_product_relative(q,
                          k,
                          v,
                          bias,
                          dropout=0.1,
                          cache=None,
                          params_type="normal"):
    depth_constant = int(k.shape[3])
    heads = layers.shape(k)[1]
    length = layers.shape(k)[2]

    max_relative_position = 4
    pre_name = "relative_positions_"
    if params_type == "fixed":
        pre_name = "fixed_relative_positions_"
    elif params_type == "new":
        pre_name = "new_relative_positions_"
    relations_keys = generate_relative_positions_embeddings(
        length,
        depth_constant,
        max_relative_position,
        name=pre_name + "keys",
        cache=cache is not None)

    relations_values = generate_relative_positions_embeddings(
        length,
        depth_constant,
        max_relative_position,
        name=pre_name + "values",
        cache=cache is not None)

    logits = _relative_attention_inner(q, k, relations_keys, True)

    if bias is not None:
        logits += bias
    weights = layers.softmax(logits, name="attention_weights")
    weights = layers.dropout(weights, dropout_prob=float(dropout))
    output = _relative_attention_inner(weights, v, relations_values, False)
    return output
Example No. 8
    def attention(self, hidden, encoder_output, encoder_output_proj,
                  encoder_padding_mask):
        # Attention used to compute the context vector c_i; Bahdanau attention
        # is used here
        decoder_state_proj = layers.unsqueeze(
            layers.fc(hidden, size=self.hidden_size, bias_attr=False), [1])
        # tile the decoder state across the encoder time dimension
        mixed_state = fluid.layers.elementwise_add(
            encoder_output_proj,
            layers.expand(decoder_state_proj,
                          [1, layers.shape(encoder_output_proj)[1], 1]))
        attn_scores = layers.squeeze(
            layers.fc(input=mixed_state,
                      size=1,
                      num_flatten_dims=2,
                      bias_attr=False), [2])
        if encoder_padding_mask is not None:
            attn_scores = layers.elementwise_add(attn_scores,
                                                 encoder_padding_mask)
        attn_scores = layers.softmax(attn_scores)
        context = layers.reduce_sum(layers.elementwise_mul(encoder_output,
                                                           attn_scores,
                                                           axis=0),
                                    dim=1)
        return context
Example No. 9
    def forward(self, x):
        b, c, h, w = x.shape

        f_query = self.conv_query(x)
        f_query = reshape(f_query, (b, -1, h * w))
        f_query = transpose(f_query, (0, 2, 1))

        f_key = self.conv_key(x)
        f_key = reshape(f_key, (b, -1, h * w))

        f_value = self.conv_value(x)
        f_value = reshape(f_value, (b, -1, h * w))
        f_value = transpose(f_value, (0, 2, 1))

        f_similarity = bmm(f_query, f_key)  # [b, h*w, h*w]
        f_similarity = softmax(f_similarity)  # normalize over the key axis

        f_attention = bmm(f_similarity, f_value)  # [b, h*w, c]
        # bring channels forward before restoring the spatial layout
        f_attention = transpose(f_attention, (0, 2, 1))  # [b, c, h*w]
        f_attention = reshape(f_attention, (b, c, h, w))

        out = self.gamma * f_attention + x
        return out
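For orientation, the standard composition these reshapes implement, traced in NumPy with illustrative sizes (b=2, c=8, h=w=4):

import numpy as np

b, c, h, w = 2, 8, 4, 4
f_query = np.random.rand(b, h * w, c)   # after reshape + transpose
f_key = np.random.rand(b, c, h * w)     # after reshape
f_value = np.random.rand(b, h * w, c)   # after reshape + transpose

sim = f_query @ f_key                   # [b, h*w, h*w]
sim = np.exp(sim) / np.exp(sim).sum(-1, keepdims=True)  # softmax over keys

out = sim @ f_value                     # [b, h*w, c]
out = out.transpose(0, 2, 1).reshape(b, c, h, w)
print(out.shape)                        # (2, 8, 4, 4)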
Example No. 10
    def forward(self, queries, keys, values, attn_bias, cache=None):
        # compute q, k, v
        q, k, v = self._prepare_qkv(queries, keys, values, cache)

        # scaled dot-product attention
        product = layers.matmul(
            x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
        if attn_bias is not None:
            product += attn_bias
        weights = layers.softmax(product)
        if self.dropout_rate:
            weights = layers.dropout(
                weights, dropout_prob=self.dropout_rate, is_test=False)

        out = layers.matmul(weights, v)

        # combine heads
        out = layers.transpose(out, perm=[0, 2, 1, 3])
        out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.proj_fc(out)
        return out
Example No. 11
    def forward(self, queries, keys, values, attn_bias, past_cache):
        assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3
        # bsz, q_len, q_dim = queries.shape
        # bsz, k_len, k_dim = keys.shape
        # bsz, v_len, v_dim = values.shape
        # assert k_len == v_len

        q = self.q(queries)
        k = self.k(keys)
        v = self.v(values)

        cache = (k, v)
        if past_cache is not None:
            cached_k, cached_v = past_cache
            k = L.concat([cached_k, k], 1)
            v = L.concat([cached_v, v], 1)

        q = L.transpose(L.reshape(q, [0, 0, self.n_head, q.shape[-1] // self.n_head]),
                        [0, 2, 1, 3])  # [batch, head, seq, dim]
        k = L.transpose(L.reshape(k, [0, 0, self.n_head, k.shape[-1] // self.n_head]),
                        [0, 2, 1, 3])  # [batch, head, seq, dim]
        v = L.transpose(L.reshape(v, [0, 0, self.n_head, v.shape[-1] // self.n_head]),
                        [0, 2, 1, 3])  # [batch, head, seq, dim]

        q = L.scale(q, scale=self.d_key ** -0.5)
        score = L.matmul(q, k, transpose_y=True)
        if attn_bias is not None:
            score += attn_bias
        score = L.softmax(score, use_cudnn=True)
        score = self.dropout(score)

        out = L.matmul(score, v)
        out = L.transpose(out, [0, 2, 1, 3])
        out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]])

        out = self.o(out)
        return out, cache
Example No. 12
    def forward(self, queries, keys, values, attn_bias, cache=None):
        # compute q, k, v
        keys = queries if keys is None else keys
        values = keys if values is None else values
        q = self.q_fc(queries)
        k = self.k_fc(keys)
        v = self.v_fc(values)
        # split head
        q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
        q = layers.transpose(x=q, perm=[0, 2, 1, 3])
        k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
        k = layers.transpose(x=k, perm=[0, 2, 1, 3])
        v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
        v = layers.transpose(x=v, perm=[0, 2, 1, 3])

        if cache is not None:
            cache_k, cache_v = cache["k"], cache["v"]
            k = layers.concat([cache_k, k], axis=2)
            v = layers.concat([cache_v, v], axis=2)
            cache["k"], cache["v"] = k, v
        # scaled dot-product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.d_model**-0.5)
        if attn_bias is not None:
            product += attn_bias
        weights = layers.softmax(product)
        if self.dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=self.dropout_rate,
                                     is_test=False)
        # this matmul must run regardless of whether dropout was applied
        out = layers.matmul(weights, v)
        out = layers.transpose(out, perm=[0, 2, 1, 3])
        out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
        out = self.proj_fc(out)
        return out
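A conceptual NumPy sketch of the KV-cache update above: at each decoding step the new key/value (one time step) is appended along axis 2, so attention sees the whole history.

import numpy as np

cache = {"k": np.zeros((2, 8, 0, 64)), "v": np.zeros((2, 8, 0, 64))}
for step in range(3):
    new_k = np.random.rand(2, 8, 1, 64)  # [batch, n_head, 1, d_key]
    new_v = np.random.rand(2, 8, 1, 64)
    cache["k"] = np.concatenate([cache["k"], new_k], axis=2)
    cache["v"] = np.concatenate([cache["v"], new_v], axis=2)
print(cache["k"].shape)  # (2, 8, 3, 64)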
Example No. 13
        def scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                         dropout_rate):
            """
            Scaled Dot-Product Attention
            """
            scaled_q = layers.scale(x=q, scale=d_key**-0.5)
            product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
            if attn_bias is not None:
                product += attn_bias
            weights = layers.softmax(product)

            # memorize the weights in block
            self.model['blocks'][curr_block_id]['multi_head_attention'][
                'softmax'] = weights

            if dropout_rate and self.is_training:
                weights = layers.dropout(
                    weights,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=False)

            out = layers.matmul(weights, v)
            return out
Example No. 14
    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        q: (-1, 16, 80, 64)
        k: (-1, 16, 80, 64)
        v: (-1, 16, 80, 64)
        attn_bias: (-1, 16, 80, 80)
        """
        scaled_q = layers.scale(x=q, scale=d_key ** -0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

        if attn_bias is not None:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights,
                dropout_prob=dropout_rate,
                dropout_implementation="upscale_in_train",
                seed=seed,  # `seed` is assumed to be defined in the enclosing scope
                is_test=False)
        out = layers.matmul(weights, v)
        # out: (-1, 16, 80, 64)
        return out
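A numeric note on dropout_implementation="upscale_in_train" (illustrative NumPy): kept activations are scaled by 1/(1-p) during training, so no rescaling is needed at inference time.

import numpy as np

p = 0.25
x = np.ones(8)
keep = (np.random.rand(8) >= p).astype('float64')
y = x * keep / (1 - p)  # E[y] == x, so the test-time path can be the identity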
Example No. 15
    def forward(self, outputs, target_sizes):
        """
        Perform the computation.

        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each image.
                          For evaluation, this must be the original image size (before any data augmentation).
                          For visualization, this should be the image size after data augmentation, but before padding.
        """
        out_logits, out_bbox = outputs["pred_logits"], outputs["pred_boxes"]

        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2

        prob = L.softmax(out_logits, -1)  # [bs, num_queries, num_classes + 1]
        labels = L.argmax(prob, axis=-1)  # [bs, num_queries]
        scores = L.reduce_max(prob, dim=-1)  # [bs, num_queries]

        # convert to [x0, y0, x1, y1] format
        bs, num_queries, _ = out_bbox.shape
        out_bbox = L.reshape(out_bbox, (-1, 4))
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        boxes = L.reshape(boxes, (bs, num_queries, 4))
        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes[:, 0], target_sizes[:, 1]
        scale_fct = L.stack([img_w, img_h, img_w, img_h], 1)  # [bs, 4]
        scale_fct = L.expand(L.unsqueeze(scale_fct, [1]), (1, num_queries, 1))
        boxes = boxes * scale_fct

        results = [{
            'scores': s,
            'labels': l,
            'boxes': b
        } for s, l, b in zip(scores.numpy(), labels.numpy(), boxes.numpy())]

        return results
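For reference, the cxcywh-to-xyxy conversion used above can be written in NumPy as follows (a sketch of the standard formula, not the project's box_ops implementation):

import numpy as np

def box_cxcywh_to_xyxy(b):
    """b: [N, 4] boxes as (center_x, center_y, width, height)."""
    cx, cy, w, h = b[:, 0], b[:, 1], b[:, 2], b[:, 3]
    return np.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], axis=-1)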
Example No. 16
    def forward(self, queries, keys=None, values=None, mask=None):
        keys = queries if keys is None else keys
        values = keys if values is None else values
        q = self.q_proj(queries)
        # the scraped original projected keys and values with q_proj as well,
        # which breaks when d_key != d_value; k_proj/v_proj are presumably meant
        k = self.k_proj(keys)
        v = self.v_proj(values)
        q = layers.transpose(layers.reshape(q, shape=[0, 0, self.n_head, self.d_key]), [0, 2, 1, 3])
        k = layers.transpose(layers.reshape(k, shape=[0, 0, self.n_head, self.d_key]), [0, 2, 1, 3])
        v = layers.transpose(layers.reshape(v, shape=[0, 0, self.n_head, self.d_value]), [0, 2, 1, 3])
        scaled_q = layers.scale(x=q, scale=self.d_key ** -0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if mask is not None:
            product -= (1 - layers.transpose(layers.unsqueeze(mask, 1), [0, 1, 3, 2])) * 1e10
        weights = layers.softmax(product)
        if self.dropout_rate:
            weights = layers.dropout(
                weights,
                dropout_prob=self.dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=not self.training)
        out = layers.matmul(weights, v)
        out = layers.reshape(layers.transpose(out, [0, 2, 1, 3]), [0, 0, self.d_value * self.n_head])

        return out
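The masking line above uses the standard additive trick; a numeric illustration in NumPy:

import numpy as np

scores = np.array([1.0, 2.0, 3.0])
mask = np.array([1.0, 1.0, 0.0])        # 0 marks a padded position
masked = scores - (1 - mask) * 1e10     # padded scores become hugely negative
e = np.exp(masked - masked.max())
print((e / e.sum()).round(4))           # padded position gets ~0 probability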
Example No. 17
def knowledge_seq2seq(config):
    """Knowledge-grounded seq2seq."""
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    enc_input = layers.data(name="enc_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input",
                             shape=[1],
                             dtype='int64',
                             lod_level=1)  #goal_input --> x
    cue_input = layers.data(name="cue_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #cue_input --> kg
    #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask',
                              shape=[-1, 1],
                              dtype='float32')
    tar_input = layers.data(name='tar_input',
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc1")
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out,
                                      shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out,
                                       size=rnn_hidden_size * 2,
                                       bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask',
                                shape=[-1],
                                dtype='float32')
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, last_mask=cue_last_mask, name="knowledge_enc")

    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden,
                              axes=[0],
                              starts=[0],
                              ends=[1])
    cue_memory = layers.reshape(cue_memory,
                                shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)

    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                        dropout=0.0, batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out,
                                         shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out,
                                          size=rnn_hidden_size * 2,
                                          bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get attention
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out,
                                    axes=[0],
                                    starts=[0],
                                    ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query,
                                      shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(target_query,
                                                  cue_memory,
                                                  mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask",
                                  shape=[-1, 1],
                                  dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out,
                                                  pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=0.0,
                        name="decoder_gru_unit")

    cue_gru_unit = GRU_unit(hidden_size + hidden_size,
                            hidden_size,
                            num_layers=num_layers,
                            dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
            bow_logits = fc(knowledge,
                            hidden_size,
                            hidden_size,
                            name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits,
                            hidden_size,
                            tgt_vocab_size,
                            name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(name='bow_label',
                                    shape=[-1, config.max_len],
                                    dtype='int64')
            bow_mask = layers.data(name="bow_mask",
                                   shape=[-1, config.max_len],
                                   dtype='float32')

            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits,
                                            bow_label,
                                            soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input",
                                shape=[-1, 1, 1],
                                dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')

        dec_knowledge = weight_target

        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out,
                                            shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out,
                                             size=rnn_hidden_size * 2,
                                             bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size, hidden_size, num_layers,
                         enc_memory, enc_memory_mask, dec_knowledge, vocab_size,
                         init_hidden=dec_init_hidden, mask=dec_mask, dropout=config.dropout)

        target_label = layers.data(name='target_label',
                                   shape=[-1, 1],
                                   dtype='int64')
        target_mask = layers.data(name='target_mask',
                                  shape=[-1, 1],
                                  dtype='float32')

        decoder_logits = layers.reshape(decoder_logits,
                                        shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        nll_loss = layers.cross_entropy(decoder_logits,
                                        target_label,
                                        soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) -
                                   prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        kl_and_nll_factor = layers.data(name='kl_and_nll_factor',
                                        shape=[1],
                                        dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id,
                                     dtype='int64')

        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        init_score_np = np.ones([beam_size * batch_size],
                                dtype='float32') * -INF

        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0

        pre_score = layers.assign(init_score_np)

        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size

        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask,
                                       shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge,
                                       shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge,
            shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden,
                                         shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit,
                             dec_emb, dec_init_hidden, input_size, hidden_size,
                             init_enc_memory, init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out,
                                    dropout_prob=config.dropout,
                                    is_test=True)
            rnnout = fc(rnnout,
                        output_in_size,
                        hidden_size,
                        name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)),
                        axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(log_softmax_output,
                                                            pre_score,
                                                            axis=0)

            log_softmax_output = layers.reshape(log_softmax_output,
                                                shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output,
                                                 k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            vocab_var = layers.fill_constant([1],
                                             dtype='int64',
                                             value=vocab_size)
            new_token = topk_index % vocab_var

            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)

            eos_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids),
                                 dtype='float32')

            topk_score += eos_eq * -100000000.0

            unk_eq = layers.cast(layers.equal(new_token, unk_ids),
                                 dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
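A NumPy check of the KL term computed in the training branch above: KL(posterior || prior) = sum(post * (log post - log prior)), with the small epsilons guarding log(0).

import numpy as np

posterior = np.array([0.7, 0.2, 0.1])
prior = np.array([0.5, 0.3, 0.2])
kl = (posterior * (np.log(posterior + 1e-10) - np.log(prior + 1e-10))).sum()
print(kl)  # ~0.085, non-negative as expected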
Example No. 18
    def _get_bboxes_single(self,
                           cls_scores,
                           bbox_preds,
                           mlvl_points,
                           img_shape,
                           scale_factor,
                           rescale=False,
                           with_nms=True):
        # each element of mlvl_points is [rows * cols, 3]:
        # (cell top-left x, cell top-left y, cell side length)
        nms_cfg = self.nms_cfg
        assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
        mlvl_bboxes = []
        mlvl_scores = []
        # iterate over each FPN output level
        for i_lvl, (cls_score, bbox_pred, points) in enumerate(
                zip(cls_scores, bbox_preds, mlvl_points)):
            # cls_score.shape = [80, h, w]
            # bbox_pred.shape = [ 4, h, w]
            # points.shape    = [h*w, 3]   (cell top-left x, cell top-left y, cell side length)
            cls_score = L.transpose(cls_score, [1, 2, 0])              # [h, w, 80]
            cls_score = L.reshape(cls_score, (-1, self.num_classes))   # [h*w, 80]
            if self.use_sigmoid_cls:
                scores = L.sigmoid(cls_score)   # [h*w, 80]
            else:
                scores = L.softmax(cls_score)
            bbox_pred = L.transpose(bbox_pred, [1, 2, 0])   # [h, w, 4]
            bbox_pred = L.reshape(bbox_pred, (-1, 4))       # [h*w, 4]
            nms_top_k = nms_cfg.get('nms_top_k', -1)
            if nms_top_k > 0 and scores.shape[0] > nms_top_k:
                if self.use_sigmoid_cls:
                    max_scores = L.reduce_max(scores, dim=1)
                else:
                    # remind that we set FG labels to [0, num_class-1]
                    # since mmdet v2.0; BG cat_id: num_class
                    # (Paddle translation of: max_scores, _ = scores[:, :-1].max(dim=1))
                    max_scores = L.reduce_max(scores[:, :-1], dim=1)
                _, topk_inds = L.topk(max_scores, k=nms_top_k)
                scores = L.gather(scores, topk_inds)  # [M, 80]
                points = L.gather(points, topk_inds)  # [M, 3]   cell xy coords and side length
                bbox_pred = L.gather(bbox_pred, topk_inds)  # [M, 4]

            # [M, 4]  cell top-left xy coordinates, repeated twice
            bbox_pos_center = L.concat([points[:, :2], points[:, :2]], axis=1)

            # [M, 4]  final predicted boxes (x1y1x2y2) = bbox_pred * cell side length + cell top-left coords
            bboxes = bbox_pred * self.fpn_stride[i_lvl] + bbox_pos_center

            x1 = L.clip(bboxes[:, 0], 0.0, img_shape[1])
            y1 = L.clip(bboxes[:, 1], 0.0, img_shape[0])
            x2 = L.clip(bboxes[:, 2], 0.0, img_shape[1])
            y2 = L.clip(bboxes[:, 3], 0.0, img_shape[0])
            bboxes = paddle.stack([x1, y1, x2, y2], axis=-1)  # [M, 4]
            mlvl_bboxes.append(bboxes)
            mlvl_scores.append(scores)
        mlvl_scores = L.concat(mlvl_scores, axis=0)  # [M2, 80]  scores from all FPN levels gathered together
        mlvl_bboxes = L.concat(mlvl_bboxes, axis=0)  # [M2, 4]   boxes (x1y1x2y2) from all FPN levels gathered together
        if rescale:
            scale_factor_ = paddle.to_tensor(scale_factor)
            mlvl_bboxes /= scale_factor_  # [M2, 4]   predicted boxes (x1y1x2y2)

        pred_scores = L.unsqueeze(mlvl_scores, axes=0)  # [1, M2, 80]
        pred_boxes = L.unsqueeze(mlvl_bboxes, axes=0)   # [1, M2,  4], final boxes
        pred_scores = L.transpose(pred_scores, perm=[0, 2, 1])  # [1, 80, M2], final scores

        # nms
        pred = None
        i = 0
        nms_cfg = copy.deepcopy(self.nms_cfg)
        nms_type = nms_cfg.pop('nms_type')
        if nms_type == 'matrix_nms':
            pred = fluid.layers.matrix_nms(pred_boxes[i:i+1, :, :], pred_scores[i:i+1, :, :], background_label=-1, **nms_cfg)
        elif nms_type == 'multiclass_nms':
            pred = fluid.layers.multiclass_nms(pred_boxes[i:i+1, :, :], pred_scores[i:i+1, :, :], background_label=-1, **nms_cfg)
        return pred
Example No. 19
    def beam_search():
        max_len = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=max_out_len)
        step_idx = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=0)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        # array states will be stored for each step.
        ids = layers.array_write(start_tokens, step_idx)
        scores = layers.array_write(init_scores, step_idx)
        # cell states will be overwritten at each step.
        # caches contains states of history steps to reduce redundant
        # computation in decoder.
        caches = [{
            "k": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0),
            "v": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0)
        } for i in range(n_layer)]
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            # sequence_expand can gather sequences according to lod thus can be
            # used in beam search to sift states corresponding to selected ids.
            pre_src_attn_bias = layers.sequence_expand(
                x=trg_src_attn_bias, y=pre_scores)
            pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
            pre_caches = [{
                "k": layers.sequence_expand(
                    x=cache["k"], y=pre_scores),
                "v": layers.sequence_expand(
                    x=cache["v"], y=pre_scores),
            } for cache in caches]
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_enc_output,  # can't use pre_ids here since it has lod
                    value=1,
                    shape=[-1, 1],
                    dtype=pre_ids.dtype),
                y=layers.increment(
                    x=step_idx, value=1.0, in_place=False),
                axis=0)
            logits = wrap_decoder(
                trg_vocab_size,
                max_in_len,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                dropout_rate,
                weight_sharing,
                dec_inputs=(
                    pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape,
                    slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape,
                    src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
                enc_output=pre_enc_output,
                caches=pre_caches)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(logits), k=beam_size)
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores),
                y=layers.reshape(
                    pre_scores, shape=[-1]),
                axis=0)
            # beam_search op uses lod to distinguish branches.
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            selected_ids, selected_scores = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx)
            layers.increment(x=step_idx, value=1.0, in_place=True)
            # update states
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            layers.assign(pre_enc_output, enc_output)
            for i in range(n_layer):
                layers.assign(pre_caches[i]["k"], caches[i]["k"])
                layers.assign(pre_caches[i]["v"], caches[i]["v"])
            layers.assign(
                layers.elementwise_add(
                    x=slf_attn_pre_softmax_shape,
                    y=attn_pre_softmax_shape_delta),
                slf_attn_pre_softmax_shape)
            layers.assign(
                layers.elementwise_add(
                    x=slf_attn_post_softmax_shape,
                    y=attn_post_softmax_shape_delta),
                slf_attn_post_softmax_shape)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores
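How the beam scores accumulate above, in NumPy terms: the log of each step's top-k probabilities is added to the running per-beam score before layers.beam_search prunes (illustrative numbers):

import numpy as np

pre_scores = np.array([-0.5, -0.7])              # two live beams
topk_probs = np.array([[0.6, 0.3], [0.5, 0.4]])  # per-beam top-k softmax probs
accu_scores = np.log(topk_probs) + pre_scores[:, None]
print(accu_scores.round(3))  # candidates ranked by accumulated log-prob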
Example No. 20
input_sequence = layers.data(name="story", dtype="int64", shape=[-1, story_maxlen, 1])
question = layers.data(name="query", dtype="int64", shape=[-1, query_maxlen, 1])
true_answer = layers.data(name="true_answer", dtype="int64", shape=[-1, 1])

# memory encoders: the story is embedded twice, once for addressing (m) and
# once for the response (c), as in the Keras babi-memnn example
input_encoder_m = layers.embedding(input=input_sequence, size=[vocab_size, 64])
input_encoder_m = layers.dropout(input_encoder_m, 0.3)

input_encoder_c = layers.embedding(input=input_sequence, size=[vocab_size, query_maxlen])
input_encoder_c = layers.dropout(input_encoder_c, 0.3)

# the question encoder embeds the question
question_encoder = layers.embedding(input=question, size=[vocab_size, 64])
question_encoder = layers.dropout(question_encoder, 0.3)

# match story and question: [batch, story_maxlen, query_maxlen]
match = layers.matmul(input_encoder_m, question_encoder, transpose_y=True)
match = layers.softmax(match, axis=-1)

# combine with the second story encoding and permute to
# [batch, query_maxlen, story_maxlen]
response = layers.elementwise_add(match, input_encoder_c)
response = layers.transpose(response, perm=[0, 2, 1])

answer = layers.concat([response, question_encoder], axis=-1)

_, _, answer = basic_lstm(answer, None, None, 32)
answer = layers.transpose(answer, perm=(1, 0, 2))
answer = layers.reshape(answer, shape=[-1, 32])

answer = layers.dropout(answer, 0.3)
answer = layers.fc(answer, size=vocab_size, act="softmax")

loss = layers.cross_entropy(answer, true_answer)
loss = layers.reduce_mean(loss)

optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.01)
optimizer.minimize(loss)
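A minimal sketch of actually running the program built above (assumes a standard fluid setup; feed names and the dummy batch of zeros are illustrative, and must match the layers.data definitions):

import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

feed = {
    "story": np.zeros((32, story_maxlen, 1), dtype="int64"),
    "query": np.zeros((32, query_maxlen, 1), dtype="int64"),
    "true_answer": np.zeros((32, 1), dtype="int64"),
}
loss_val, = exe.run(fluid.default_main_program(), feed=feed, fetch_list=[loss])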
Example No. 21
    def forward(self, ref_image, ref_label, label, k):
        """
        Encode the reference image to get features for weight generation.

        Args:

            ref_image ((NxK)x3xHxW): Reference images.
            ref_label ((NxK)xCxHxW): Reference labels.
            label (NxCxHxW): Target label.
            k (int): Number of reference images.
        
        Returns: (tuple)
            - x (NxC2xH2xW2): Encoded features from reference images
              for the main branch (as input to the decoder).
            - encoded_ref (list of Variable): Encoded features from reference
              images for the weight generation branch.
            - attention (Nx(KxH1xW1)x(H1xW1)): Attention maps.
            - atn_vis (1x1xH1xW1): Visualization for attention scores.
            - ref_idx (Nx1): Index of which reference image to use.
        """
        if self.concat_ref_label:
            # concat reference label map and image together for encoding.
            concat_ref = L.concat([ref_image, ref_label], axis=1)
            x = self.ref_img_first(concat_ref)
        elif self.mul_ref_label:
            x = self.ref_img_first(ref_image)
            x_label = self.ref_label_first(ref_label)
        else:
            x = self.ref_img_first(ref_image)

        atn_ref_image = atn_ref_label = None
        atn = atn_vis = ref_idx = None
        for i in range(self.num_downsamples):
            x = getattr(self, 'ref_img_down_' + str(i))(x)
            if self.mul_ref_label:
                x_label = getattr(self, 'ref_label_down_' + str(i))(x_label)
            # Preserve reference for attention module.
            if k > 1 and i == self.num_downsample_atn - 1:
                x, atn, atn_vis = self.attention_module(x, label, ref_label)
                if self.mul_ref_label:
                    x_label, _, _ = self.attention_module(
                        x_label, None, None, atn)

                atn_sum = L.reshape(atn,
                                    (label.shape[0], k, -1))  # [b, k, h*w*h*w]
                atn_sum = L.reduce_sum(atn_sum, dim=2)
                ref_idx = L.argmax(atn_sum, axis=1)

        # Get all corresponding layers in the encoder output for generating
        # weights in corresponding layers.
        encoded_image_ref = [x]
        if self.mul_ref_label:
            encoded_ref_label = [x_label]

        for i in reversed(range(self.num_downsamples)):  # 4 -> 0
            conv = getattr(self, 'ref_img_up_' + str(i))(encoded_image_ref[-1])
            encoded_image_ref.append(conv)
            if self.mul_ref_label:
                conv_label = getattr(self, 'ref_label_up_' + str(i))(
                    encoded_ref_label[-1])
                encoded_ref_label.append(conv_label)

        if self.mul_ref_label:
            encoded_ref = []
            for i in range(len(encoded_image_ref)):
                conv, conv_label = encoded_image_ref[i], encoded_ref_label[i]
                b, c, h, w = conv.shape
                conv_label = L.softmax(conv_label, axis=1)
                conv_label = L.reshape(conv_label, (b, 1, c, h * w))
                # conv_label = L.expand(conv_label, (1, c, 1, 1))
                conv = L.reshape(conv, (b, c, 1, h * w))
                # conv = L.expand(conv, (1, 1, c, 1))
                conv_prod = conv * conv_label  # (b, c, c, h * w)
                conv_prod = L.reduce_sum(conv_prod, dim=3,
                                         keep_dim=True)  # (b, c, c, 1)
                encoded_ref.append(conv_prod)
        else:
            encoded_ref = encoded_image_ref

        encoded_ref = encoded_ref[::-1]  # level0 -> level4
        return x, encoded_ref, atn, atn_vis, ref_idx
Example No. 22
    def scaled_dot_product_attention_with_sen_norm(q, k, v, attn_bias, d_key,
                                                   dropout_rate, attn_s):
        """
        Scaled Dot-Product Attention with sentence-level normalize
        :param q: (batch_size, n_head, tgt_len, dim_per_head)
        :param k: (batch_size, n_blocks, n_head, n_tokens, dim_per_head)
        :param v: (batch_size, n_blocks, n_head, n_tokens, dim_per_head)
        :param attn_bias: (batch_size, n_blocks, n_head, tgt_len, n_tokens)
        :param attn_s:  [batch, n_heads, query_len, key_s_len]
        :return:
        """
        # print("q.shape = %s" % str(q.shape))
        # (batch_size, n_block, n_head, tgt_len, dim_per_head)
        q = layers.expand(layers.unsqueeze(q, axes=[1]),
                          expand_times=[1, key_s_len, 1, 1, 1])
        # print("q.shape = %s" % str(q.shape))
        # (batch_size*n_block, n_head, tgt_len, dim_per_head)
        # q = layers.reshape(q, shape=[-1, n_head, query_len, d_key])
        # print("q.shape = %s" % str(q.shape))

        scaled_q = layers.scale(x=q, scale=d_key**-0.5)

        # (batch_size, n_block, n_head, tgt_len, n_token)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

        if attn_bias is not None:
            product += attn_bias  # (batch_size, n_block, n_head, tgt_len, n_token)

        weights = layers.softmax(
            product)  # (batch_size, n_block, n_head, tgt_len, n_token)

        # attn_w = layers.reshape(weights, shape=[batch_size, key_s_len, n_head, query_len, -1])
        # (batch_size, n_head, tgt_len, n_block, n_token)
        attn_w = layers.transpose(weights, perm=[0, 2, 3, 1, 4])
        # (batch_size, n_head, tgt_len, n_block, n_token)
        attn_w = layers.elementwise_mul(attn_w,
                                        layers.unsqueeze(attn_s, axes=[-1]),
                                        axis=0)
        # (batch_size, n_head, tgt_len, n_block*n_token)
        attn_w = layers.reshape(attn_w,
                                shape=[batch_size, n_head, query_len, -1])

        if dropout_rate:
            attn_w = layers.dropout(  # (batch_size, n_head, tgt_len, n_block*n_token)
                attn_w,
                dropout_prob=dropout_rate,
                seed=dropout_seed,
                dropout_implementation="upscale_in_train",
                is_test=False)

        # values_w = layers.reshape(v, shape=[batch_size, key_s_len, n_head, -1, d_value])
        values_w = layers.transpose(v, perm=[0, 2, 1, 3, 4])
        # (batch_size, n_head, n_block*n_token, dim_per_head)
        values_w = layers.reshape(values_w,
                                  shape=[batch_size, n_head, -1, d_value])

        out = layers.matmul(
            attn_w, values_w)  # (batch_size, n_head, tgt_len, dim_per_head)

        # Project back to the model size.
        combine_out = __combine_heads_word(
            out)  # (batch_size, query_len, emb_dim)

        proj_out = layers.fc(
            input=combine_out,  # (batch_size, tgt_len, model_dim)
            size=d_model,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(name=name + '_word_fc.w_0',
                                       initializer=param_initializer),
            bias_attr=name + '_word_fc.b_0')

        return proj_out, attn_w
Example No. 23
    def _build_decoder(self,
                       z_mean=None,
                       z_log_var=None,
                       enc_output=None,
                       mode='train',
                       beam_size=10):
        dec_input = layers.dropout(self.tar_emb,
                                   dropout_prob=self.dec_dropout_in,
                                   dropout_implementation="upscale_in_train")

        # `output_layer` will be used within BeamSearchDecoder
        output_layer = lambda x: layers.fc(x,
                                           size=self.tar_vocab_size,
                                           num_flatten_dims=len(x.shape) - 1,
                                           name="output_w")

        # `sample_output_layer` samples an id from the logits distribution instead of argmax(logits)
        # it will be used within BeamSearchDecoder
        sample_output_layer = lambda x: layers.unsqueeze(
            fluid.one_hot(layers.unsqueeze(
                layers.sampling_id(layers.softmax(
                    layers.squeeze(output_layer(x), [1])),
                                   dtype='int'), [1]),
                          depth=self.tar_vocab_size), [1])

        if mode == 'train':
            latent_z = self._sampling(z_mean, z_log_var)
        else:
            latent_z = layers.gaussian_random_batch_size_like(
                self.tar, shape=[-1, self.latent_size])
        dec_first_hidden_cell = layers.fc(latent_z,
                                          2 * self.hidden_size *
                                          self.num_layers,
                                          name='fc_hc')
        dec_first_hidden, dec_first_cell = layers.split(
            dec_first_hidden_cell, 2)
        if self.num_layers > 1:
            dec_first_hidden = layers.split(dec_first_hidden, self.num_layers)
            dec_first_cell = layers.split(dec_first_cell, self.num_layers)
        else:
            dec_first_hidden = [dec_first_hidden]
            dec_first_cell = [dec_first_cell]
        dec_initial_states = [[h, c]
                              for h, c in zip(dec_first_hidden, dec_first_cell)
                              ]
        dec_cell = DecoderCell(self.num_layers, self.hidden_size, latent_z,
                               self.param_attr_initializer,
                               self.param_attr_scale, self.dec_dropout_out)

        if mode == 'train':
            dec_output, _ = rnn(cell=dec_cell,
                                inputs=dec_input,
                                initial_states=dec_initial_states,
                                sequence_length=self.tar_sequence_length)
            dec_output = output_layer(dec_output)

            return dec_output
        elif mode == 'greedy':
            start_token = 1
            end_token = 2
            max_length = 100
            beam_search_decoder = BeamSearchDecoder(
                dec_cell,
                start_token,
                end_token,
                beam_size=1,
                embedding_fn=self.tar_embeder,
                output_fn=output_layer)
            outputs, _ = dynamic_decode(beam_search_decoder,
                                        inits=dec_initial_states,
                                        max_step_num=max_length)
            return outputs

        elif mode == 'sampling':
            start_token = 1
            end_token = 2
            max_length = 100
            beam_search_decoder = BeamSearchDecoder(
                dec_cell,
                start_token,
                end_token,
                beam_size=1,
                embedding_fn=self.tar_embeder,
                output_fn=sample_output_layer)

            outputs, _ = dynamic_decode(beam_search_decoder,
                                        inits=dec_initial_states,
                                        max_step_num=max_length)
            return outputs
        else:
            print("mode not supported:", mode)
Example No. 24
    def infilling_decode(self):
        if self.task_type == "dialog":
            emb_num = 4
        else:
            emb_num = 3
        input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                       [[-1, self.max_seq_len, self.max_seq_len]]
        input_dtypes = ['int64'] * emb_num + ['float32']
        input_lod_levels = [0] * emb_num + [0]

        shapes = input_shapes + [[-1, self.max_seq_len, 1],
                                 [-1, self.max_seq_len, 1], [-1, 1], [-1],
                                 [-1, 1, self.max_seq_len], [-1, 1]]
        dtypes = input_dtypes + [
            'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
        ]
        lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

        inputs = self.to_ternsor(shapes, dtypes, lod_levels)
        pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                      capacity=50,
                                                      iterable=False)

        emb_ids = {}
        for key, value in zip(self.emb_keys, inputs[:emb_num]):
            emb_ids[key] = value

        input_mask = inputs[emb_num]
        tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
            -6:]

        ernie = ErnieModel(emb_ids=emb_ids,
                           input_mask=input_mask,
                           config=self.ernie_config,
                           use_fp16=self.use_fp16,
                           task_type=self.task_type,
                           decoding=True,
                           gather_idx=parent_idx)

        max_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.max_dec_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=tgt_ids.dtype,
                                        value=0,
                                        force_cpu=True)
        pos_idx = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=1,
                                       force_cpu=True)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
        pos_biases = layers.array_write(layers.reshape(tgt_pos, (-1, 1)),
                                        step_idx)
        scores = layers.array_write(init_scores, step_idx)
        tgt_masks = layers.array_write(tgt_input_mask, step_idx)

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)
            tmp_mask = layers.array_read(tgt_masks, i=step_idx)

            def gen_batch_like(value,
                               dtype="int64",
                               shape=[-1, 1, 1],
                               is_scalar=True):
                if is_scalar:
                    return layers.fill_constant_batch_size_like(
                        input=parent_idx,
                        value=value,
                        shape=shape,
                        dtype=dtype)
                else:
                    return layers.elementwise_mul(
                        x=layers.fill_constant_batch_size_like(
                            input=parent_idx,
                            value=1,
                            shape=shape,
                            dtype=dtype),
                        y=value,
                        axis=0)

            tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
            append_0_mask = gen_batch_like(0.0, dtype=tmp_mask.dtype)
            append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
            tmp_mask = layers.concat([tmp_mask, append_1_mask], axis=2)
            pre_mask = layers.concat([tmp_mask, append_0_mask], axis=2)
            cur_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

            cur_ids = gen_batch_like(self.attn_id)
            pre_pos = gen_batch_like(step_idx, is_scalar=False)
            cur_pos = gen_batch_like(pos_idx, is_scalar=False)
            if self.continuous_position:
                pre_pos = pre_pos + pos_bias
                cur_pos = cur_pos + pos_bias

            dec_emb_ids = {
                "word_embedding": layers.concat([pre_ids, cur_ids], axis=1),
                "pos_embedding": layers.concat([pre_pos, cur_pos], axis=1)
            }
            if self.task_type == "dialog":
                role_ids = gen_batch_like(0)
                turn_ids = gen_batch_like(0)
                dec_emb_ids["role_embedding"] = layers.concat(
                    [role_ids, role_ids], axis=1)
                dec_emb_ids["turn_embedding"] = layers.concat(
                    [turn_ids, turn_ids], axis=1)
            else:
                sent_ids = gen_batch_like(self.tgt_type_id)
                dec_emb_ids["sent_embedding"] = layers.concat(
                    [sent_ids, sent_ids], axis=1)
            dec_mask = layers.concat([pre_mask, cur_mask], axis=1)

            dec_out = ernie.encode(dec_emb_ids,
                                   dec_mask,
                                   parent_idx,
                                   remove_query=True)
            fc_out = self.cal_logit(dec_out[:, 1:, :], None)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(fc_out), k=self.beam_size)
            pre_lenpen = layers.pow(
                (5.0 + layers.cast(step_idx, pre_scores.dtype)) / 6.0,
                self.length_penalty)
            cur_lenpen = layers.pow(
                (5.0 + layers.cast(pos_idx, pre_scores.dtype)) / 6.0,
                self.length_penalty)
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores * pre_lenpen,
                                                 axis=0) / cur_lenpen
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=self.beam_size,
                end_id=self.eos_idx,
                return_parent_idx=True)

            layers.increment(x=step_idx, value=1.0, in_place=True)
            layers.increment(x=pos_idx, value=1.0, in_place=True)
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(tmp_mask, i=step_idx, array=tgt_masks)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            layers.assign(gather_idx, parent_idx)
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

        graph_vars = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "data_ids": data_ids
        }

        for k, v in graph_vars.items():
            v.persistable = True

        return pyreader, graph_vars
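
The score update inside the while loop above is the GNMT-style length normalization (Wu et al., 2016). With step index t and length-penalty exponent \alpha = self.length_penalty:

    lp(t) = \left(\frac{5 + t}{6}\right)^{\alpha}, \qquad
    \mathrm{score}_t = \frac{\log p(y_t) + \mathrm{score}_{t-1}\, lp(t-1)}{lp(t)}

Multiplying `pre_scores` by `pre_lenpen` first undoes the previous step's normalization, so the running quantity stays a plain sum of log-probabilities that is re-normalized by `cur_lenpen` after every step.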
Ejemplo n.º 25
            if args.init_checkpoint is not None:
                print('loading checkpoint from %s' % args.init_checkpoint)
                sd, _ = FD.load_dygraph(args.init_checkpoint)
                model.set_dict(sd)

        test_batch_data = batchify(test_features, args.bsz, args.max_seqlen)
        if args.debug:
            print(len(test_batch_data))
            print(test_batch_data[0])
            token_ids, seg_ids, labels = test_batch_data[0]
            for r1, r2 in zip(token_ids[:5], seg_ids[:5]):
                print(r1)
                print(r2)
                print(convert_ids_to_tokens(tokenizer.vocab, r1))        
        y_pred = []
        with FD.base._switch_tracer_mode_guard_(is_train=False):
            model.eval()
            for step, d in enumerate(tqdm(test_batch_data, desc='predicting')):
                ids, sids, _ = d
                ids, sids = FD.to_variable(ids), FD.to_variable(sids)
                _, logits = model(ids, sids)
                #print('\n'.join(map(str, logits.numpy().tolist())))
                y_pred += L.softmax(logits, -1).numpy().tolist()
                if args.debug and len(y_pred) > 5:
                    break

    print(len(y_pred), y_pred[:5])
    print(test_segs[:5])

    with open(args.save_path, 'wb') as f:
        pickle.dump({'segs': test_segs, 'y_pred': y_pred}, f)
Ejemplo n.º 26
    def forward(self, outputs, targets):
        """
        Performs the matching.

        Params:
            outputs: A dict that contains at least these entries:
                "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: A list of targets (len(targets) == batch_size), where each target is a dict containing:
                "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
                          ground-truth objects in the target) containing the class labels
                "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        with dg.no_grad():
            bs, num_queries, num_classes = outputs["pred_logits"].shape

            # We flatten to compute the cost matrices in a batch
            out_prob = L.reshape(
                outputs["pred_logits"],
                [-1, num_classes])  # [batch_size * num_queries, num_classes]
            out_prob = L.softmax(
                out_prob, axis=-1)  # [batch_size * num_queries, num_classes]
            out_bbox = L.reshape(outputs["pred_boxes"],
                                 [-1, 4])  # [batch_size * num_queries, 4]

            # Also concat the target labels and boxes
            tgt_ids = L.concat([v["labels"] for v in targets]).astype(
                "int64")  # [sum_i num_target_boxes_i]
            tgt_bbox = L.concat([v["boxes"] for v in targets]).astype(
                "float32")  # [sum_i num_target_boxes_i, 4]

            # Compute the classification cost. Contrary to the loss, we don't use the NLL,
            # but approximate it by 1 - proba[target class].
            # The 1 is a constant that doesn't change the matching, so it can be omitted.
            cost_class = -out_prob.numpy()[:, tgt_ids.numpy(
            )]  # [batch_size * num_queries, num_all_target_boxes]
            cost_class = dg.to_variable(cost_class)

            # Compute the L1 cost between boxes
            num_all_target_boxes = tgt_bbox.shape[0]
            expanded_out_bbox = L.expand(
                L.unsqueeze(out_bbox, [1]),
                [1, num_all_target_boxes, 1
                 ])  # [batch_size * num_queries, num_all_target_boxes, 4]
            expanded_tgt_bbox = L.expand(
                L.unsqueeze(tgt_bbox, [0]),
                [bs * num_queries, 1, 1
                 ])  # [batch_size * num_queries, num_all_target_boxes, 4]
            cost_bbox = F.loss.l1_loss(
                expanded_out_bbox, expanded_tgt_bbox, reduction='none'
            )  # [batch_size * num_queries, num_all_target_boxes, 4]
            cost_bbox = L.reduce_mean(
                cost_bbox,
                -1)  # [batch_size * num_queries, num_all_target_boxes]

            # Compute the giou cost between boxes
            cost_giou = -generalied_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                            box_cxcywh_to_xyxy(tgt_bbox))

            # Final cost matrix
            C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
            C = L.reshape(
                C, [bs, num_queries, -1
                    ])  # [batch_size, num_queries, num_all_target_boxes]

            sizes = [len(v["boxes"]) for v in targets]

            indices = [
                linear_sum_assignment(c[i].numpy())
                for i, c in enumerate(L.split(C, sizes, dim=-1))
            ]

            return [(dg.to_variable(i.astype("int64")),
                     dg.to_variable(j.astype("int64"))) for i, j in indices]
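
The per-image assignment itself is delegated to SciPy's Hungarian solver. A minimal sketch of what `linear_sum_assignment` returns for one batch element (the cost values below are made up):

import numpy as np
from scipy.optimize import linear_sum_assignment

# 3 predictions (rows) x 2 ground-truth objects (columns)
cost = np.array([[0.9, 0.1],
                 [0.4, 0.8],
                 [0.2, 0.3]])
row_ind, col_ind = linear_sum_assignment(cost)
print(row_ind, col_ind)  # [0 2] [1 0]: prediction 0 -> target 1, prediction 2 -> target 0

Prediction 1 is left unmatched, which is why len(index_i) == len(index_j) == min(num_queries, num_target_boxes) in the docstring above.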
Ejemplo n.º 27
def KL(pred, target):
    pred = L.log(L.softmax(pred))
    target = L.softmax(target)
    loss = L.kldiv_loss(pred, target)
    return loss
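
One caveat: `L.log(L.softmax(pred))` can underflow to -inf for strongly negative logits. Since `L.log_softmax` is available in this codebase (it is used in the soft cross-entropy example below), a numerically safer sketch of the same loss would be:

def KL(pred, target):
    # kldiv_loss expects log-probabilities as its first argument
    log_pred = L.log_softmax(pred, axis=-1)
    target_prob = L.softmax(target, axis=-1)
    return L.kldiv_loss(log_pred, target_prob)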
Ejemplo n.º 28
    def inference(self, model, inputs, outputs):
        """
        Run inference.

        Args:
            model(object): A generation model. It needs to implement `_generation_network` and `_calc_logits`.
            inputs(dict): Its key is input name(str) and its value is a Variable.
            outputs(dict): Its key is output name(str) and its value is a Variable.
        Returns:
            dict(str:Variable): Its key is output name(str) and its value is a Variable.
        """
        # prepare while loop
        max_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
        min_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
        step_idx = layers.fill_constant(
            shape=[1], dtype="int64", value=0, force_cpu=True)

        ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
        pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
        scores = layers.array_write(inputs["init_score"], step_idx)
        tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx)
        parent_idx = inputs["parent_idx"]

        if self.decoding_strategy == "beam_search":
            beam_size = self.beam_size
        else:
            beam_size = 1

        eos_penalty = np.zeros(self.vocab_size, dtype="float32")
        eos_penalty[self.eos_id] = -1e9
        eos_penalty = layers.assign(eos_penalty)

        token_penalty = np.zeros(self.vocab_size, dtype="float32")
        token_penalty[self.unk_id] = -1e9
        if self.mask_id >= 0:
            token_penalty[self.mask_id] = -1e9
        token_penalty = layers.assign(token_penalty)

        # start while loop
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx)
            dtype = tmp_tgt_generation_mask.dtype

            append_mask = layers.fill_constant_batch_size_like(
                    input=pre_ids,
                    value=1.0,
                    shape=[-1, 1, 1],
                    dtype=dtype)
            tmp_tgt_generation_mask = layers.concat([tmp_tgt_generation_mask, append_mask], axis=2)
            pre_mask = tmp_tgt_generation_mask = layers.gather(input=tmp_tgt_generation_mask, index=parent_idx)

            pre_sent = layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype)

            if self.continuous_position:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0) + pos_bias
            else:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0)

            if self.use_role:
                pre_role = layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=0,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype)
            else:
                pre_role = None

            dec_out, _ = model._generation_network(
                token_ids=pre_ids,
                type_ids=pre_sent,
                pos_ids=pre_pos,
                role_ids=pre_role,
                generation_mask=tmp_tgt_generation_mask,
                gather_idx=parent_idx)
            logits = model._calc_logits(dec_out)

            # ignore unk and mask token
            if self.ignore_unk:
                logits = layers.elementwise_add(logits, token_penalty, axis=1)

            # min dec length
            min_len_cond = layers.less_than(x=step_idx, y=min_len)
            def min_len_penalty():
                """Plus minimum length penalty."""
                return layers.elementwise_add(logits, eos_penalty, axis=1)
            def no_penalty():
                """No penalty."""
                return logits
            logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty)

            # get probs
            probs = layers.softmax(logits / self.temperature)

            if self.decoding_strategy == "beam_search":
                topk_scores, topk_indices = layers.topk(
                    input=probs, k=beam_size)
            else:
                if self.decoding_strategy.startswith("sampling"):
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                elif self.decoding_strategy.startswith("topk_sampling"):
                    topk_probs, _ = layers.topk(input=probs, k=self.topk)
                    ge_cond = layers.cast(
                        layers.greater_equal(
                            probs,
                            layers.unsqueeze(topk_probs[:, -1], [1])),
                        "float32")
                    old_probs = probs
                    probs = probs * ge_cond / layers.reduce_sum(topk_probs, dim=-1, keep_dim=True)
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                    probs = old_probs
                else:
                    raise ValueError(self.decoding_strategy)

                sampling_scores = layers.one_hot(
                    layers.unsqueeze(sampling_ids, [1]), probs.shape[1]
                )
                sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3
                topk_scores, topk_indices = layers.topk(
                    input=sampling_scores, k=1)

            pre_len = layers.cast(step_idx, "float32")
            layers.increment(x=step_idx, value=1.0, in_place=True)
            cur_len = layers.cast(step_idx, "float32")

            # update scores
            if self.length_average:
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len
            elif self.length_penalty > 0:
                pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
                cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp
            else:
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores, axis=0)
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)

            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            layers.assign(gather_idx, parent_idx)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=self.eos_id)

        predictions = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "token_ids": inputs["token_ids"],
            "data_id": inputs["data_id"]
        }
        return predictions
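
The `topk_sampling` branch above keeps only the probabilities that are >= the k-th largest one, renormalizes by the top-k mass, and samples from that truncated distribution. The same idea in plain NumPy (a sketch, independent of the fluid graph ops):

import numpy as np

def topk_sample(probs, k, rng=None):
    rng = rng or np.random.default_rng()
    kth = np.sort(probs)[-k]                      # k-th largest probability
    keep = probs >= kth                           # same >= test as the graph code
    truncated = probs * keep / probs[keep].sum()  # renormalize over the top k
    return rng.choice(len(probs), p=truncated)

probs = np.array([0.5, 0.3, 0.1, 0.07, 0.03])
print(topk_sample(probs, k=2))  # 0 or 1, drawn with renormalized probs 0.625 / 0.375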
Ejemplo n.º 29
def wrap_decoder(trg_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 embedding_sharing,
                 dec_inputs=None,
                 enc_output=None,
                 caches=None, is_train=True, params_type="normal"):
    """
    The wrapper assembles all the layers needed by the decoder.
    """
    if dec_inputs is None:
        # This is used to implement independent decoder program in inference.
        trg_word, reverse_trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \
            make_all_inputs(decoder_data_input_fields)
    else:
        trg_word, reverse_trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs

    dec_input = prepare_decoder(
        trg_word,
        trg_pos,
        trg_vocab_size,
        d_model,
        max_length,
        prepostprocess_dropout,
        word_emb_param_name=word_emb_param_names[0]
        if embedding_sharing else word_emb_param_names[1], 
        training=is_train,
        params_type=params_type)

    dec_output = decoder(
        dec_input,
        enc_output,
        trg_slf_attn_bias,
        trg_src_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        caches=caches)
    # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
    dec_output = layers.reshape(
        dec_output, shape=[-1, dec_output.shape[-1]], inplace=True)

    assert params_type == "fixed" or params_type == "normal" or params_type == "new"
    pre_name = "forwardforward"
    if params_type == "fixed":
        pre_name = "fixed_forwardfixed_forward"
    elif params_type == "new":
        pre_name = "new_forwardnew_forward"
    if weight_sharing and embedding_sharing:
        predict = layers.matmul(
            x=dec_output,
            y=fluid.default_main_program().global_block().var(
                pre_name + word_emb_param_names[0]),
            transpose_y=True)
    elif weight_sharing:
        predict = layers.matmul(
            x=dec_output,
            y=fluid.default_main_program().global_block().var(
                pre_name +  word_emb_param_names[1]),
            transpose_y=True)
    else:
        predict = layers.fc(input=dec_output,
                            size=trg_vocab_size,
                            bias_attr=False)
    if dec_inputs is None:
        # Return probs for independent decoder program.
        predict = layers.softmax(predict)
    return predict
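
When `weight_sharing` is enabled, the output projection reuses the transposed word-embedding table instead of learning a separate `fc` weight (the tied-embedding trick of Press & Wolf, 2017). With the flattened decoder states H of shape [batch * seq_len, d_model] and the embedding table E of shape [trg_vocab_size, d_model]:

    \mathrm{logits} = H E^{\top}, \quad H \in \mathbb{R}^{(B \cdot T) \times d_{\mathrm{model}}}, \; E \in \mathbb{R}^{V \times d_{\mathrm{model}}}

which is exactly what the `layers.matmul(..., transpose_y=True)` calls above compute.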
Ejemplo n.º 30
    def net(self, class_dim=5, CAM=False):
        """Create second stage model
         Args:
         class_dim: dim of multi-class vector
         CAM:  是否创建CAM heatmap
         Returns:
         * A list contain 4/5 tensors / ops:
             - loss, cross-entropy loss tensor
             - accuracy, accuracy metric tensor
             - predict, model output tensor activated by softmax
             - hacked_img_id, img_id tensor
             - cam_heatmap, only if CAM == True, class activation map tensor
         * reader, reader op to feed data into placeholder
         """
        self.input_feature = fluid.data(name='{}_input'.format(self.name),
                                        shape=[-1] + self.data_shape,
                                        dtype='uint8')
        self.label = fluid.data(name='{}_label'.format(self.name),
                                shape=[-1, 1],
                                dtype='int64')
        self.img_id = fluid.data(name='{}_img_id'.format(self.name),
                                 shape=[-1, 1],
                                 dtype='int64')

        # Lesion Net
        lesion = lesionnet.LesionNet()

        # Backbone
        if self.main_arch in ResNetModels:
            model = resnet.__dict__[self.main_arch]()
        elif self.main_arch in DenseNetModels:
            model = densenet.__dict__[self.main_arch]()
        elif self.main_arch == "inception":
            model = inception.InceptionV4()
        else:
            raise ValueError("Model {} is not supported.".format(
                self.main_arch))

        inp = FL.transpose(FL.cast(self.input_feature, "float32"),
                           perm=[0, 3, 1, 2]) / 255.

        # Element wise mul of lesion prob maps and input image
        lesion_probs = lesion.net(inp, class_dim=4)  # bs, 4, 16, 16
        lesion_probs = FL.split(lesion_probs, num_or_sections=4,
                                dim=1)  # list of 4 prob maps, each bs*1*16*16

        I = FL.image_resize(inp, out_shape=(512, 512), resample="BILINEAR")
        Is = []

        for prob_map in lesion_probs:  # avoid shadowing the common `L` layers alias
            W = FL.image_resize(prob_map, out_shape=(512, 512),
                                resample="NEAREST")  # bs, 1, 512, 512
            temp_I = FL.elementwise_mul(
                I, FL.expand(W + 1.,
                             expand_times=[1, 3, 1,
                                           1]))  # weighted by (W + 1.), bs, 3, 512, 512
            Is.append(temp_I)
        I = FL.concat(Is, axis=1)  # bs, 3*4, 512, 512
        I.stop_gradient = True

        lesion_pos_prob = 1. - lesion_probs[0]
        main_arch_out = model.net(I,
                                  class_dim=class_dim,
                                  lesion_map=lesion_pos_prob,
                                  CAM=CAM)

        if CAM:
            logit, heatmaps = main_arch_out
        else:
            logit = main_arch_out

        predict = FL.softmax(logit)
        accuracy = self.create_acc_op(predict, self.label)
        loss = self.create_loss_op(predict, self.label)
        reader = self.create_reader_op(
            [self.img_id, self.input_feature, self.label])

        # This is a hack
        hacked_img_id = FL.cast(self.img_id, "int32")

        if CAM:
            cam_heatmap = self.create_cam_op(predict, class_dim, heatmaps)
            return [loss, accuracy, predict, hacked_img_id,
                    cam_heatmap], reader

        return [loss, accuracy, predict, hacked_img_id], reader
Ejemplo n.º 31
def soft_cross_entropy(inp, target):
    inp_likelihood = L.log_softmax(inp, axis=-1)
    target_prob = L.softmax(target, axis=-1)
    return -1. * L.mean(L.reduce_sum(inp_likelihood * target_prob, dim=-1))
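
`soft_cross_entropy` is the usual knowledge-distillation objective: the cross-entropy between the student's log-probabilities and the teacher's soft targets. A minimal dygraph usage sketch (the logits below are made-up stand-ins, and `L` is `paddle.fluid.layers` as in the function above):

import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    student_logits = dg.to_variable(np.array([[2.0, 0.5, -1.0]], dtype="float32"))
    teacher_logits = dg.to_variable(np.array([[3.0, 0.1, -2.0]], dtype="float32"))
    teacher_logits.stop_gradient = True  # the teacher provides fixed targets
    loss = soft_cross_entropy(student_logits, teacher_logits)
    print(loss.numpy())  # scalar loss, averaged over the batch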