Example No. 1
    def forward(self, x, speaker_embed=None):
        """
        Convert mel spectrogram or decoder hidden states to linear spectrogram.
        
        Args:
            x (Variable): shape(B, T_mel, C_in), dtype float32, converter inputs, where C_in is the input channel size of the converter. It can be either C_mel (the mel spectrogram channel size) or C_dec // r.
                When the converter takes the mel spectrogram as input, C_in = C_mel; when it takes decoder states as input, C_in = C_dec // r.
            speaker_embed (Variable, optional): shape(B, C_sp), dtype float32, speaker embedding, where C_sp means the speaker embedding size.

        Returns:
            out (Variable): shape(B, T_lin, C_lin), the output linear spectrogram, where C_lin is the channel size of the linear spectrogram and T_lin is its length (time steps). T_lin = time_upsampling * T_mel, which depends on the converter's time_upsampling.
        """
        x = F.transpose(x, [0, 2, 1])
        x = self.first_conv_proj(x)

        if speaker_embed is not None:
            speaker_embed = F.dropout(
                speaker_embed,
                self.dropout,
                dropout_implementation="upscale_in_train")

        for layer in chain(self.upsampling_convolutions, self.convolutions):
            if isinstance(layer, Conv1DGLU):
                x = layer(x, speaker_embed)
            else:
                x = layer(x)

        out = self.last_conv_proj(x)
        out = F.transpose(out, [0, 2, 1])
        return out
Example No. 2
    def forward(self, encoder_output):
        """
        Predict the duration of each character.
        
        Args:
            encoder_output (Variable): shape(B, T, C), dtype float32, the encoder output.
        
        Returns:
            out (Variable): shape(B, T), the predicted duration per character (the last dimension is squeezed out).
        """
        # encoder_output.shape(N, T, C)
        out = layers.transpose(encoder_output, [0, 2, 1])
        out = self.conv1(out)
        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(layers.relu(self.layer_norm1(out)),
                             self.dropout,
                             dropout_implementation='upscale_in_train')
        out = layers.transpose(out, [0, 2, 1])
        out = self.conv2(out)
        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(layers.relu(self.layer_norm2(out)),
                             self.dropout,
                             dropout_implementation='upscale_in_train')
        out = layers.relu(self.linear(out))
        out = layers.squeeze(out, axes=[-1])

        return out
Example No. 3
    def forward(self, input):
        x = self.model(input)

        gap = adaptive_pool2d(x, 1, pool_type='avg')
        gap_logit = self.gap_fc(reshape(gap, shape=[x.shape[0], -1]))
        gap_weight = list(self.gap_fc.parameters())[0]
        gap_weight = transpose(gap_weight, perm=[1, 0])
        gap = x * unsqueeze(unsqueeze(gap_weight, 2), 3)

        gmp = adaptive_pool2d(x, 1, pool_type='max')
        gmp_logit = self.gmp_fc(reshape(gmp, shape=[x.shape[0], -1]))
        gmp_weight = list(self.gmp_fc.parameters())[0]
        gmp_weight = transpose(gmp_weight, perm=[1, 0])
        gmp = x * unsqueeze(unsqueeze(gmp_weight, 2), 3)

        cam_logit = concat([gap_logit, gmp_logit], 1)
        x = concat([gap, gmp], 1)
        x = self.leaky_relu(self.conv1x1(x))

        heatmap = reduce_sum(x, dim=1, keep_dim=True)

        x = self.pad(x)
        out = self.conv(x)

        return out, cam_logit, heatmap
Example No. 4
    def forward(self, input):
        """
        Compute feed forward network result.
        
        Args:
            input (Variable): shape(B, T, C), dtype float32, the input value. 
                
        Returns:
            output (Variable): shape(B, T, C), the result after FFN. 
        """
        x = layers.transpose(input, [0, 2, 1])
        # FFN network
        x = self.w_2(layers.relu(self.w_1(x)))

        # dropout
        x = layers.dropout(x,
                           self.dropout,
                           dropout_implementation='upscale_in_train')

        x = layers.transpose(x, [0, 2, 1])
        # residual connection
        x = x + input

        #layer normalization
        output = self.layer_norm(x)

        return output
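
As a rough illustration of the block above, the same computation can be sketched in plain numpy, assuming w_1 and w_2 act independently on each time step (kernel-size-1 convolutions) and leaving dropout out:

import numpy as np

def layer_norm(x, eps=1e-5):
    # normalize over the last (channel) dimension
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(var + eps)

def ffn(x, w1, b1, w2, b2):
    # position-wise feed-forward with residual connection and layer normalization
    h = np.maximum(x @ w1 + b1, 0.0)        # relu(w_1(x))
    return layer_norm(h @ w2 + b2 + x)      # w_2(...) + input, then layer norm

B, T, C, C_hidden = 2, 5, 8, 32
x = np.random.randn(B, T, C)
w1, b1 = np.random.randn(C, C_hidden), np.zeros(C_hidden)
w2, b2 = np.random.randn(C_hidden, C), np.zeros(C)
print(ffn(x, w1, b1, w2, b2).shape)         # (2, 5, 8)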
Example No. 5
    def detect(self, batch_idx, conf_preds, decoded_boxes, mask_data):
        """ Perform nms for only the max scoring class that isn't background (class 0) """
        # All box coordinates are decoded first, then filtered by score. Decoding only the boxes that survive the score filter could be considered instead.
        cur_scores = conf_preds[batch_idx, 1:, :]
        conf_scores = P.reduce_max(cur_scores, dim=0)
        '''
        The GPU build of paddlepaddle 1.6.2 has an issue: if keep is [None] and is then used in gather(),
        the error "cudaGetLastError  invalid configuration argument errno: 9" is raised. The CPU build runs fine.
        To avoid this, keep must not be [None], so an extra element keep_extra is appended to keep here.
        '''
        keep = P.where(conf_scores > self.conf_thresh)
        keep_extra = P.where(conf_scores < self.conf_thresh)
        keep_extra = keep_extra[:1]
        keep = P.concat([keep, keep_extra], axis=0)
        scores = P.gather(P.transpose(cur_scores, perm=[1, 0]), keep)
        scores = P.transpose(scores, perm=[1, 0])
        boxes = P.gather(decoded_boxes, keep)
        masks = P.gather(mask_data[batch_idx], keep)
        '''
        Because keep_extra is added above, keep is guaranteed to contain at least one box.
        Once the issue above is fixed upstream, delete the keep_extra code above and uncomment the code below.
        This is done because testing whether keep is empty is too hard.
        '''
        # There may be no boxes kept, so add a bottom-scoring box so that fast_nms() can proceed
        # extra_box = P.fill_constant((1, 4), 'float32', value=-1.0)
        # extra_score = P.fill_constant((P.shape(cur_scores)[0], 1), 'float32', value=-1.0)
        # extra_mask = P.fill_constant((1, P.shape(mask_data)[2]), 'float32', value=-1.0)
        # boxes = P.concat([boxes, extra_box], axis=0)
        # scores = P.concat([scores, extra_score], axis=1)
        # masks = P.concat([masks, extra_mask], axis=0)

        return self.fast_nms(boxes, scores, masks)
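
A tiny numpy sketch (with made-up scores) of the workaround described in the comments above: appending one below-threshold index guarantees that the subsequent gather never receives an empty index set.

import numpy as np

conf_thresh = 0.3
conf_scores = np.array([0.10, 0.05, 0.20])                 # suppose no score passes the threshold
keep = np.where(conf_scores > conf_thresh)[0]              # empty
keep_extra = np.where(conf_scores < conf_thresh)[0][:1]    # one extra, low-scoring index
keep = np.concatenate([keep, keep_extra], axis=0)
print(keep)                                                 # always at least one index, e.g. [0]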
Example No. 6
def compute_l2_normalized_weight(v, g, dim):
    shape = v.shape
    ndim = len(shape)

    if dim is None:
        v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12)
    elif dim == 0:
        param_matrix = F.reshape(v, (shape[0], np.prod(shape[1:])))
        v_normalized = F.l2_normalize(param_matrix, axis=1)
    elif dim == -1 or dim == ndim - 1:
        param_matrix = F.reshape(v, (np.prod(shape[:-1]), shape[-1]))
        v_normalized = F.l2_normalize(param_matrix, axis=0)
    else:
        perm = list(range(ndim))
        perm[0] = dim
        perm[dim] = 0
        transposed_param = F.transpose(v, perm)
        param_matrix = F.reshape(
            transposed_param,
            (transposed_param.shape[0], np.prod(transposed_param.shape[1:])))
        v_normalized = F.l2_normalize(param_matrix, axis=1)
        v_normalized = F.transpose(v_normalized, perm)
    v_normalized = F.reshape(v_normalized, shape)
    weight = F.elementwise_mul(v_normalized,
                               g,
                               axis=dim if dim is not None else -1)
    return weight
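
The reparameterization can be checked with a small numpy sketch (assuming dim indexes the axis that g has one entry for): every slice of the resulting weight along dim ends up with L2 norm |g|.

import numpy as np

def weight_norm_np(v, g, dim):
    # normalize v over every axis except `dim`, then scale each slice by g
    axes = tuple(i for i in range(v.ndim) if i != dim)
    norm = np.sqrt((v ** 2).sum(axis=axes, keepdims=True))
    g_shape = [-1 if i == dim else 1 for i in range(v.ndim)]
    return g.reshape(g_shape) * v / norm

v = np.random.randn(4, 3, 5)
g = np.random.randn(4)
w = weight_norm_np(v, g, dim=0)
print(np.allclose(np.sqrt((w ** 2).sum(axis=(1, 2))), np.abs(g)))   # True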
Example No. 7
    def forward(self, queries, keys, values, attn_bias, cache=None):
        # compute q ,k ,v
        keys = queries if keys is None else keys
        values = keys if values is None else values
        q = self.q_fc(queries)
        k = self.k_fc(keys)
        v = self.v_fc(values)
        # split head
        q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
        q = layers.transpose(x=q, perm=[0, 2, 1, 3])
        k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
        k = layers.transpose(x=k, perm=[0, 2, 1, 3])
        v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
        v = layers.transpose(x=v, perm=[0, 2, 1, 3])

        if cache is not None:
            cache_k, cache_v = cache["k"], cache["v"]
            k = layers.concat([cache_k, k], axis=2)
            v = layers.concat([cache_v, v], axis=2)
            cache["k"], cache["v"] = k, v
        # scale dot product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.d_model**-0.5)
        if attn_bias is not None:
            product += attn_bias
        weights = layers.softmax(product)
        if self.dropout_rate:
            weights = layers.dropout(weights, dropout_prob=self.dropout_rate)
        out = layers.matmul(weights, v)
        out = layers.transpose(out, perm=[0, 2, 1, 3])
        out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
        out = self.proj_fc(out)
        return out
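
The core of the snippet, scaled dot-product attention over per-head tensors, can be sketched in numpy as follows (dropout and the decoder cache omitted):

import numpy as np

def scaled_dot_attention(q, k, v, d_model, attn_bias=None):
    # q, k, v: (B, n_head, T, d_head); scores are scaled by d_model ** -0.5 as above
    scores = np.matmul(q, k.transpose(0, 1, 3, 2)) * d_model ** -0.5
    if attn_bias is not None:
        scores = scores + attn_bias
    scores = scores - scores.max(axis=-1, keepdims=True)    # numerically stable softmax
    weights = np.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)
    return np.matmul(weights, v)

B, H, T, D = 2, 4, 6, 16
q, k, v = (np.random.randn(B, H, T, D) for _ in range(3))
print(scaled_dot_attention(q, k, v, d_model=H * D).shape)    # (2, 4, 6, 16)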
Example No. 8
    def forward(self, input):
        """
        Compute the mel spectrum.
        
        Args:
            input (Variable): shape(B, T, C), dtype float32, the result of mel linear projection. 
               
        Returns:
           output (Variable): shape(B, T, C), the result after postconvnet.
        """

        input = layers.transpose(input, [0, 2, 1])
        length = input.shape[-1]
        for i in range(self.num_conv - 1):
            batch_norm = self.batch_norm_list[i]
            conv = self.conv_list[i]

            input = layers.dropout(layers.tanh(
                batch_norm(conv(input)[:, :, :length])),
                                   self.dropout,
                                   dropout_implementation='upscale_in_train')
        conv = self.conv_list[self.num_conv - 1]
        input = conv(input)[:, :, :length]
        if self.batchnorm_last:
            batch_norm = self.batch_norm_list[self.num_conv - 1]
            input = layers.dropout(batch_norm(input),
                                   self.dropout,
                                   dropout_implementation='upscale_in_train')
        output = layers.transpose(input, [0, 2, 1])
        return output
Example No. 9
def _weight_norm(v, g, dim):
    shape = v.shape
    ndims = len(shape)

    if dim is None:
        v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12)
    elif dim == 0:
        p_matrix = F.reshape(v, (shape[0], -1))
        v_normalized = F.l2_normalize(p_matrix, axis=1)
        v_normalized = F.reshape(v_normalized, shape)
    elif dim == -1 or dim == ndims - 1:
        p_matrix = F.reshape(v, (-1, shape[-1]))
        v_normalized = F.l2_normalize(p_matrix, axis=0)
        v_normalized = F.reshape(v_normalized, shape)
    else:
        perm = list(range(ndims))
        perm[0] = dim
        perm[dim] = 0
        p_transposed = F.transpose(v, perm)
        transposed_shape = p_transposed.shape
        p_matrix = F.reshape(p_transposed, (p_transposed.shape[0], -1))
        v_normalized = F.l2_normalize(p_matrix, axis=1)
        v_normalized = F.reshape(v_normalized, transposed_shape)
        v_normalized = F.transpose(v_normalized, perm)
    weight = F.elementwise_mul(v_normalized,
                               g,
                               axis=dim if dim is not None else -1)
    return weight
Example No. 10
    def forward(self, x, condition=None):
        """compute the output distribution (represented by its parameters).

        Args:
            x (Variable): shape(B, T), dtype float32, the input waveform.
            condition (Variable, optional): shape(B, C_cond, T), dtype float32, the upsampled condition. Defaults to None.

        Returns:
            Variable: shape(B, T, C_output), dtype float32, the parameter of the output distributions.
        """

        # Causal Conv
        if self.loss_type == "softmax":
            x = F.clip(x, min=-1., max=0.99999)
            x = quantize(x, self.output_dim)
            x = self.embed(x)  # (B, T, C)
        else:
            x = F.unsqueeze(x, axes=[-1])  # (B, T, 1)
            x = self.embed(x)  # (B, T, C)
        x = F.transpose(x, perm=[0, 2, 1])  # (B, C, T)

        # Residual & skip-connection & linears
        z = self.resnet(x, condition)

        z = F.transpose(z, [0, 2, 1])
        z = F.relu(self.proj2(F.relu(self.proj1(z))))

        y = self.proj3(z)
        return y
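
A small numpy sketch of the quantization step used in the softmax branch; the exact quantize implementation is not shown here, so the uniform mapping of [-1, 1) onto output_dim integer bins is an assumption:

import numpy as np

def quantize_np(x, n):
    # assumed behaviour: clip to [-1, 1) and map uniformly onto integer bins 0 .. n-1
    x = np.clip(x, -1.0, 0.99999)
    return np.floor((x + 1.0) / 2.0 * n).astype(np.int64)

wav = np.array([-1.0, -0.5, 0.0, 0.5, 0.99999])
print(quantize_np(wav, 256))   # [  0  64 128 192 255]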
Example No. 11
    def forward(self, seq):
        seq = layers.transpose(seq, [0, 2, 1])
        seq = layers.unsqueeze(seq, -1)
        seq = self.conv2d(seq)
        seq = layers.squeeze(seq, [-1])
        seq = layers.transpose(seq, [0, 2, 1])
        return seq
Example No. 12
    def forward(self, queries, keys, values, attn_bias, past_cache):
        assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3
        #bsz, q_len, q_dim = queries.shape
        #bsz, k_len, k_dim = keys.shape
        #bsz, v_len, v_dim = values.shape
        #assert k_len == v_len

        q = self.q(queries)
        k = self.k(keys)
        v = self.v(values)

        cache = (k, v)
        if past_cache is not None:
            cached_k, cached_v = past_cache
            k = L.concat([cached_k, k], 1)
            v = L.concat([cached_v, v], 1)

        q = L.transpose(L.reshape(q, [0, 0, self.n_head, q.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim]
        k = L.transpose(L.reshape(k, [0, 0, self.n_head, k.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim]
        v = L.transpose(L.reshape(v, [0, 0, self.n_head, v.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim]


        q = L.scale(q, scale=self.d_key ** -0.5)
        score = L.matmul(q, k, transpose_y=True)
        if attn_bias is not None:
            score += attn_bias
        score = L.softmax(score, use_cudnn=True)
        score = self.dropout(score)

        out = L.matmul(score, v)
        out = L.transpose(out, [0, 2, 1, 3])
        out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]])

        out = self.o(out)
        return out, cache
Example No. 13
    def add_input(self, x, condition=None):
        """compute the output distribution (represented by its parameters) for a step. It works similarily with the `forward` method but in a `step-in-step-out` fashion.

        Args:
            x (Variable): shape(B, T=1), dtype float32, a step of the input waveform.
            condition (Variable, optional): shape(B, C_cond, T=1), dtype float32, a step of the upsampled condition. Defaults to None.

        Returns:
            Variable: shape(B, T=1, C_output), dtype float32, the parameter of the output distributions.
        """
        # Causal Conv
        if self.loss_type == "softmax":
            x = F.clip(x, min=-1., max=0.99999)
            x = quantize(x, self.output_dim)
            x = self.embed(x)  # (B, T, C), T=1
        else:
            x = F.unsqueeze(x, axes=[-1])  # (B, T, 1), T=1
            x = self.embed(x)  # (B, T, C)
        x = F.transpose(x, perm=[0, 2, 1])

        # Residual & skip-connection & linears
        z = self.resnet.add_input(x, condition)
        z = F.transpose(z, [0, 2, 1])
        z = F.relu(self.proj2(F.relu(self.proj1(z))))  # (B, T, C)

        # Output
        y = self.proj3(z)
        return y
Example No. 14
    def _prepare_qkv(self, queries, keys, values, cache=None):
        if keys is None:  # self-attention
            keys, values = queries, queries
            static_kv = False
        else:  # cross-attention
            static_kv = True

        q = self.q_fc(queries)
        q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
        q = layers.transpose(x=q, perm=[0, 2, 1, 3])

        if cache is not None and static_kv and "static_k" in cache:
            # for encoder-decoder attention in inference with cached keys/values
            k = cache["static_k"]
            v = cache["static_v"]
        else:
            k = self.k_fc(keys)
            v = self.v_fc(values)
            k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
            k = layers.transpose(x=k, perm=[0, 2, 1, 3])
            v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
            v = layers.transpose(x=v, perm=[0, 2, 1, 3])

        if cache is not None:
            if static_kv and "static_k" not in cache:
                # for encoder-decoder attention in inference without cached keys/values yet
                cache["static_k"], cache["static_v"] = k, v
            elif not static_kv:
                # for decoder self-attention in inference
                cache_k, cache_v = cache["k"], cache["v"]
                k = layers.concat([cache_k, k], axis=2)
                v = layers.concat([cache_v, v], axis=2)
                cache["k"], cache["v"] = k, v

        return q, k, v
Example No. 15
    def forward(self, src, mask, query_embed, pos_embed):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape
        src = L.reshape(src, (bs, c, -1))  # [bs, c, h * w]
        src = L.transpose(src, (0, 2, 1))  # [bs, h * w, c]

        pos_embed = L.reshape(pos_embed,
                              (bs, pos_embed.shape[1], -1))  # [bs, c, h * w]
        pos_embed = L.transpose(pos_embed, (0, 2, 1))  # [bs, h * w, c]

        query_embed = L.unsqueeze(query_embed, [0])  # [1, num_queries, c_q]
        query_embed = L.expand(query_embed,
                               (bs, 1, 1))  # [bs, num_queries, c_q]

        mask = L.reshape(mask, (bs, -1))  # [bs, h * w]

        tgt = L.zeros_like(query_embed)  # [bs, num_queries, c_q]

        memory, encoder_attn_weights = self.encoder(
            src, src_mask=mask, pos=pos_embed)  # [bs, h * w, c]
        hs, decoder_attn_weights = self.decoder(tgt,
                                                memory,
                                                memory_mask=mask,
                                                pos=pos_embed,
                                                query_pos=query_embed)
        # hs: [num_inter, bs, num_queries, c_q]

        memory = L.transpose(memory, (0, 2, 1))  # [bs, c, h * w]
        memory = L.reshape(memory, (bs, c, h, w))  # [bs, c, h, w]
        return hs, memory, encoder_attn_weights, decoder_attn_weights
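
The flattening at the top of this forward pass is just a reshape plus a transpose; a numpy round trip makes the layout explicit:

import numpy as np

bs, c, h, w = 2, 8, 4, 5
src = np.random.randn(bs, c, h, w)
seq = src.reshape(bs, c, h * w).transpose(0, 2, 1)     # [bs, h * w, c] token sequence
back = seq.transpose(0, 2, 1).reshape(bs, c, h, w)     # restore the [bs, c, h, w] feature map
print(seq.shape, np.allclose(back, src))               # (2, 20, 8) True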
Example No. 16
def _matrix_nms(bboxes, cate_labels, cate_scores, kernel='gaussian', sigma=2.0):
    """Matrix NMS for multi-class bboxes.
    Args:
        bboxes (Tensor): shape (n, 4)
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str):  'linear' or 'gaussian'
        sigma (float): std in gaussian method
    Returns:
        Tensor: cate_scores_update, tensors of shape (n)
    """
    n_samples = len(cate_labels)
    if n_samples == 0:
        return []

    # Compute an n×n IoU matrix: the pairwise IoU between the two (identical) sets of boxes
    iou_matrix = jaccard(bboxes, bboxes)   # shape: [n_samples, n_samples]
    iou_matrix = paddle.triu(iou_matrix, diagonal=1)   # keep only the upper-triangular part

    # label_specific matrix.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])   # shape: [n_samples, n_samples]
    # Element (i, j) says whether prediction i and prediction j share the same class id; only same-class boxes suppress each other.
    d = cate_labels_x - L.transpose(cate_labels_x, [1, 0])
    d = L.pow(d, 2)   # 0 where the classes match, > 0 otherwise (comparing with == 0 is unreliable in tf, so < 1 is used)
    label_matrix = paddle.triu(L.cast(d < 1, 'float32'), diagonal=1)   # shape: [n_samples, n_samples]

    # IoU compensation
    # Zero out the IoU between boxes of different classes, keep same-class IoUs, then take the column-wise maximum IoU
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, [0, ])   # shape: [n_samples, ]
    # Value a0 in row 0 of compensate_iou (repeated n_samples times) is the highest IoU between object 0 and any higher-scoring same-class object;
    # value a1 in row 1 (repeated n_samples times) is the highest IoU between object 1 and any higher-scoring same-class object, and so on.
    # Each column of compensate_iou thus lists, for objects 0, 1, ..., n_samples-1, the highest IoU with any higher-scoring object of the same class.
    compensate_iou = L.transpose(L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1]), [1, 0])   # shape: [n_samples, n_samples]

    # IoU decay
    # Zero out the IoU between boxes of different classes, keep same-class IoUs.
    # decay_iou[i][j] is the IoU between prediction i and prediction j, zeroed when their classes differ; only the upper triangle is kept.
    decay_iou = iou_matrix * label_matrix   # shape: [n_samples, n_samples]

    # matrix nms
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_sum(decay_matrix / compensate_matrix, [0, ])
    elif kernel == 'linear':
        # Look at column j (column 2 in the example from 1_test_matrixnms.py):
        # column 2 of decay_iou      is [0.9389, 0.9979, 0,      0]: object 2 has IoU 0.9389 and 0.9979 with the 2 higher-scoring same-class objects.
        # column 2 of compensate_iou is [0,      0.9409, 0.9979, 0]: those 2 higher-scoring same-class objects themselves have highest IoUs of 0 and 0.9409 with objects scoring above them.
        # column 2 of decay_matrix   is [0.0610, 0.0348, 485.28, 1]: the column minimum is 0.0348 (object 1 is what suppresses object 2). The last 2 values never matter, since they are always >= 1.
        # Summary: if the i-th value in column j of decay_matrix is the minimum, then object i is the one that suppresses object j.
        # In other words, the larger decay_iou is, the smaller decay_matrix becomes.
        decay_matrix = (1-decay_iou)/(1-compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, [0, ])
    else:
        raise NotImplementedError

    # update the scores
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
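
The decay logic spelled out in the comments can be condensed into a short numpy sketch (linear kernel only; boxes assumed sorted by descending score):

import numpy as np

def matrix_nms_decay(iou, labels):
    # iou: (n, n) pairwise IoU, labels: (n,) class ids, both ordered by descending score
    n = len(labels)
    iou = np.triu(iou, k=1)                                         # pair (i, j): box i outscores box j
    same_class = np.triu((labels[:, None] == labels[None, :]).astype(float), k=1)
    decay_iou = iou * same_class                                    # IoU of j with each higher-scoring same-class box
    compensate_iou = np.tile(decay_iou.max(axis=0), (n, 1)).T       # how overlapped each suppressor i itself is
    decay_matrix = (1.0 - decay_iou) / (1.0 - compensate_iou)
    return decay_matrix.min(axis=0)                                 # decay coefficient per box

iou = np.array([[1.0, 0.8, 0.1],
                [0.8, 1.0, 0.0],
                [0.1, 0.0, 1.0]])
labels = np.array([0, 0, 1])
scores = np.array([0.9, 0.7, 0.6])
print(scores * matrix_nms_decay(iou, labels))   # box 1 is decayed by box 0: roughly [0.9, 0.14, 0.6]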
Example No. 17
    def cal_kv(self, keys, values):
        k = self.k_fc(keys)
        v = self.v_fc(values)
        k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
        k = layers.transpose(x=k, perm=[0, 2, 1, 3])
        v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
        v = layers.transpose(x=v, perm=[0, 2, 1, 3])
        return k, v
Example No. 18
    def forward(self, key, value, query_input, mask=None, query_mask=None):
        """
        Compute attention.
        
        Args:
            key (Variable): shape(B, T, C), dtype float32, the input key of attention.
            value (Variable): shape(B, T, C), dtype float32, the input value of attention.
            query_input (Variable): shape(B, T, C), dtype float32, the input query of attention.
            mask (Variable, optional): shape(B, T_query, T_key), dtype float32, the mask of key. Defaults to None.
            query_mask (Variable, optional): shape(B, T_query, T_key), dtype float32, the mask of query. Defaults to None.
                
        Returns:
            result (Variable): shape(B, T, C), the result of multihead attention. 
            attention (Variable): shape(num_head * B, T, C), the attention of key and query. 
        """

        batch_size = key.shape[0]
        seq_len_key = key.shape[1]
        seq_len_query = query_input.shape[1]

        # Make multihead attention
        key = layers.reshape(
            self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
        value = layers.reshape(
            self.value(value),
            [batch_size, seq_len_key, self.num_head, self.d_k])
        query = layers.reshape(
            self.query(query_input),
            [batch_size, seq_len_query, self.num_head, self.d_q])

        key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]),
                             [-1, seq_len_key, self.d_k])
        value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]),
                               [-1, seq_len_key, self.d_k])
        query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]),
                               [-1, seq_len_query, self.d_q])

        result, attention = self.scal_attn(key,
                                           value,
                                           query,
                                           mask=mask,
                                           query_mask=query_mask)

        # concat all multihead result
        result = layers.reshape(
            result, [self.num_head, batch_size, seq_len_query, self.d_q])
        result = layers.reshape(layers.transpose(result, [1, 2, 0, 3]),
                                [batch_size, seq_len_query, -1])
        if self.is_concat:
            result = layers.concat([query_input, result], axis=-1)
        result = layers.dropout(self.fc(result),
                                self.dropout,
                                dropout_implementation='upscale_in_train')
        result = result + query_input

        result = self.layer_norm(result)
        return result, attention
Example No. 19
def simple_rnn(rnn_input,
               init_hidden,
               hidden_size,
               kernel_param_attr=None,
               recurrent_param_attr=None,
               bias_attr=None,
               act='relu',
               sequence_length=None,
               name='simple_rnn'):

    # Transpose (sequence x batch x hidden)
    rnn_input = layers.transpose(rnn_input, [1, 0, 2])

    # Generate Mask
    mask = None
    if sequence_length:
        max_seq_len = layers.shape(rnn_input)[0]
        mask = layers.sequence_mask(sequence_length,
                                    maxlen=max_seq_len,
                                    dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    # Init
    simple_rnn = SimpleRNN_unit(rnn_input, hidden_size, kernel_param_attr,
                                recurrent_param_attr, bias_attr, act)

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(rnn_input)

        if mask:
            step_mask = rnn.step_input(mask)

        if init_hidden:
            pre_hidden = rnn.memory(init=init_hidden)
        else:
            pre_hidden = rnn.memory(batch_ref=rnn_input,
                                    shape=[-1, hidden_size])

        last_hidden = simple_rnn(step_in, pre_hidden)

        rnn.update_memory(pre_hidden, last_hidden)

        rnn.step_output(last_hidden)

        step_input = last_hidden

    rnn_out = rnn()

    last_hidden = rnn_out[-1]
    last_hidden = layers.reshape(last_hidden, shape=[1, -1, hidden_size])

    rnn_output = layers.transpose(rnn_out, [1, 0, 2])
    last_hidden = layers.transpose(last_hidden, [1, 0, 2])

    return rnn_output, last_hidden
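
The time-major recurrence that the transposes set up can be sketched in numpy: step over the first (time) axis, carry the hidden state, then transpose the stacked outputs back to batch-major.

import numpy as np

batch, seq_len, in_size, hidden_size = 2, 5, 3, 4
x = np.random.randn(batch, seq_len, in_size)
w_i = np.random.randn(in_size, hidden_size)
w_h = np.random.randn(hidden_size, hidden_size)

x_t = x.transpose(1, 0, 2)                          # (seq, batch, in_size), time-major
h = np.zeros((batch, hidden_size))
outputs = []
for step_in in x_t:
    h = np.maximum(step_in @ w_i + h @ w_h, 0.0)    # relu cell, as in the snippet above
    outputs.append(h)
rnn_output = np.stack(outputs).transpose(1, 0, 2)   # back to (batch, seq, hidden)
print(rnn_output.shape, h.shape)                    # (2, 5, 4) (2, 4)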
Example No. 20
def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None):
    """Matrix NMS for multi-class masks.

    Args:
        seg_masks (Tensor): shape (n, h, w), binary masks made of 0s and 1s
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str):  'linear' or 'gaussian'
        sigma (float): std in gaussian method
        sum_masks (Tensor):  shape (n, ), the area of each of the n objects

    Returns:
        Tensor: cate_scores_update, tensors of shape (n)
    """
    n_samples = L.shape(cate_labels)[0]   # number of objects
    seg_masks = L.reshape(seg_masks, (n_samples, -1))   # [n, h*w]
    # inter.
    inter_matrix = L.matmul(seg_masks, seg_masks, transpose_y=True)   # [n, n], the matrix times its own transpose: pairwise intersection areas
    # union.
    sum_masks_x = L.expand(L.reshape(sum_masks, (1, -1)), [n_samples, 1])     # [n, n], sum_masks repeated over n rows gives sum_masks_x
    # iou.
    iou_matrix = inter_matrix / (sum_masks_x + L.transpose(sum_masks_x, [1, 0]) - inter_matrix)
    rows = L.range(0, n_samples, 1, 'int32')
    cols = L.range(0, n_samples, 1, 'int32')
    rows = L.expand(L.reshape(rows, (1, -1)), [n_samples, 1])
    cols = L.expand(L.reshape(cols, (-1, 1)), [1, n_samples])
    tri_mask = L.cast(rows > cols, 'float32')
    iou_matrix = tri_mask * iou_matrix   # [n, n], keep only the upper-triangular part

    # label_specific matrix.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])     # [n, n], cate_labels repeated over n rows gives cate_labels_x
    label_matrix = L.cast(L.equal(cate_labels_x, L.transpose(cate_labels_x, [1, 0])), 'float32')
    label_matrix = tri_mask * label_matrix   # [n, n], keep only the upper-triangular part

    # IoU compensation
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, dim=0)
    compensate_iou = L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1])     # [n, n]
    compensate_iou = L.transpose(compensate_iou, [1, 0])      # [n, n]

    # IoU decay
    decay_iou = iou_matrix * label_matrix

    # # matrix nms
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_min((decay_matrix / compensate_matrix), dim=0)
    elif kernel == 'linear':
        decay_matrix = (1-decay_iou)/(1-compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, dim=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
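
The pairwise mask IoU computed at the top of this function reduces to a single matrix product over flattened masks; a numpy sketch:

import numpy as np

n, h, w = 4, 16, 16
seg_masks = (np.random.rand(n, h, w) > 0.5).astype(np.float32)   # binary masks
flat = seg_masks.reshape(n, -1)                                   # (n, h * w)
inter = flat @ flat.T                                             # pairwise intersection areas
sum_masks = flat.sum(axis=1)                                      # per-mask areas
iou = inter / (sum_masks[None, :] + sum_masks[:, None] - inter)
print(iou.shape, np.allclose(np.diag(iou), 1.0))                  # (4, 4) True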
Example No. 21
def rnn_decoder(gru_unit,
                cue_gru_unit,
                input,
                input_size,
                hidden_size,
                num_layers,
                memory,
                memory_mask,
                knowledge,
                output_size,
                init_hidden=None,
                mask=None,
                dropout=0.0,
                batch_first=True,
                name="decoder"):
    """ rnn decoder """
    input_emb = get_embedding(input, input_size, output_size)
    if batch_first:
        input_emb = layers.transpose(input_emb, perm=[1, 0, 2])
        if mask:
            trans_mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input_emb)
        step_mask = None

        if mask:
            step_mask = rnn.step_input(trans_mask)

        # split pre_hidden
        pre_hidden_list = []

        pre_hidden = rnn.memory(init=init_hidden)
        real_out, last_hidden = \
            decoder_step(gru_unit, cue_gru_unit, step_in, pre_hidden, input_size,
                         hidden_size, memory, memory_mask, knowledge, mask=step_mask)

        rnn.update_memory(pre_hidden, last_hidden)

        step_in = layers.squeeze(real_out, axes=[1])
        rnn.step_output(step_in)

    rnnout = rnn()
    rnnout = layers.transpose(rnnout, perm=[1, 0, 2])
    rnnout = layers.elementwise_mul(rnnout, mask, axis=0)

    output_in_size = hidden_size + hidden_size
    rnnout = layers.dropout(rnnout, dropout_prob=dropout)
    rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
    rnnout = fc(rnnout, hidden_size, output_size, name='dec_out_fc2')

    softmax_out = layers.softmax(rnnout)

    return softmax_out
Example No. 22
        def dot_attention(query, memory, mask=None):
            attn = layers.matmul(query, memory, transpose_y=True)

            if mask:
                attn = layers.transpose(attn, [1, 0, 2])
                attn = layers.elementwise_add(attn, mask * 1000000000, -1)
                attn = layers.transpose(attn, [1, 0, 2])
            weight = layers.softmax(attn)
            weight_memory = layers.matmul(weight, memory)

            return weight_memory, weight
Example No. 23
    def forward(self, input):

        x = self.DownBlock(input)

        gap = adaptive_pool2d(x, pool_size=[1, 1], pool_type='avg')

        gap_ = reshape(x=gap, shape=(x.shape[0], -1))

        gap_logit = self.gap_fc(gap_)

        gap_weight = self.gap_fc.parameters()[0]
        gap_weight = transpose(gap_weight, perm=[1, 0])
        gap_weight = unsqueeze(gap_weight, axes=2)
        gap_weight = unsqueeze(gap_weight, axes=3)

        gap = x * gap_weight

        gmp = adaptive_pool2d(x, pool_size=[1, 1], pool_type='max')

        gmp_ = reshape(x=gmp, shape=(x.shape[0], -1))

        gmp_logit = self.gmp_fc(gmp_)

        gmp_weight = self.gmp_fc.parameters()[0]
        gmp_weight = transpose(gmp_weight, perm=[1, 0])
        gmp_weight = unsqueeze(gmp_weight, axes=2)
        gmp_weight = unsqueeze(gmp_weight, axes=3)

        gmp = x * gmp_weight

        cam_logit = concat(input=[gap_logit, gmp_logit], axis=1)

        x = concat(input=[gap, gmp], axis=1)

        x = self.relu(self.conv1x1(x))

        heatmap = reduce_sum(x, dim=1, keep_dim=True)

        if self.light:
            x_ = adaptive_pool2d(x, pool_size=[1, 1], pool_type='avg')
            x_ = reshape(x=x_, shape=(x_.shape[0], -1))
            x_ = self.FC(x_)
        else:
            x_ = reshape(x, shape=(x.shape[0], -1))
            x_ = self.FC(x_)

        gamma, beta = self.gamma(x_), self.beta(x_)

        for i in range(self.n_blocks):
            x = getattr(self, 'UpBlock1_' + str(i + 1))(x, gamma, beta)
        out = self.UpBlock2(x)

        return out, cam_logit, heatmap
Example No. 24
def _relative_attention_inner(q, k, v, transpose):
    batch_size = layers.shape(q)[0]
    heads = layers.shape(q)[1]
    length = layers.shape(q)[2]

    xy_matmul = layers.matmul(q, k, transpose_y=transpose)
    x_t = layers.transpose(q, [2, 0, 1, 3])
    x_t_r = layers.reshape(x_t, [length, batch_size * heads, -1])
    x_tz_matmul = layers.matmul(x_t_r, v, transpose_y=transpose)
    x_tz_matmul_r = layers.reshape(x_tz_matmul,
                                   [length, batch_size, heads, -1])
    x_tz_matmul_r_t = layers.transpose(x_tz_matmul_r, [1, 2, 0, 3])
    return xy_matmul + x_tz_matmul_r_t
Example No. 25
def _attn_forward(self,
                  queries,
                  keys,
                  values,
                  attn_bias,
                  past_cache,
                  head_mask=None):
    assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3

    q = self.q(queries)
    k = self.k(keys)
    v = self.v(values)

    cache = (k, v)
    if past_cache is not None:
        cached_k, cached_v = past_cache
        k = L.concat([cached_k, k], 1)
        v = L.concat([cached_v, v], 1)

    if hasattr(self.q, 'fn') and self.q.fn.cur_config['expand_ratio'] is not None:
        n_head = int(self.n_head * self.q.fn.cur_config['expand_ratio'])
    else:
        n_head = self.n_head

    q = L.transpose(
        L.reshape(q, [0, 0, n_head, q.shape[-1] // n_head]),
        [0, 2, 1, 3])  #[batch, head, seq, dim]
    k = L.transpose(
        L.reshape(k, [0, 0, n_head, k.shape[-1] // n_head]),
        [0, 2, 1, 3])  #[batch, head, seq, dim]
    v = L.transpose(
        L.reshape(v, [0, 0, n_head, v.shape[-1] // n_head]),
        [0, 2, 1, 3])  #[batch, head, seq, dim]

    q = L.scale(q, scale=self.d_key**-0.5)
    score = L.matmul(q, k, transpose_y=True)
    if attn_bias is not None:
        score += attn_bias

    score = L.softmax(score, use_cudnn=True)
    score = self.dropout(score)
    if head_mask is not None:
        score = score * head_mask

    out = L.matmul(score, v)
    out = L.transpose(out, [0, 2, 1, 3])
    out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]])

    out = self.o(out)
    return out, cache
Example No. 26
def gru_rnn(input,
            input_size,
            hidden_size,
            init_hidden=None,
            batch_first=False,
            mask=None,
            num_layers=1,
            dropout=0.0,
            name="gru"):
    """ gru rnn """

    gru_unit = GRU_unit(input_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=dropout,
                        name=name + "_gru_unit")

    if batch_first:
        input = layers.transpose(x=input, perm=[1, 0, 2])
        if mask:
            mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input)
        step_mask = None

        if mask:
            step_mask = rnn.step_input(mask)

        pre_hidden = rnn.memory(init=init_hidden)
        new_hidden, last_hidden = gru_unit(step_in, pre_hidden, step_mask)
        rnn.update_memory(pre_hidden, last_hidden)
        step_in = new_hidden
        rnn.step_output(step_in)
        rnn.step_output(last_hidden)

    rnn_res = rnn()
    rnn_out = rnn_res[0]
    last_hidden = layers.slice(rnn_res[1],
                               axes=[0],
                               starts=[-1],
                               ends=[1000000000])
    last_hidden = layers.reshape(last_hidden,
                                 shape=[num_layers, -1, hidden_size])

    if batch_first:
        rnn_out = layers.transpose(x=rnn_out, perm=[1, 0, 2])

    return rnn_out, last_hidden
Example No. 27
    def forward(self, x):
        x = layers.transpose(x, perm=[0, 2, 1, 3, 4])
        x = fluid.layers.pool3d(x,
                                pool_size=(3, 1, 1),
                                pool_type='avg',
                                pool_stride=(2, 1, 1))
        b, c, t, h, w = x.shape
        x = layers.transpose(x, perm=[0, 2, 1, 3, 4])
        x = layers.reshape(x, shape=[b * t, c, h, w])
        x = self.stem(x)
        #print(self.stem.weight.numpy().sum())
        x = self.bn1(x)
        x = layers.pool2d(x,
                          pool_size=3,
                          pool_type='max',
                          pool_stride=2,
                          pool_padding=1)
        x = self.res2(x)
        x = self.res3(x)
        bt, c, h, w = x.shape
        x = layers.reshape(x, shape=[b, t, c, h, w])
        x = layers.transpose(x, perm=[0, 2, 1, 3, 4])
        x = fluid.layers.pool3d(x,
                                pool_size=(3, 1, 1),
                                pool_type='avg',
                                pool_stride=(2, 1, 1))
        b, c, t, h, w = x.shape
        x = layers.transpose(x, perm=[0, 2, 1, 3, 4])
        res = layers.reshape(x[:, 1:-1], shape=[-1, c, h, w])
        x = layers.reshape(x, shape=[b * t, c, h, w])
        x = self.rep_flow(x)
        x = self.flow_conv(x)
        x = self.rep_flow2(x)
        x = layers.relu(res + x)
        x = self.res4(x)
        x = self.res5(x)

        x = self.dropout(x)
        x = layers.reduce_mean(x, dim=3)
        x = layers.reduce_mean(x, dim=2)

        x = layers.reshape(x, shape=[x.shape[0], -1])
        x = self.classify(x)

        x = layers.reshape(x, shape=[b, -1, self.num_classes])

        x = layers.reduce_mean(x, dim=1)
        return x
Example No. 28
def PredictionModule(x,
                     num_priors,
                     num_classes,
                     mask_dim,
                     shared_conv_w,
                     shared_conv_b,
                     shared_bbox_w,
                     shared_bbox_b,
                     shared_conf_w,
                     shared_conf_b,
                     shared_mask_w,
                     shared_mask_b):
    '''
    Adapted from the PredictionModule in the DSSD algorithm, but using 3x3 convolutions. Three branches predict the bbox, conf, and mask coefficients respectively.
               x
             / | \
        bbox conf mask
    '''
    x = P.conv2d(x, 256, filter_size=(3, 3), stride=1, padding=1,
                 param_attr=shared_conv_w,
                 bias_attr=shared_conv_b)
    x = P.relu(x)

    bbox_x = x
    conf_x = x
    mask_x = x

    bbox = P.conv2d(bbox_x, num_priors * 4, filter_size=(3, 3), stride=1, padding=1,
                    param_attr=shared_bbox_w,
                    bias_attr=shared_bbox_b)
    bbox = P.transpose(bbox, perm=[0, 2, 3, 1])
    bbox = P.reshape(bbox, (P.shape(bbox)[0], -1, 4))

    conf = P.conv2d(conf_x, num_priors * num_classes, filter_size=(3, 3), stride=1, padding=1,
                    param_attr=shared_conf_w,
                    bias_attr=shared_conf_b)
    conf = P.transpose(conf, perm=[0, 2, 3, 1])
    conf = P.reshape(conf, (P.shape(conf)[0], -1, num_classes))

    mask = P.conv2d(mask_x, num_priors * mask_dim, filter_size=(3, 3), stride=1, padding=1,
                    param_attr=shared_mask_w,
                    bias_attr=shared_mask_b)
    mask = P.transpose(mask, perm=[0, 2, 3, 1])
    mask = P.reshape(mask, (P.shape(mask)[0], -1, mask_dim))
    mask = P.tanh(mask)

    preds = {'loc': bbox, 'conf': conf, 'mask': mask}
    return preds
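
Each head output follows the same reshaping pattern: channel-first conv output to channel-last, then one row per prior. A numpy sketch of the bbox branch:

import numpy as np

n, num_priors, h, w = 2, 3, 5, 5
bbox = np.random.randn(n, num_priors * 4, h, w)   # conv output, NCHW
bbox = bbox.transpose(0, 2, 3, 1)                 # NHWC
bbox = bbox.reshape(n, -1, 4)                     # (n, h * w * num_priors, 4)
print(bbox.shape)                                 # (2, 75, 4)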
Example No. 29
    def create_cam_op(self, predict, class_dim, heatmaps):
        """compute loss with tensor

         Args:
         predict: model output tensor activated by softmax
         class_dim: dim of multi-class vector
         heatmaps: 全局池化前的特征图

         Returns:
         heatmaps: class activation map
         """
        if self.main_arch in DenseNetModels:
            weights_shape = 1024
            name = "fc_weights"
        elif self.main_arch == "xception":
            weights_shape = 2048
            name = "fc_weights"
        else:
            raise ValueError(
                "Calc CAM of model arch {} is not supported.".format(
                    self.main_arch))

        fc_weights = FL.create_parameter(shape=[weights_shape, class_dim],
                                         dtype='float32',
                                         name=name)  # 1024, 5

        pred_idx = FL.argmax(predict, 1)  # bs, 1
        fc_weights = FL.transpose(fc_weights, perm=[1, 0])  # 5, 1024
        fc_weights = FL.gather(fc_weights, index=pred_idx)  # bs, 1024

        heatmaps = heatmaps * fc_weights  # bs, 1024, 16, 16
        heatmaps = FL.reduce_sum(heatmaps, dim=1, keep_dim=False)

        return heatmaps
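
In numpy terms, the CAM is the channel-wise sum of the feature maps weighted by the classifier weights of each sample's predicted class; a minimal sketch with made-up shapes:

import numpy as np

bs, c, h, w, class_dim = 2, 1024, 16, 16, 5
heatmaps = np.random.randn(bs, c, h, w)                   # feature maps before global pooling
fc_weights = np.random.randn(c, class_dim)                # classifier weights
pred_idx = np.array([3, 1])                               # predicted class per sample (argmax of the softmax output)
w_sel = fc_weights.T[pred_idx]                            # (bs, c): weights of each predicted class
cam = (heatmaps * w_sel[:, :, None, None]).sum(axis=1)    # (bs, h, w)
print(cam.shape)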
Example No. 30
def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
    print("[synthesize] {}".format(sentence))
    text = en.text_to_sequence(sentence, p=1.0)
    text = np.expand_dims(np.array(text, dtype="int64"), 0)
    lengths = np.array([text.size], dtype=np.int64)
    text_seqs = dg.to_variable(text)
    text_lengths = dg.to_variable(lengths)

    decoder_layers = config["decoder_layers"]
    force_monotonic_attention = [False] * decoder_layers
    for i in monotonic_layers:
        force_monotonic_attention[i] = True

    with dg.no_grad():
        outputs = model(text_seqs,
                        text_lengths,
                        speakers=None,
                        force_monotonic_attention=force_monotonic_attention,
                        window=(config["backward_step"],
                                config["forward_step"]))
        decoded, refined, attentions = outputs
        if args.vocoder == "griffin-lim":
            wav_np = vocoder(refined.numpy()[0].T)
        else:
            wav = vocoder(F.transpose(refined, (0, 2, 1)))
            wav_np = wav.numpy()[0]
    return wav_np
Example No. 31
    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of the input tensor x
        into one dimension; this is the reverse of __split_heads.
        """
        if len(x.shape) == 3: return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # FIXME(guosheng): Decouple the program desc with batch_size.
        return layers.reshape(
            x=trans_x,
            shape=list(
                map(int,
                    [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]])))
Example No. 32
    def __split_heads(x, n_head):
        """
        Reshape the last dimension of the input tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        if n_head == 1:
            return x

        hidden_size = x.shape[-1]
        # FIXME(guosheng): Decouple the program desc with batch_size.
        reshaped = layers.reshape(
            x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])

        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
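
Both helpers amount to a reshape plus transpose round trip; a numpy sketch:

import numpy as np

bs, seq_len, n_head, d_head = 2, 7, 4, 16
x = np.random.randn(bs, seq_len, n_head * d_head)
split = x.reshape(bs, seq_len, n_head, d_head).transpose(0, 2, 1, 3)        # [bs, n_head, seq, d_head]
combined = split.transpose(0, 2, 1, 3).reshape(bs, seq_len, n_head * d_head)
print(split.shape, np.allclose(combined, x))                                # (2, 4, 7, 16) True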