Example #1
    def forward(self, audio, mel, audio_start, clip_kl=True):
        """Compute loss of Clarinet model.

        Args:
            audio (Variable): shape(B, T_audio), dtype float32, ground truth waveform.
            mel (Variable): shape(B, F, T_mel), dtype float32, condition (mel spectrogram here).
            audio_start (Variable): shape(B, ), dtype int64, audio start positions.
            clip_kl (bool, optional): whether to clip kl_loss by maximum=100. Defaults to True.

        Returns:
            Dict(str, Variable)
            loss (Variable): shape(1, ), dtype float32, total loss.
            kl (Variable): shape(1, ), dtype float32, kl divergence between the teacher's output distribution and student's output distribution.
            regularization (Variable): shape(1, ), dtype float32, a regularization term of the KL divergence.
            spectrogram_frame_loss (Variable): shape(1, ), dtype float32, stft loss, the mean squared error between the magnitudes of the spectrograms of the ground truth and synthesized waveforms.
        """
        batch_size, audio_length = audio.shape  # audio clip's length

        z = F.gaussian_random(audio.shape)
        condition = self.encoder(mel)  # (B, C, T)
        condition_slice = crop(condition, audio_start, audio_length)

        x, s_means, s_scales = self.student(z, condition_slice)  # all [0: T]
        s_means = s_means[:, 1:]  # (B, T-1), time steps [1: T]
        s_scales = s_scales[:, 1:]  # (B, T-1), time steps [1: T]
        s_clipped_scales = F.clip(s_scales, self.min_log_scale, 100.)

        # the teacher outputs a single Gaussian per time step
        y = self.teacher(x[:, :-1], condition_slice[:, :, 1:])
        _, t_means, t_scales = F.split(y, 3, -1)  # time steps [1: T]
        t_means = F.squeeze(t_means, [-1])  # (B, T-1), time steps [1: T]
        t_scales = F.squeeze(t_scales, [-1])  # (B, T-1), time steps [1: T]
        t_clipped_scales = F.clip(t_scales, self.min_log_scale, 100.)

        s_distribution = D.Normal(s_means, F.exp(s_clipped_scales))
        t_distribution = D.Normal(t_means, F.exp(t_clipped_scales))

        # KL divergence loss; it has a closed form here, so no Monte Carlo sampling is needed
        kl = s_distribution.kl_divergence(t_distribution)
        if clip_kl:
            kl = F.clip(kl, -100., 10.)
        # the first `context_size` steps are dropped
        kl = F.reduce_mean(kl[:, self.teacher.context_size:])
        # regularization: encourage the student's log-scales to match the teacher's
        regularization = F.mse_loss(t_scales[:, self.teacher.context_size:],
                                    s_scales[:, self.teacher.context_size:])

        # introduce information from real target
        spectrogram_frame_loss = F.mse_loss(self.stft.magnitude(audio),
                                            self.stft.magnitude(x))
        loss = kl + self.lmd * regularization + spectrogram_frame_loss
        loss_dict = {
            "loss": loss,
            "kl_divergence": kl,
            "regularization": regularization,
            "stft_loss": spectrogram_frame_loss
        }
        return loss_dict
Example #2
def focal_loss(pred, label, alpha=0.25, gamma=2, epsilon=1e-6):
    '''
        Increasing alpha puts more weight on (penalizes errors on) the foreground class.
        Increasing gamma down-weights easy, confident examples so training focuses on hard ones.
    '''
    pred = layers.clip(pred, epsilon, 1 - epsilon)
    label = layers.clip(label, epsilon, 1 - epsilon)
    loss = -1 * (alpha * layers.pow(
        (1 - pred), gamma) * label * layers.log(pred) +
                 (1 - alpha) * layers.pow(pred, gamma) *
                 (1 - label) * layers.log(1 - pred))
    return loss
Example #3
    def ffffffffffffffffffff(self, pred, target):
        '''
        Input boxes are in (cx, cy, w, h) format.
        '''
        assert pred.shape[0] == target.shape[0]

        pred = L.reshape(pred, [-1, 4])
        target = L.reshape(target, [-1, 4])

        pred = L.cast(pred, 'float32')
        target = L.cast(target, 'float32')

        # top-left corner of the intersection rectangle
        tl = L.elementwise_max((pred[:, :2] - pred[:, 2:] / 2),
                               (target[:, :2] - target[:, 2:] / 2))
        # bottom-right corner of the intersection rectangle
        br = L.elementwise_min((pred[:, :2] + pred[:, 2:] / 2),
                               (target[:, :2] + target[:, 2:] / 2))

        area_p = paddle.prod(pred[:, 2:], 1)  # area of the predicted boxes
        area_g = paddle.prod(target[:, 2:], 1)  # area of the ground-truth boxes

        # does the intersection rectangle exist?
        # en = (tl < br).type(tl.type()).prod(dim=1)
        en = L.cast(tl < br, 'float32')
        en = paddle.prod(en, 1)  # does the intersection rectangle exist?

        area_i = paddle.prod(br - tl, 1) * en
        area_u = area_p + area_g - area_i
        iou = (area_i) / (area_u + 1e-16)

        if self.loss_type == "iou":
            loss = 1 - iou**2
        elif self.loss_type == "giou":
            c_tl = L.elementwise_min((pred[:, :2] - pred[:, 2:] / 2),
                                     (target[:, :2] - target[:, 2:] / 2))
            c_br = L.elementwise_max((pred[:, :2] + pred[:, 2:] / 2),
                                     (target[:, :2] + target[:, 2:] / 2))
            area_c = paddle.prod(c_br - c_tl, 1)

            # clip area_c to the interval [1e-16, np.inf]
            area_c = L.clip(area_c, 1e-16, np.inf)
            giou = iou - (area_c - area_u) / area_c
            # clip giou to the interval [-1.0, 1.0]
            giou = L.clip(giou, -1.0, 1.0)
            loss = 1 - giou
        if self.reduction == "mean":
            loss = loss.mean()
        elif self.reduction == "sum":
            loss = loss.sum()

        return loss
Example #4
def _de_sigmoid(x, eps=1e-7):
    # clip x to the interval [eps, 1 / eps]
    x = L.clip(x, eps, 1 / eps)

    # take the reciprocal, then subtract one
    x = 1.0 / x - 1.0

    # clip e^(-x) to the interval [eps, 1 / eps]
    x = L.clip(x, eps, 1 / eps)

    # take the log, then negate
    x = -L.log(x)
    return x
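A quick sanity-check sketch (assumed usage, with `L` standing for `paddle.fluid.layers`): since _de_sigmoid is the inverse of the sigmoid up to clipping, feeding it sigmoid outputs should approximately recover the original logits.

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as L

with fluid.dygraph.guard():
    logits = fluid.dygraph.to_variable(np.array([[-2.0, 0.5, 3.0]], dtype="float32"))
    probs = L.sigmoid(logits)         # forward sigmoid
    recovered = _de_sigmoid(probs)    # roughly [-2.0, 0.5, 3.0] again
    print(recovered.numpy())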
Example #5
    def __call__(self, module):

        if hasattr(module, 'rho'):
            # TODO not sure whether applying clip directly to the parameter like this works correctly
            w = module.rho
            w = layers.clip(w, self.clip_min, self.clip_max)
            module.rho = w
Example #6
def generalied_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    The boxes should be in [x0, y0, x1, y1] format

    Returns a [N, M] pairwise matrix, where N = len(boxes1)
    and M = len(boxes2)
    """
    # degenerate boxes gives inf / nan results
    # so do an early check
    assert L.reduce_all(boxes1[:, 2:] >= boxes1[:, :2])
    assert L.reduce_all(boxes2[:, 2:] >= boxes2[:, :2])
    iou, union = box_iou(boxes1, boxes2)

    N, M = boxes1.shape[0], boxes2.shape[0]
    boxes1 = L.unsqueeze(boxes1, axes=[1])  # [N, 1, 4]
    boxes1 = L.expand(boxes1, [1, M, 1])  # [N, M, 4]
    boxes2 = L.unsqueeze(boxes2, axes=[0])  # [1, M, 4]
    boxes2 = L.expand(boxes2, [N, 1, 1])  # [N, M, 4]
    lt = L.elementwise_min(boxes1[:, :, :2], boxes2[:, :, :2])  # [N, M, 2]
    rb = L.elementwise_max(boxes1[:, :, 2:], boxes2[:, :, 2:])  # [N, M, 2]

    wh = L.clip(rb - lt, min=0, max=1e8)  # [N, M, 2]
    area = wh[:, :, 0] * wh[:, :, 1] + 1e-4  # prevent division by zero

    return iou - (area - union) / area
Example #7
    def forward(self, x, condition=None):
        """compute the output distribution (represented by its parameters).

        Args:
            x (Variable): shape(B, T), dtype float32, the input waveform.
            condition (Variable, optional): shape(B, C_cond, T), dtype float32, the upsampled condition. Defaults to None.

        Returns:
            Variable: shape(B, T, C_output), dtype float32, the parameter of the output distributions.
        """

        # Causal Conv
        if self.loss_type == "softmax":
            x = F.clip(x, min=-1., max=0.99999)
            x = quantize(x, self.output_dim)
            x = self.embed(x)  # (B, T, C)
        else:
            x = F.unsqueeze(x, axes=[-1])  # (B, T, 1)
            x = self.embed(x)  # (B, T, C)
        x = F.transpose(x, perm=[0, 2, 1])  # (B, C, T)

        # Residual & Skip-connection & linears
        z = self.resnet(x, condition)

        z = F.transpose(z, [0, 2, 1])
        z = F.relu(self.proj2(F.relu(self.proj1(z))))

        y = self.proj3(z)
        return y
Example #8
    def add_input(self, x, condition=None):
        """compute the output distribution (represented by its parameters) for a step. It works similarily with the `forward` method but in a `step-in-step-out` fashion.

        Args:
            x (Variable): shape(B, T=1), dtype float32, a step of the input waveform.
            condition (Variable, optional): shape(B, C_cond, T=1), dtype float32, a step of the upsampled condition. Defaults to None.

        Returns:
            Variable: shape(B, T=1, C_output), dtype float32, the parameter of the output distributions.
        """
        # Causal Conv
        if self.loss_type == "softmax":
            x = F.clip(x, min=-1., max=0.99999)
            x = quantize(x, self.output_dim)
            x = self.embed(x)  # (B, T, C), T=1
        else:
            x = F.unsqueeze(x, axes=[-1])  # (B, T, 1), T=1
            x = self.embed(x)  # (B, T, C)
        x = F.transpose(x, perm=[0, 2, 1])

        # Residual & Skip-connection & linears
        z = self.resnet.add_input(x, condition)
        z = F.transpose(z, [0, 2, 1])
        z = F.relu(self.proj2(F.relu(self.proj1(z))))  # (B, T, C)

        # Output
        y = self.proj3(z)
        return y
Example #9
def fuse_math_min_mean_neg(x):
    """
    Fuse operation min mean for hinge loss computation of negative samples
    """
    minval = L.clip(-x - 1, -1e8, 0)
    loss = - L.reduce_mean(minval)
    return loss
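Algebraically, clip(-x - 1, -1e8, 0) equals min(-x - 1, 0), so the function returns mean(relu(1 + x)), i.e. the hinge loss of a discriminator's outputs on fake (negative) samples. A small usage sketch, assuming `L` is `paddle.fluid.layers` and dygraph mode:

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as L

with fluid.dygraph.guard():
    d_fake = fluid.dygraph.to_variable(np.array([-2.0, 0.5, 1.5], dtype="float32"))
    loss = fuse_math_min_mean_neg(d_fake)
    print(loss.numpy())  # mean of [0.0, 1.5, 2.5] -> about 1.3333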
Example #10
    def compute_mask_loss(self, occ_mask, warped_image, tgt_image):
        """
        Compute losses on the generated occlusion mask.

        Args:
            occ_mask (tensor): Generated occlusion masks.
            warped_image (tensor): Warped image using the flow map.
            tgt_image (tensor): Target image for the warped image.
        Returns:
            (tensor): Loss for the mask.
        """
        loss_mask = dg.to_variable(np.zeros((1, )).astype("float32"))
        if occ_mask is not None:
            dummy0 = L.zeros_like(occ_mask)
            dummy1 = L.ones_like(occ_mask)

            # Compute the confidence map based on the L1 distance between the warped and GT image.
            img_diff = L.reduce_sum(L.abs(warped_image - tgt_image),
                                    1,
                                    keep_dim=True)

            conf = L.clip(1 - img_diff, 0, 1)

            # Force mask value to be small if warped image is similar to GT, and vice versa.
            loss_mask = self.criterionMasked(occ_mask, dummy0, conf)
            loss_mask += self.criterionMasked(occ_mask, dummy1, 1 - conf)

        return loss_mask
Example #11
    def sample_from_mog(self, y):
        """Sample from the output distribution where the output distribution is a mixture of Gaussians.
        Args:
            y (Variable): shape(B, T, C_output), dtype float32, the parameters of the output distribution. It is the concatenation of 3 parts, the logits of every distribution, the mean of each distribution and the log standard deviation of each distribution. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.

        Returns:
            Variable: shape(B, T), waveform sampled from the output distribution.
        """
        batch_size, time_steps, output_dim = y.shape
        n_mixture = output_dim // 3

        w, mu, log_std = F.split(y, 3, dim=-1)

        reshaped_w = F.reshape(w, (batch_size * time_steps, n_mixture))
        prob_ids = F.sampling_id(F.softmax(reshaped_w))
        prob_ids = F.reshape(prob_ids, (batch_size, time_steps))
        prob_ids = prob_ids.numpy()

        index = np.array([[[b, t, prob_ids[b, t]] for t in range(time_steps)]
                          for b in range(batch_size)]).astype("int32")
        index_var = dg.to_variable(index)

        mu_ = F.gather_nd(mu, index_var)
        log_std_ = F.gather_nd(log_std, index_var)

        dist = D.Normal(mu_, F.exp(log_std_))
        samples = dist.sample(shape=[])
        samples = F.clip(samples, min=-1., max=1.)
        return samples
Example #12
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        # clip by value first
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(
                    p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.clip(x=g,
                                   min=-self.clip_value,
                                   max=self.clip_value)
            params_and_grads.append((p, new_grad))
        params_grads = params_and_grads

        # clip by global norm
        params_and_grads = []
        sum_square_list = []
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(
                    p):
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)
            sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if len(sum_square_list) == 0:
            return params_grads

        global_norm_var = layers.concat(sum_square_list)
        global_norm_var = layers.reduce_sum(global_norm_var)
        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(shape=[1],
                                               dtype='float32',
                                               value=self.clip_norm)
        clip_var = layers.elementwise_div(x=max_global_norm,
                                          y=layers.elementwise_max(
                                              x=global_norm_var,
                                              y=max_global_norm))
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(
                    p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

        return params_and_grads
Example #13
    def forward(self, input):
        rho_ = L.clip(self.rho, min=0, max=1)
        in_mean = L.reduce_mean(input, dim=[2, 3], keep_dim=True)
        in_var = var(input, dim=[2, 3], keepdim=True)
        out_in = (input - in_mean) / L.sqrt(in_var + self.eps)
        ln_mean = L.reduce_mean(input, dim=[1, 2, 3], keep_dim=True)
        ln_var = var(input, dim=[1, 2, 3], keepdim=True)
        out_ln = (input - ln_mean) / L.sqrt(ln_var + self.eps)
        out = rho_ * out_in + (1 - rho_) * out_ln
        out = out * self.gamma + self.beta

        return out
Example #14
    def forward(self, input, gamma, beta):
        rho_ = L.clip(self.rho, min=0, max=1)
        in_mean = L.reduce_mean(input, dim=[2, 3], keep_dim=True)
        in_var = var(input, dim=[2, 3], keepdim=True)
        out_in = (input - in_mean) / L.sqrt(in_var + self.eps)
        ln_mean = L.reduce_mean(input, dim=[1, 2, 3], keep_dim=True)
        ln_var = var(input, dim=[1, 2, 3], keepdim=True)
        out_ln = (input - ln_mean) / L.sqrt(ln_var + self.eps)
        out = rho_ * out_in + (1 - rho_) * out_ln
        out = out * L.unsqueeze(gamma, axes=[2, 3]) + L.unsqueeze(beta,
                                                                  axes=[2, 3])

        return out
Example #15
    def forward(self):
        """Build the skipgram model.
        """
        initrange = 1.0 / self.config['embed_dim']
        embed_init = fluid.initializer.UniformInitializer(low=-initrange,
                                                          high=initrange)
        weight_init = fluid.initializer.TruncatedNormal(
            scale=1.0 / math.sqrt(self.config['embed_dim']))

        embed_src = fl.embedding(
            input=self.train_inputs,
            size=[self.num_nodes, self.config['embed_dim']],
            param_attr=fluid.ParamAttr(name='content', initializer=embed_init))

        weight_pos = fl.embedding(
            input=self.train_labels,
            size=[self.num_nodes, self.config['embed_dim']],
            param_attr=fluid.ParamAttr(name='weight', initializer=weight_init))

        weight_negs = fl.embedding(
            input=self.train_negs,
            size=[self.num_nodes, self.config['embed_dim']],
            param_attr=fluid.ParamAttr(name='weight', initializer=weight_init))

        pos_logits = fl.matmul(embed_src, weight_pos,
                               transpose_y=True)  # [batch_size, 1, 1]

        pos_score = fl.squeeze(pos_logits, axes=[1])
        pos_score = fl.clip(pos_score, min=-10, max=10)
        pos_score = -self.neg_num * fl.logsigmoid(pos_score)

        neg_logits = fl.matmul(embed_src, weight_negs,
                               transpose_y=True)  # [batch_size, 1, neg_num]
        neg_score = fl.squeeze(neg_logits, axes=[1])
        neg_score = fl.clip(neg_score, min=-10, max=10)
        neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score)
        neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True)

        self.loss = fl.reduce_mean(pos_score + neg_score) / self.neg_num / 2
Example #16
    def _dygraph_clip_by_value(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(
                    p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.clip(x=g,
                                   min=-self.clip_value,
                                   max=self.clip_value)
            params_and_grads.append((p, new_grad))
        return params_and_grads
Example #17
def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)  # [N]
    area2 = box_area(boxes2)  # [M]
    N, M = boxes1.shape[0], boxes2.shape[0]
    boxes1 = L.unsqueeze(boxes1, axes=[1])  # [N, 1, 4]
    boxes1 = L.expand(boxes1, [1, M, 1])  # [N, M, 4]
    boxes2 = L.unsqueeze(boxes2, axes=[0])  # [1, M, 4]
    boxes2 = L.expand(boxes2, [N, 1, 1])  # [N, M, 4]
    lt = L.elementwise_max(boxes1[:, :, :2], boxes2[:, :, :2])  # [N, M, 2]
    rb = L.elementwise_min(boxes1[:, :, 2:], boxes2[:, :, 2:])  # [N, M, 2]

    wh = L.clip(rb - lt, min=0, max=1e8)  # [N, M, 2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N, M]

    area1 = L.expand(L.unsqueeze(area1, [1]), [1, M])  # [N, M]
    area2 = L.expand(L.unsqueeze(area2, [0]), [N, 1])  # [N, M]
    union = area1 + area2 - inter

    iou = inter / union
    return iou, union
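A usage sketch (assumed, since the box_area helper used above is not shown in this listing); it defines a hypothetical box_area and runs box_iou on two small sets of [x0, y0, x1, y1] boxes in dygraph mode:

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as L

def box_area(boxes):
    # hypothetical helper: areas of [x0, y0, x1, y1] boxes, shape [N]
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

with fluid.dygraph.guard():
    boxes1 = fluid.dygraph.to_variable(
        np.array([[0., 0., 2., 2.], [1., 1., 3., 3.]], dtype="float32"))
    boxes2 = fluid.dygraph.to_variable(
        np.array([[1., 1., 2., 2.]], dtype="float32"))
    iou, union = box_iou(boxes1, boxes2)  # both have shape [N, M] = [2, 1]
    print(iou.numpy())                    # [[0.25], [0.25]]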
Example #18
    def compute_softmax_loss(self, y, t):
        """compute the loss where output distribution is a categorial distribution.

        Args:
            y (Variable): shape(B, T, C_output), dtype float32, the logits of the output distribution.
            t (Variable): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution, and output distributions whose inputs contain padding are excluded from the loss computation.

        Returns:
            Variable: shape(1, ), dtype float32, the loss.
        """
        # context size is not taken into account
        y = y[:, self.context_size:, :]
        t = t[:, self.context_size:]
        t = F.clip(t, min=-1.0, max=0.99999)
        quantized = quantize(t, n_bands=self.output_dim)
        label = F.unsqueeze(quantized, axes=[-1])

        loss = F.softmax_with_cross_entropy(y, label)
        reduced_loss = F.reduce_mean(loss)
        return reduced_loss
Example #19
def generate_relative_positions_matrix(length,
                                       max_relative_position,
                                       cache=False):
    if not cache:
        range_vec = layers.range(0, length, 1, 'int32')
        range_vec.stop_gradient = True
        shapes = layers.shape(range_vec)
        range_vec = layers.reshape(range_vec, shape=[1, shapes[0]])
        range_mat = layers.expand(range_vec, [shapes[0], 1])
        distance_mat = range_mat - layers.transpose(range_mat, [1, 0])
    else:
        distance_mat = layers.range(-1 * length + 1, 1, 1, 'int32')
        distance_mat.stop_gradient = True
        shapes = layers.shape(distance_mat)
        distance_mat = layers.reshape(distance_mat, [1, shapes[0]])

    distance_mat_clipped = layers.clip(
        layers.cast(distance_mat, dtype="float32"),
        float(-max_relative_position), float(max_relative_position))
    final_mat = layers.cast(distance_mat_clipped,
                            dtype='int32') + max_relative_position
    return final_mat
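A static-graph usage sketch (assumed, following the paddle.fluid 1.x program/executor workflow): for length=4 and max_relative_position=2, entry (i, j) of the result is clip(j - i, -2, 2) + 2, so the first row is [2, 3, 4, 4].

import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    rel_pos = generate_relative_positions_matrix(4, max_relative_position=2)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)
out, = exe.run(main_prog, fetch_list=[rel_pos])
print(out)  # 4 x 4 int32 matrix of bucketed relative positions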
Example #20
    def compute_mog_loss(self, y, t):
        """compute the loss where output distribution is a mixture of Gaussians.

        Args:
            y (Variable): shape(B, T, C_output), dtype float32, the parameters of the output distribution. It is the concatenation of 3 parts, the logits of every distribution, the mean of each distribution and the log standard deviation of each distribution. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.
            t (Variable): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution, and output distributions whose inputs contain padding are excluded from the loss computation.

        Returns:
            Variable: shape(1, ), dtype float32, the loss.
        """
        n_mixture = self.output_dim // 3

        # context size is not taken into account
        y = y[:, self.context_size:, :]
        t = t[:, self.context_size:]

        w, mu, log_std = F.split(y, 3, dim=2)
        # 100.0 is just a large float
        log_std = F.clip(log_std, min=self.log_scale_min, max=100.)
        inv_std = F.exp(-log_std)
        p_mixture = F.softmax(w, axis=-1)

        t = F.unsqueeze(t, axes=[-1])
        if n_mixture > 1:
            # t = F.expand_as(t, log_std)
            t = F.expand(t, [1, 1, n_mixture])

        x_std = inv_std * (t - mu)
        exponent = F.exp(-0.5 * x_std * x_std)
        pdf_x = 1.0 / math.sqrt(2.0 * math.pi) * inv_std * exponent

        pdf_x = p_mixture * pdf_x
        # pdf_x: [bs, len]
        pdf_x = F.reduce_sum(pdf_x, dim=-1)
        per_sample_loss = -F.log(pdf_x + 1e-9)

        loss = F.reduce_mean(per_sample_loss)
        return loss
Example #21
def clip_rho(net, vmin=0, vmax=1):

    for name, param in net.named_parameters():
        if 'rho' in name:
            param.set_value(clip(param, vmin, vmax))
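A small dygraph sketch (assumed; the layer and its initial value are made up for illustration), showing how clip_rho pulls every parameter named `rho` back into [vmin, vmax]:

import paddle.fluid as fluid
from paddle.fluid.layers import clip  # clip_rho above assumes this is in scope as `clip`

class RhoLayer(fluid.dygraph.Layer):
    # hypothetical layer holding a single `rho` parameter, initialized outside [0, 1]
    def __init__(self):
        super(RhoLayer, self).__init__()
        self.rho = self.create_parameter(
            shape=[1], dtype='float32',
            default_initializer=fluid.initializer.Constant(3.0))

with fluid.dygraph.guard():
    net = RhoLayer()
    clip_rho(net)            # clips rho into the default range [0, 1]
    print(net.rho.numpy())   # -> [1.]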
Example #22
    def __call__(self, net):
        for name, param in net.named_parameters():
            if 'rho' in name:
                param.set_value(clip(param, self.vmin, self.vmax))
Example #23
    def forward(self):
        """Build the GATNE net.
        """
        param_attr_init = fluid.initializer.Uniform(
            low=-1.0, high=1.0, seed=np.random.randint(100))
        embed_param_attrs = fluid.ParamAttr(name='Base_node_embed',
                                            initializer=param_attr_init)

        # node_embeddings
        base_node_embed = fl.embedding(
            input=fl.reshape(self.train_inputs, shape=[-1, 1]),
            size=[self.num_nodes, self.embedding_size],
            param_attr=embed_param_attrs)

        node_features = []
        for edge_type in self.edge_types:
            param_attr_init = fluid.initializer.Uniform(
                low=-1.0, high=1.0, seed=np.random.randint(100))
            embed_param_attrs = fluid.ParamAttr(name='%s_node_embed' %
                                                edge_type,
                                                initializer=param_attr_init)

            features = fl.embedding(
                input=self.gw[edge_type].node_feat['index'],
                size=[self.num_nodes, self.embedding_u_size],
                param_attr=embed_param_attrs)

            node_features.append(features)

        # mp_output: list of embedding(self.num_nodes, dim)
        mp_output = self.message_passing(self.gw, self.edge_types,
                                         node_features)

        # U : (num_type[m], num_nodes, dim[s])
        node_type_embed = fl.stack(mp_output, axis=0)

        # U : (num_nodes, num_type[m], dim[s])
        node_type_embed = fl.transpose(node_type_embed, perm=[1, 0, 2])

        #gather node_type_embed from train_inputs
        node_type_embed = fl.gather(node_type_embed, self.train_inputs)

        # M_r
        trans_weights = fl.create_parameter(
            shape=[
                self.edge_type_count, self.embedding_u_size,
                self.embedding_size // self.att_head
            ],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w')

        # W_r
        trans_weights_s1 = fl.create_parameter(
            shape=[self.edge_type_count, self.embedding_u_size, self.dim_a],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w_s1')

        # w_r
        trans_weights_s2 = fl.create_parameter(
            shape=[self.edge_type_count, self.dim_a, self.att_head],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w_s2')

        trans_w = fl.gather(trans_weights, self.train_types)
        trans_w_s1 = fl.gather(trans_weights_s1, self.train_types)
        trans_w_s2 = fl.gather(trans_weights_s2, self.train_types)

        attention = self.attention(node_type_embed, trans_w_s1, trans_w_s2)
        node_type_embed = fl.matmul(attention, node_type_embed)
        node_embed = base_node_embed + fl.reshape(
            fl.matmul(node_type_embed, trans_w), [-1, self.embedding_size])

        self.last_node_embed = fl.l2_normalize(node_embed, axis=1)

        nce_weight_initializer = fluid.initializer.TruncatedNormalInitializer(
            loc=0.0, scale=1.0 / math.sqrt(self.embedding_size))
        nce_weight_attrs = fluid.ParamAttr(name='nce_weight',
                                           initializer=nce_weight_initializer)

        weight_pos = fl.embedding(input=self.train_labels,
                                  size=[self.num_nodes, self.embedding_size],
                                  param_attr=nce_weight_attrs)
        weight_neg = fl.embedding(input=self.train_negs,
                                  size=[self.num_nodes, self.embedding_size],
                                  param_attr=nce_weight_attrs)
        tmp_node_embed = fl.unsqueeze(self.last_node_embed, axes=[1])
        pos_logits = fl.matmul(tmp_node_embed, weight_pos,
                               transpose_y=True)  # [B, 1, 1]

        neg_logits = fl.matmul(tmp_node_embed, weight_neg,
                               transpose_y=True)  # [B, 1, neg_num]

        pos_score = fl.squeeze(pos_logits, axes=[1])
        pos_score = fl.clip(pos_score, min=-10, max=10)
        pos_score = -1.0 * fl.logsigmoid(pos_score)

        neg_score = fl.squeeze(neg_logits, axes=[1])
        neg_score = fl.clip(neg_score, min=-10, max=10)
        neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score)

        neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True)
        self.loss = fl.reduce_mean(pos_score + neg_score)
Example #24
    def _get_bboxes_single(self,
                           cls_scores,
                           bbox_preds,
                           mlvl_points,
                           img_shape,
                           scale_factor,
                           rescale=False,
                           with_nms=True):
        # each element of mlvl_points has shape [rows * cols, 3], i.e. (cell top-left x, cell top-left y, cell side length)
        nms_cfg = self.nms_cfg
        assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
        mlvl_bboxes = []
        mlvl_scores = []
        # iterate over every FPN output level
        for i_lvl, (cls_score, bbox_pred, points) in enumerate(
                zip(cls_scores, bbox_preds, mlvl_points)):
            # cls_score.shape = [80, h, w]
            # bbox_pred.shape = [ 4, h, w]
            # points.shape    = [h*w, 3]   i.e. (cell top-left x, cell top-left y, cell side length)
            cls_score = L.transpose(cls_score, [1, 2, 0])              # [h, w, 80]
            cls_score = L.reshape(cls_score, (-1, self.num_classes))   # [h*w, 80]
            if self.use_sigmoid_cls:
                scores = L.sigmoid(cls_score)   # [h*w, 80]
            else:
                scores = L.softmax(cls_score)
            bbox_pred = L.transpose(bbox_pred, [1, 2, 0])   # [h, w, 4]
            bbox_pred = L.reshape(bbox_pred, (-1, 4))       # [h*w, 4]
            nms_top_k = nms_cfg.get('nms_top_k', -1)
            if nms_top_k > 0 and scores.shape[0] > nms_top_k:
                if self.use_sigmoid_cls:
                    max_scores = L.reduce_max(scores, dim=1)
                else:
                    # remind that we set FG labels to [0, num_class-1]
                    # since mmdet v2.0
                    # BG cat_id: num_class
                    # max_scores, _ = scores[:, :-1].max(dim=1)
                    pass
                _, topk_inds = L.topk(max_scores, k=nms_top_k)
                scores = L.gather(scores, topk_inds)  # [M, 80]
                points = L.gather(points, topk_inds)  # [M, 3]   cell xy coordinates and side length
                bbox_pred = L.gather(bbox_pred, topk_inds)  # [M, 4]

            # [M, 4]  cell top-left xy coordinates, repeated twice
            bbox_pos_center = L.concat([points[:, :2], points[:, :2]], axis=1)

            # [M, 4]  final predicted boxes (x1y1x2y2 format) = bbox_pred * cell side length + cell top-left coordinates
            bboxes = bbox_pred * self.fpn_stride[i_lvl] + bbox_pos_center

            x1 = L.clip(bboxes[:, 0], 0.0, img_shape[1])
            y1 = L.clip(bboxes[:, 1], 0.0, img_shape[0])
            x2 = L.clip(bboxes[:, 2], 0.0, img_shape[1])
            y2 = L.clip(bboxes[:, 3], 0.0, img_shape[0])
            bboxes = paddle.stack([x1, y1, x2, y2], axis=-1)  # [M, 4]
            mlvl_bboxes.append(bboxes)
            mlvl_scores.append(scores)
        mlvl_scores = L.concat(mlvl_scores, axis=0)  # [M2, 80]  scores from all FPN levels concatenated
        mlvl_bboxes = L.concat(mlvl_bboxes, axis=0)  # [M2, 4]   boxes (x1y1x2y2 format) from all FPN levels concatenated
        if rescale:
            scale_factor_ = paddle.to_tensor(scale_factor)
            mlvl_bboxes /= scale_factor_  # [M2, 4]   predicted boxes (x1y1x2y2 format)

        pred_scores = L.unsqueeze(mlvl_scores, axes=0)  # [1, M2, 80]
        pred_boxes = L.unsqueeze(mlvl_bboxes, axes=0)   # [1, M2,  4], final coordinates
        pred_scores = L.transpose(pred_scores, perm=[0, 2, 1])  # [1, 80, M2], final scores

        # nms
        pred = None
        i = 0
        nms_cfg = copy.deepcopy(self.nms_cfg)
        nms_type = nms_cfg.pop('nms_type')
        if nms_type == 'matrix_nms':
            pred = fluid.layers.matrix_nms(pred_boxes[i:i+1, :, :], pred_scores[i:i+1, :, :], background_label=-1, **nms_cfg)
        elif nms_type == 'multiclass_nms':
            pred = fluid.layers.multiclass_nms(pred_boxes[i:i+1, :, :], pred_scores[i:i+1, :, :], background_label=-1, **nms_cfg)
        return pred