def forward(self, z, condition=None):
    """Transform random noise sampled from a standard Gaussian distribution
    into a sample from the target distribution, and output the mean and log
    standard deviation of the output distribution.

    Args:
        z (Variable): shape(B, T), random noise sampled from a standard
            Gaussian distribution.
        condition (Variable, optional): shape(B, F, T), dtype float, the
            upsampled condition. Defaults to None.

    Returns:
        (z, out_mu, out_log_std)
        z (Variable): shape(B, T), dtype float, transformed noise, it is the
            synthesized waveform.
        out_mu (Variable): shape(B, T), dtype float, means of the output
            distributions.
        out_log_std (Variable): shape(B, T), dtype float, log standard
            deviations of the output distributions.
    """
    for i, flow in enumerate(self.flows):
        theta = flow(z, condition)  # w, mu, log_std [0: T]
        w, mu, log_std = F.split(theta, 3, dim=-1)  # (B, T, 1) for each
        mu = F.squeeze(mu, [-1])  # [0: T]
        log_std = F.squeeze(log_std, [-1])  # [0: T]
        z = z * F.exp(log_std) + mu  # [0: T]

        if i == 0:
            out_mu = mu
            out_log_std = log_std
        else:
            out_mu = out_mu * F.exp(log_std) + mu
            out_log_std += log_std

    return z, out_mu, out_log_std
def _matrix_nms(bboxes, cate_labels, cate_scores, kernel='gaussian', sigma=2.0):
    """Matrix NMS for multi-class bboxes.

    Args:
        bboxes (Tensor): shape (n, 4)
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str): 'linear' or 'gaussian'
        sigma (float): std in gaussian method

    Returns:
        Tensor: cate_scores_update, tensors of shape (n)
    """
    n_samples = len(cate_labels)
    if n_samples == 0:
        return []

    # Compute an n×n IoU matrix: pairwise IoU between the boxes and themselves.
    iou_matrix = jaccard(bboxes, bboxes)  # shape: [n_samples, n_samples]
    iou_matrix = paddle.triu(iou_matrix, diagonal=1)  # keep only the upper triangle

    # label_specific matrix.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])  # shape: [n_samples, n_samples]
    # Entry (i, j) indicates whether prediction i and prediction j share the
    # same class id; only same-class predictions suppress each other.
    d = cate_labels_x - L.transpose(cate_labels_x, [1, 0])
    d = L.pow(d, 2)  # 0 where classes match, > 0 otherwise. Comparing with == 0 is unreliable (as in tf), so use < 1.
    label_matrix = paddle.triu(L.cast(d < 1, 'float32'), diagonal=1)  # shape: [n_samples, n_samples]

    # IoU compensation
    # Zero out IoU between different classes, keep same-class IoU, then take the column-wise max.
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, [0, ])  # shape: [n_samples, ]
    # After broadcasting and transposing, row i of compensate_iou holds, for
    # object i, its maximum IoU with any higher-scoring object of the same class.
    compensate_iou = L.transpose(L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1]), [1, 0])  # shape: [n_samples, n_samples]

    # IoU decay
    # Zero out IoU between different classes, keep same-class IoU.
    # Entry (i, j) of decay_iou is the IoU between prediction i and prediction j
    # if they share a class, else 0; only the upper triangle is kept.
    decay_iou = iou_matrix * label_matrix  # shape: [n_samples, n_samples]

    # matrix nms
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        # column-wise min, matching the linear branch and the mask version
        decay_coefficient = L.reduce_min(decay_matrix / compensate_matrix, [0, ])
    elif kernel == 'linear':
        # Look at column j (example from 1_test_matrixnms.py, column 2):
        # Column 2 of decay_iou is [0.9389, 0.9979, 0, 0]: object 2's IoU with
        # the two higher-scoring same-class objects is 0.9389 and 0.9979.
        # Column 2 of compensate_iou is [0, 0.9409, 0.9979, 0]: for those
        # higher-scoring objects, their own max IoU with a yet higher-scoring
        # same-class object is 0 and 0.9409.
        # Column 2 of decay_matrix is [0.0610, 0.0348, 485.28, 1]; its minimum
        # is 0.0348 (object 1 suppresses object 2). The last two values never
        # matter, because they are always >= 1.
        # In short: if entry (i, j) of decay_matrix is the column minimum,
        # object i is the one suppressing object j, and the larger decay_iou
        # is, the smaller decay_matrix gets.
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, [0, ])
    else:
        raise NotImplementedError

    # update the scores
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
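# Illustrative sketch (not part of the original code): the Matrix NMS decay
# rule from _matrix_nms, reproduced in plain NumPy on a toy three-box example
# so the column-wise min/ratio logic can be checked by hand. Assumes the boxes
# are already sorted by descending score and share one class.
import numpy as np

def matrix_nms_decay_sketch(iou, sigma=2.0, kernel='gaussian'):
    n = iou.shape[0]
    iou = np.triu(iou, k=1)  # keep the upper triangle: i < j
    # c[i, j] = max IoU of box i with any higher-scoring box
    compensate = np.tile(iou.max(axis=0)[:, None], (1, n))
    if kernel == 'gaussian':
        decay = np.exp(-sigma * iou ** 2) / np.exp(-sigma * compensate ** 2)
    else:  # linear
        decay = (1 - iou) / (1 - compensate)
    return decay.min(axis=0)  # per-box decay coefficient

toy_iou = np.array([[1.0, 0.9, 0.1],
                    [0.9, 1.0, 0.2],
                    [0.1, 0.2, 1.0]])
print(matrix_nms_decay_sketch(toy_iou))  # ~[1.0, 0.198, 0.980]: box 1 is strongly decayed by box 0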
def forward(self, audio, mel, audio_start, clip_kl=True):
    """Compute loss of Clarinet model.

    Args:
        audio (Variable): shape(B, T_audio), dtype float32, ground truth waveform.
        mel (Variable): shape(B, F, T_mel), dtype float32, condition (mel spectrogram here).
        audio_start (Variable): shape(B, ), dtype int64, audio start positions.
        clip_kl (bool, optional): whether to clip the KL divergence into
            [-100, 10]. Defaults to True.

    Returns:
        Dict(str, Variable)
        loss (Variable): shape(1, ), dtype float32, total loss.
        kl (Variable): shape(1, ), dtype float32, kl divergence between the
            teacher's output distribution and student's output distribution.
        regularization (Variable): shape(1, ), dtype float32, a regularization
            term of the KL divergence.
        spectrogram_frame_loss (Variable): shape(1, ), dtype float32, stft
            loss, the mean squared error between the magnitudes of the
            spectrograms of the ground truth waveform and the synthesized
            waveform.
    """
    batch_size, audio_length = audio.shape  # audio clip's length

    z = F.gaussian_random(audio.shape)
    condition = self.encoder(mel)  # (B, C, T)
    condition_slice = crop(condition, audio_start, audio_length)

    x, s_means, s_scales = self.student(z, condition_slice)  # all [0: T]
    s_means = s_means[:, 1:]  # (B, T-1), time steps [1: T]
    s_scales = s_scales[:, 1:]  # (B, T-1), time steps [1: T]
    s_clipped_scales = F.clip(s_scales, self.min_log_scale, 100.)

    # teacher outputs a single gaussian
    y = self.teacher(x[:, :-1], condition_slice[:, :, 1:])
    _, t_means, t_scales = F.split(y, 3, -1)  # time steps [1: T]
    t_means = F.squeeze(t_means, [-1])  # (B, T-1), time steps [1: T]
    t_scales = F.squeeze(t_scales, [-1])  # (B, T-1), time steps [1: T]
    t_clipped_scales = F.clip(t_scales, self.min_log_scale, 100.)

    s_distribution = D.Normal(s_means, F.exp(s_clipped_scales))
    t_distribution = D.Normal(t_means, F.exp(t_clipped_scales))

    # The KL divergence between two Gaussians has a closed form, so no Monte
    # Carlo sampling is needed.
    kl = s_distribution.kl_divergence(t_distribution)
    if clip_kl:
        kl = F.clip(kl, -100., 10.)
    # context size dropped
    kl = F.reduce_mean(kl[:, self.teacher.context_size:])
    # major diff here
    regularization = F.mse_loss(t_scales[:, self.teacher.context_size:],
                                s_scales[:, self.teacher.context_size:])

    # introduce information from the real target
    spectrogram_frame_loss = F.mse_loss(self.stft.magnitude(audio),
                                        self.stft.magnitude(x))
    loss = kl + self.lmd * regularization + spectrogram_frame_loss
    loss_dict = {
        "loss": loss,
        "kl_divergence": kl,
        "regularization": regularization,
        "stft_loss": spectrogram_frame_loss
    }
    return loss_dict
def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None):
    """Matrix NMS for multi-class masks.

    Args:
        seg_masks (Tensor): shape (n, h, w), binary masks of 0s and 1s
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str): 'linear' or 'gaussian'
        sigma (float): std in gaussian method
        sum_masks (Tensor): shape (n, ), areas of the n objects

    Returns:
        Tensor: cate_scores_update, tensors of shape (n)
    """
    n_samples = L.shape(cate_labels)[0]  # number of objects
    seg_masks = L.reshape(seg_masks, (n_samples, -1))  # [n, h*w]
    # inter.
    inter_matrix = L.matmul(seg_masks, seg_masks, transpose_y=True)  # [n, n] matmul with its own transpose: pairwise intersection areas
    # union.
    sum_masks_x = L.expand(L.reshape(sum_masks, (1, -1)), [n_samples, 1])  # [n, n] sum_masks repeated over n rows
    # iou.
    iou_matrix = inter_matrix / (sum_masks_x + L.transpose(sum_masks_x, [1, 0]) - inter_matrix)
    rows = L.range(0, n_samples, 1, 'int32')
    cols = L.range(0, n_samples, 1, 'int32')
    rows = L.expand(L.reshape(rows, (1, -1)), [n_samples, 1])
    cols = L.expand(L.reshape(cols, (-1, 1)), [1, n_samples])
    tri_mask = L.cast(rows > cols, 'float32')
    iou_matrix = tri_mask * iou_matrix  # [n, n] keep only the upper triangle
    # label_specific matrix.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])  # [n, n] cate_labels repeated over n rows
    label_matrix = L.cast(L.equal(cate_labels_x, L.transpose(cate_labels_x, [1, 0])), 'float32')
    label_matrix = tri_mask * label_matrix  # [n, n] keep only the upper triangle

    # IoU compensation
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, dim=0)
    compensate_iou = L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1])  # [n, n]
    compensate_iou = L.transpose(compensate_iou, [1, 0])  # [n, n]

    # IoU decay
    decay_iou = iou_matrix * label_matrix

    # matrix nms
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_min((decay_matrix / compensate_matrix), dim=0)
    elif kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, dim=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
def points2bbox(self, pts, y_first=True):
    """Convert point sets to bounding boxes.

    :param pts: the input points sets (fields), each points set (fields) is
        represented as 2n scalars.
    :param y_first: if y_first=True, the point set is represented as
        [y1, x1, y2, x2 ... yn, xn], otherwise the point set is represented
        as [x1, y1, x2, y2 ... xn, yn].
    :return: each points set is converted to a bbox [x1, y1, x2, y2].
    """
    pts_reshape = L.reshape(pts, (pts.shape[0], -1, 2, pts.shape[2], pts.shape[3]))
    pts_y = pts_reshape[:, :, 0, :, :] if y_first else pts_reshape[:, :, 1, :, :]
    pts_x = pts_reshape[:, :, 1, :, :] if y_first else pts_reshape[:, :, 0, :, :]
    if self.transform_method == 'minmax':
        # bbox_left = pts_x.min(dim=1, keepdim=True)[0]
        # bbox_right = pts_x.max(dim=1, keepdim=True)[0]
        # bbox_up = pts_y.min(dim=1, keepdim=True)[0]
        # bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
        # bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
        #                  dim=1)
        pass
    elif self.transform_method == 'partial_minmax':
        # pts_y = pts_y[:, :4, ...]
        # pts_x = pts_x[:, :4, ...]
        # bbox_left = pts_x.min(dim=1, keepdim=True)[0]
        # bbox_right = pts_x.max(dim=1, keepdim=True)[0]
        # bbox_up = pts_y.min(dim=1, keepdim=True)[0]
        # bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
        # bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
        #                  dim=1)
        pass
    elif self.transform_method == 'moment':
        pts_y_mean = L.reduce_mean(pts_y, dim=1, keep_dim=True)
        pts_x_mean = L.reduce_mean(pts_x, dim=1, keep_dim=True)
        pts_y_std = paddle.std(pts_y - pts_y_mean, axis=1, keepdim=True)
        pts_x_std = paddle.std(pts_x - pts_x_mean, axis=1, keepdim=True)
        moment_transfer = (self.moment_transfer * self.moment_mul) + (
            self.moment_transfer.detach() * (1 - self.moment_mul))
        moment_width_transfer = moment_transfer[0]
        moment_height_transfer = moment_transfer[1]
        half_width = pts_x_std * L.exp(moment_width_transfer)
        half_height = pts_y_std * L.exp(moment_height_transfer)
        bbox = L.concat([
            pts_x_mean - half_width, pts_y_mean - half_height,
            pts_x_mean + half_width, pts_y_mean + half_height
        ], axis=1)
    else:
        raise NotImplementedError
    return bbox
def test_exp(self):
    program = Program()
    with program_guard(program):
        input = layers.data(name="input", shape=[16], dtype="float32")
        out = layers.exp(input, name='exp')
        self.assertIsNotNone(out)
    print(str(program))
def sample_from_mog(self, y):
    """Sample from the output distribution where the output distribution is
    a mixture of Gaussians.

    Args:
        y (Variable): shape(B, T, C_output), dtype float32, the parameters of
            the output distribution. It is the concatenation of 3 parts: the
            logits of every distribution, the mean of each distribution and
            the log standard deviation of each distribution. Each part's
            shape is (B, T, n_mixture), where `n_mixture` means the number of
            Gaussians in the mixture.

    Returns:
        Variable: shape(B, T), waveform sampled from the output distribution.
    """
    batch_size, time_steps, output_dim = y.shape
    n_mixture = output_dim // 3

    w, mu, log_std = F.split(y, 3, dim=-1)

    reshaped_w = F.reshape(w, (batch_size * time_steps, n_mixture))
    prob_ids = F.sampling_id(F.softmax(reshaped_w))
    prob_ids = F.reshape(prob_ids, (batch_size, time_steps))
    prob_ids = prob_ids.numpy()

    index = np.array([[[b, t, prob_ids[b, t]] for t in range(time_steps)]
                      for b in range(batch_size)]).astype("int32")
    index_var = dg.to_variable(index)

    mu_ = F.gather_nd(mu, index_var)
    log_std_ = F.gather_nd(log_std, index_var)

    dist = D.Normal(mu_, F.exp(log_std_))
    samples = dist.sample(shape=[])
    samples = F.clip(samples, min=-1., max=1.)
    return samples
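# Hedged NumPy sketch of what sample_from_mog does at a single (b, t)
# position: pick a mixture component from softmax(logits), then draw from
# that component's Normal(mu, exp(log_std)). Names and numbers here are
# illustrative only.
import numpy as np

def sample_mog_sketch(w, mu, log_std, rng=None):
    rng = rng or np.random.default_rng(0)
    probs = np.exp(w - w.max())
    probs /= probs.sum()                          # softmax over the logits
    k = rng.choice(len(w), p=probs)               # pick a mixture component
    return rng.normal(mu[k], np.exp(log_std[k]))  # sample from that Gaussian

print(sample_mog_sketch(np.array([0., 2., 0.]),      # logits
                        np.array([-1., 0., 1.]),     # means
                        np.array([-2., -2., -2.])))  # log stds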
def decode(conv_output, anchors, stride, num_class, conf_thresh):
    conv_shape = P.shape(conv_output)
    batch_size = conv_shape[0]
    n_grid = conv_shape[1]
    anchor_per_scale = len(anchors)
    conv_output = P.reshape(
        conv_output,
        (batch_size, n_grid, n_grid, anchor_per_scale, 5 + num_class))
    conv_raw_dxdy = conv_output[:, :, :, :, 0:2]
    conv_raw_dwdh = conv_output[:, :, :, :, 2:4]
    conv_raw_conf = conv_output[:, :, :, :, 4:5]
    conv_raw_prob = conv_output[:, :, :, :, 5:]
    rows = P.range(0, n_grid, 1, 'float32')
    cols = P.range(0, n_grid, 1, 'float32')
    rows = P.expand(P.reshape(rows, (1, -1, 1)), [n_grid, 1, 1])
    cols = P.expand(P.reshape(cols, (-1, 1, 1)), [1, n_grid, 1])
    offset = P.concat([rows, cols], axis=-1)
    offset = P.reshape(offset, (1, n_grid, n_grid, 1, 2))
    offset = P.expand(offset, [batch_size, 1, 1, anchor_per_scale, 1])

    pred_xy = (P.sigmoid(conv_raw_dxdy) + offset) * stride
    # P.assign expects a numpy array (or Variable), so convert the anchor list first.
    pred_wh = P.exp(conv_raw_dwdh) * P.assign(np.array(anchors).astype(np.float32))
    pred_xywh = P.concat([pred_xy, pred_wh], axis=-1)
    pred_conf = P.sigmoid(conv_raw_conf)
    pred_prob = P.sigmoid(conv_raw_prob)

    pred_xywh = P.reshape(pred_xywh, (batch_size, -1, 4))  # [-1, -1, 4]
    pred_conf = P.reshape(pred_conf, (batch_size, -1, 1))  # [-1, -1, 1]
    pred_prob = P.reshape(pred_prob, (batch_size, -1, num_class))  # [-1, -1, 80]
    return pred_xywh, pred_conf, pred_prob
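# Hedged sketch (standard YOLOv3 decoding, matching decode above) for a single
# grid cell in NumPy: sigmoid offsets place the center inside the cell and
# exp scales the anchor. All names and numbers below are illustrative.
import numpy as np

def decode_one_cell_sketch(tx, ty, tw, th, cell_x, cell_y, anchor_w, anchor_h, stride):
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    cx = (sigmoid(tx) + cell_x) * stride  # box center in input-image pixels
    cy = (sigmoid(ty) + cell_y) * stride
    w = np.exp(tw) * anchor_w             # anchors given in input-image pixels
    h = np.exp(th) * anchor_h
    return cx, cy, w, h

print(decode_one_cell_sketch(0., 0., 0., 0., cell_x=6, cell_y=6,
                             anchor_w=116, anchor_h=90, stride=32))
# -> (208.0, 208.0, 116.0, 90.0): center of cell (6, 6), size equal to the anchor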
def get_embedding(self, num_embeddings, embedding_dim, padding_idx=None):
    """Build sinusoidal embeddings.

    This matches the implementation in tensor2tensor, but differs slightly
    from the description in Section 3.5 of "Attention Is All You Need".
    """
    half_dim = embedding_dim // 2
    # plain Python float here; layers.log expects a Variable
    emb = math.log(10000.) / (half_dim - 1)
    emb = layers.exp(layers.arange(start=0, end=half_dim, dtype='float32') * -emb)

    # [num_embeddings, embedding_dim // 2]
    emb = layers.unsqueeze(layers.arange(-num_embeddings // 2, num_embeddings // 2, dtype='float32'), axis=1) * \
        layers.unsqueeze(emb, axis=0)
    emb = layers.concat([layers.sin(emb), layers.cos(emb)], dim=1)  # [num_embeddings, embedding_dim]
    if embedding_dim % 2 == 1:
        # zero-pad odd dimensions
        emb = layers.concat([emb, layers.zeros(shape=(num_embeddings, 1))], dim=1)
    if padding_idx is not None:
        emb[padding_idx, :] = 0
    self.origin_shift = num_embeddings // 2
    return emb
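# Hedged NumPy sketch of the standard (absolute-position) sinusoid table, for
# comparison with the relative-position variant above; this follows the usual
# tensor2tensor-style construction.
import numpy as np

def sinusoid_table_sketch(num_embeddings, embedding_dim):
    half_dim = embedding_dim // 2
    freq = np.exp(np.arange(half_dim, dtype='float32') * -(np.log(10000.0) / (half_dim - 1)))
    angles = np.arange(num_embeddings, dtype='float32')[:, None] * freq[None, :]
    emb = np.concatenate([np.sin(angles), np.cos(angles)], axis=1)
    if embedding_dim % 2 == 1:  # pad odd dims with a zero column
        emb = np.concatenate([emb, np.zeros((num_embeddings, 1), 'float32')], axis=1)
    return emb

print(sinusoid_table_sketch(4, 6).shape)  # (4, 6)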
def sequence_softmax(x, beta=None):
    """Compute sequence softmax over a paddle LodTensor.

    This function computes softmax normalization within each sequence. It is
    an extension of :code:`L.sequence_softmax`, which can only deal with a
    LodTensor whose last dimension is 1.

    Args:
        x: The input variable which is a LodTensor.
        beta: Inverse Temperature

    Return:
        Output of sequence_softmax
    """
    if beta is not None:
        x = x * beta

    x_max = L.sequence_pool(x, "max")
    x_max = L.sequence_expand_as(x_max, x)
    x = x - x_max  # subtract the per-sequence max for numerical stability
    exp_x = L.exp(x)
    sum_exp_x = L.sequence_pool(exp_x, "sum")
    sum_exp_x = L.sequence_expand_as(sum_exp_x, exp_x)
    return exp_x / sum_exp_x
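# Hedged NumPy sketch of what sequence_softmax computes: a softmax taken
# independently within each sequence of a LoD batch. `lod` here stands for
# the level-0 offset list, e.g. [0, 3, 5] for two sequences of lengths 3 and 2.
import numpy as np

def sequence_softmax_sketch(x, lod, beta=None):
    if beta is not None:
        x = x * beta
    out = np.empty_like(x)
    for start, end in zip(lod[:-1], lod[1:]):
        seg = x[start:end] - x[start:end].max()  # per-sequence max, for stability
        e = np.exp(seg)
        out[start:end] = e / e.sum()
    return out

print(sequence_softmax_sketch(np.array([1., 2., 3., 1., 1.]), [0, 3, 5]))
# the first three entries sum to 1, the last two sum to 1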
def _sampling(self, z_mean, z_log_var):
    """reparameterization trick"""
    # by default, random_normal has mean=0 and std=1.0
    epsilon = layers.gaussian_random_batch_size_like(
        self.tar, shape=[-1, self.latent_size])
    epsilon.stop_gradient = True
    return z_mean + layers.exp(0.5 * z_log_var) * epsilon
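# Hedged sketch of the reparameterization trick used above: a sample from
# N(mu, sigma^2) is rewritten as mu + sigma * eps with eps ~ N(0, 1), so
# gradients flow through mu and log-variance while eps stays stochastic.
import numpy as np

def sampling_sketch(z_mean, z_log_var, seed=0):
    rng = np.random.default_rng(seed)
    epsilon = rng.standard_normal(z_mean.shape)        # eps ~ N(0, 1), no gradient
    return z_mean + np.exp(0.5 * z_log_var) * epsilon  # exp(0.5 * logvar) == sigma

print(sampling_sketch(np.zeros(4), np.log(np.full(4, 4.0))))  # draws from N(0, 2^2)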
def R2Penalty(fake_img, f):
    # gradient penalty
    fakes = fake_img
    fakes.stop_gradient = False
    fake_logit = f(fakes)  # was f(fake), an undefined name

    apply_loss_scaling = lambda x: x * layers.exp(x * np.log(2.0))
    undo_loss_scaling = lambda x: x * layers.exp(-x * np.log(2.0))

    fake_logit = apply_loss_scaling(layers.sum(fake_logit))
    # grads = dygraph.grad(fake_logit, fakes, create_graph=True)
    grads = dygraph.grad(fake_logit, fakes, create_graph=False)
    fake_grads = layers.reshape(grads[0], (fakes.shape[0], -1))
    fake_grads = undo_loss_scaling(fake_grads)
    r2_penalty = layers.reduce_sum(
        layers.elementwise_mul(fake_grads, fake_grads))
    return r2_penalty
def chunk_softmax(logits, labels, topk=10):
    after_exp = L.exp(logits)
    # argsort returns (sorted values, indices) in ascending order, so the
    # last `topk` columns are the `topk` largest exp-logits.
    out, _ = L.argsort(after_exp, axis=-1)
    # normalize by the sum of the top-k exp-logits only
    denorm = L.reduce_sum(out[:, -topk:], dim=-1, keep_dim=True)
    probs = after_exp / denorm

    one_hot = F.one_hot(labels, depth=probs.shape[-1])
    loss = -L.reduce_sum(one_hot * L.log(probs)) / logits.shape[0]
    return loss
def R1Penalty(real_img, f):
    # gradient penalty
    reals = real_img
    reals.stop_gradient = False
    real_logit = f(reals)

    apply_loss_scaling = lambda x: x * layers.exp(x * np.log(2.0, dtype='float32'))
    undo_loss_scaling = lambda x: x * layers.exp(-x * np.log(2.0, dtype='float32'))

    real_logit = apply_loss_scaling(layers.sum(real_logit))
    # grads = dygraph.grad(real_logit, reals, create_graph=True)
    grads = dygraph.grad(real_logit, reals, create_graph=False)
    real_grads = layers.reshape(grads[0], (reals.shape[0], -1))
    real_grads = undo_loss_scaling(real_grads)
    r1_penalty = layers.reduce_sum(
        layers.elementwise_mul(real_grads, real_grads))
    return r1_penalty
def loss_neg_log_of_pos(self, pos_score, neg_score_n, gama=5.0):
    """
    pos_score: batch_size x 1
    neg_score_n: batch_size x n
    """
    # n x batch_size
    neg_score_n = L.transpose(neg_score_n, [1, 0])
    # 1 x batch_size
    pos_score = L.reshape(pos_score, [1, -1])

    exp_pos_score = L.exp(pos_score * gama)
    exp_neg_score_n = L.exp(neg_score_n * gama)

    # (n+1) x batch_size
    pos_neg_score = L.concat([exp_pos_score, exp_neg_score_n], axis=0)
    # 1 x batch_size
    exp_sum = L.reduce_sum(pos_neg_score, dim=0, keep_dim=True)
    # 1 x batch_size
    loss = -1.0 * L.log(exp_pos_score / exp_sum)
    # batch_size
    loss = L.reshape(loss, [-1, 1])
    return loss
def forward(self, mu, logvar=None):
    """Compute loss.

    Args:
        mu (tensor): mean
        logvar (tensor): logarithm of variance
    """
    if logvar is None:
        logvar = L.zeros_like(mu)
    # closed-form KL(N(mu, exp(logvar)) || N(0, 1))
    return -0.5 * L.reduce_sum(1 + logvar - L.pow(mu, 2) - L.exp(logvar))
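# Hedged NumPy check of the closed form used above:
# KL(N(mu, sigma^2) || N(0, 1)) = -0.5 * (1 + log sigma^2 - mu^2 - sigma^2),
# compared against a Monte Carlo estimate.
import numpy as np

mu, logvar = 0.5, np.log(0.8)
closed_form = -0.5 * (1 + logvar - mu ** 2 - np.exp(logvar))

rng = np.random.default_rng(0)
x = mu + np.sqrt(np.exp(logvar)) * rng.standard_normal(1_000_000)
log_q = -0.5 * (np.log(2 * np.pi) + logvar + (x - mu) ** 2 / np.exp(logvar))
log_p = -0.5 * (np.log(2 * np.pi) + x ** 2)
print(closed_form, (log_q - log_p).mean())  # both ~0.137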
def log_sum_exp(x):
    """The probability that a prediction is background (the a_xx are the
    network's outputs) is

        p = e^(a00-max) / [e^(a00-max) + e^(a01-max) + ... + e^(a80-max)]

    Taking the log:

        ln p = a00 - max - ln[e^(a00-max) + e^(a01-max) + ... + e^(a80-max)]

    Rearranging:

        a00 = ln p + max + ln[e^(a00-max) + e^(a01-max) + ... + e^(a80-max)]

    If the box really is background, its label is p = 1, so

        a00 = max + ln[e^(a00-max) + e^(a01-max) + ... + e^(a80-max)]

    The network output must approach the right-hand side for the box to be
    predicted as background.
    """
    x_max = P.reduce_max(x)
    return P.log(P.reduce_sum(P.exp(x - x_max), 1)) + x_max
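# Hedged NumPy sketch of why the max is subtracted above: the naive
# log(sum(exp(x))) overflows for large logits, while the shifted form is
# mathematically identical and stable.
import numpy as np

def logsumexp_sketch(x):
    x_max = x.max()
    return x_max + np.log(np.exp(x - x_max).sum())

x = np.array([1000.0, 1001.0, 1002.0])
print(np.log(np.exp(x).sum()))  # naive form overflows to inf
print(logsumexp_sketch(x))      # stable form: ~1002.41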
def decode(pred_txtytwth, priors, use_yolo_regressors: bool = False):
    """Decode the predicted coordinates tx, ty, tw, th. Uses the SSD decoding
    scheme by default.
    """
    if use_yolo_regressors:
        # Decoded boxes in center-size notation
        boxes = P.concat([
            pred_txtytwth[:, :2] + priors[:, :2],
            priors[:, 2:] * P.exp(pred_txtytwth[:, 2:])
        ], 1)
        boxes = point_form(boxes)  # already corner form after this
    else:
        variances = [0.1, 0.2]
        boxes = P.concat([
            priors[:, :2] + pred_txtytwth[:, :2] * variances[0] * priors[:, 2:],
            priors[:, 2:] * P.exp(pred_txtytwth[:, 2:] * variances[1])
        ], 1)
        # center-size -> corner form
        x1y1 = boxes[:, :2] - boxes[:, 2:] / 2
        x2y2 = boxes[:, :2] + boxes[:, 2:] / 2
        boxes = P.concat([x1y1, x2y2], 1)
    return boxes
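# Hedged NumPy sketch of the SSD branch above: decode one prediction against
# one prior (both in center-size form) with variances [0.1, 0.2], then
# convert to corner form. Values are illustrative.
import numpy as np

def ssd_decode_sketch(pred, prior, variances=(0.1, 0.2)):
    cxcy = prior[:2] + pred[:2] * variances[0] * prior[2:]  # shifted center
    wh = prior[2:] * np.exp(pred[2:] * variances[1])        # scaled size
    return np.concatenate([cxcy - wh / 2, cxcy + wh / 2])   # (x1, y1, x2, y2)

prior = np.array([0.5, 0.5, 0.2, 0.2])
print(ssd_decode_sketch(np.zeros(4), prior))  # zero offsets recover the prior: [0.4 0.4 0.6 0.6]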
def compute_mog_loss(self, y, t):
    """Compute the loss where the output distribution is a mixture of Gaussians.

    Args:
        y (Variable): shape(B, T, C_output), dtype float32, the parameters of
            the output distribution. It is the concatenation of 3 parts: the
            logits of every distribution, the mean of each distribution and
            the log standard deviation of each distribution. Each part's
            shape is (B, T, n_mixture), where `n_mixture` means the number of
            Gaussians in the mixture.
        t (Variable): shape(B, T), dtype float32, the target audio. Note that
            the target's corresponding time index is one step ahead of the
            output distribution, and an output distribution whose input
            contains padding is neglected in loss computation.

    Returns:
        Variable: shape(1, ), dtype float32, the loss.
    """
    n_mixture = self.output_dim // 3

    # the context size is not taken into account
    y = y[:, self.context_size:, :]
    t = t[:, self.context_size:]

    w, mu, log_std = F.split(y, 3, dim=2)
    # 100.0 is just a large float
    log_std = F.clip(log_std, min=self.log_scale_min, max=100.)
    inv_std = F.exp(-log_std)
    p_mixture = F.softmax(w, axis=-1)

    t = F.unsqueeze(t, axes=[-1])
    if n_mixture > 1:
        # t = F.expand_as(t, log_std)
        t = F.expand(t, [1, 1, n_mixture])

    x_std = inv_std * (t - mu)
    exponent = F.exp(-0.5 * x_std * x_std)
    pdf_x = 1.0 / math.sqrt(2.0 * math.pi) * inv_std * exponent
    pdf_x = p_mixture * pdf_x
    # pdf_x: [bs, len]
    pdf_x = F.reduce_sum(pdf_x, dim=-1)
    per_sample_loss = -F.log(pdf_x + 1e-9)

    loss = F.reduce_mean(per_sample_loss)
    return loss
def _decode(self, x, y, w, h, anchors, stride, scale_x_y, eps, is_gt=False):
    conv_shape = x.shape  # (8, 13, 13, 3)
    batch_size = conv_shape[0]
    n_grid = conv_shape[1]
    anchor_per_scale = conv_shape[3]

    _x = L.unsqueeze(x, 4)
    _y = L.unsqueeze(y, 4)
    conv_raw_dxdy = L.concat([_x, _y], -1)  # (8, 13, 13, 3, 2)
    _w = L.unsqueeze(w, 4)
    _h = L.unsqueeze(h, 4)
    conv_raw_dwdh = L.concat([_w, _h], -1)  # (8, 13, 13, 3, 2)

    rows = L.range(0, n_grid, 1, 'float32')
    cols = L.range(0, n_grid, 1, 'float32')
    rows = L.expand(L.reshape(rows, (1, -1, 1)), [n_grid, 1, 1])
    cols = L.expand(L.reshape(cols, (-1, 1, 1)), [1, n_grid, 1])
    offset = L.concat([rows, cols], axis=-1)
    offset = L.reshape(offset, (1, n_grid, n_grid, 1, 2))
    offset = L.expand(offset, [batch_size, 1, 1, anchor_per_scale, 1])

    if is_gt:
        decode_xy = (conv_raw_dxdy + offset) / n_grid
    else:
        if abs(scale_x_y - 1.0) < eps:
            decode_xy = L.sigmoid(conv_raw_dxdy)
            decode_xy = (decode_xy + offset) / n_grid
        else:
            # Grid Sensitive
            decode_xy = scale_x_y * L.sigmoid(conv_raw_dxdy) - 0.5 * (scale_x_y - 1.0)
            decode_xy = (decode_xy + offset) / n_grid

    anchor_t = fluid.layers.assign(np.copy(anchors).astype(np.float32))
    decode_wh = (L.exp(conv_raw_dwdh) * anchor_t) / (n_grid * stride)
    decode_xywh = L.concat([decode_xy, decode_wh], axis=-1)
    if is_gt:
        decode_xywh.stop_gradient = True

    return decode_xywh  # (8, 13, 13, 3, 4)
def pairwise_hinge(self):
    """pairwise model"""
    poi_repr = L.split(self.poi_repr, 2, dim=0)
    pos_repr, neg_repr = poi_repr
    pos_pred = L.cos_sim(self.query_repr, pos_repr)
    neg_pred = L.cos_sim(self.query_repr, neg_repr)

    mode = 'hinge_loss'
    # logistic: log(1 + e^-z); hinge: max(0, 1 - z)
    if 'hinge_loss' == mode:
        theta_z = L.relu(1 + neg_pred - pos_pred)
    elif 'logistic_loss' == mode:
        theta_z = L.log(1 + L.exp(neg_pred - pos_pred))
    self.loss = L.reduce_mean(theta_z)

    pos_cnt = L.reduce_sum(L.cast(L.greater_than(pos_pred, neg_pred), dtype="float32"))
    neg_cnt = L.reduce_sum(L.cast(L.less_than(pos_pred, neg_pred), dtype="float32"))
    self.order = pos_cnt / (1e-5 + neg_cnt)
    self.metrics = [self.loss, self.order]
def forward(self, tenFirst, tenSecond, tenFeaturesFirst, tenFeaturesSecond, tenFlow):
    b, _, h, w = tenFlow.shape
    tenDifference = tenFirst - backwarp(tenInput=tenSecond,
                                        tenFlow=tenFlow * self.fltBackward)
    tenDifference = L.pow(tenDifference, 2)
    tenDifference = L.reduce_sum(tenDifference, 1, True)  # [b, 1, h, w]
    tenDifference = L.sqrt(tenDifference).detach()

    tenFeaturesFirst = self.moduleFeat(tenFeaturesFirst)

    tenMean = L.reshape(tenFlow, (b, 2, -1))  # [b, 2, h * w]
    tenMean = L.reduce_mean(tenMean, 2, True)  # [b, 2, 1]
    tenMean = L.reshape(tenMean, (b, 2, 1, 1))  # [b, 2, 1, 1]
    tenMean = L.expand(tenMean, (1, 1, h, w))  # [b, 2, h, w]
    delta = tenFlow - tenMean
    diff = L.concat([tenDifference, delta, tenFeaturesFirst], 1)

    tenDist = self.moduleDist(self.moduleMain(diff))
    tenDist = L.pow(tenDist, 2.0) * -1.0
    tenDist = tenDist - L.reduce_max(tenDist, 1, True)
    tenDist = L.exp(tenDist)

    tenDivisor = L.reduce_sum(tenDist, 1, True)
    tenDivisor = L.reciprocal(tenDivisor)

    tenScaleX = L.unfold(x=tenFlow[:, 0:1, :, :],
                         kernel_sizes=self.intUnfold,
                         strides=1,
                         paddings=int((self.intUnfold - 1) / 2))  # [b, c, h * w]
    tenScaleX = L.reshape(tenScaleX, (b, -1, h, w))  # [b, c, h, w]
    tenScaleX = self.moduleScaleX(tenDist * tenScaleX) * tenDivisor

    tenScaleY = L.unfold(x=tenFlow[:, 1:2, :, :],
                         kernel_sizes=self.intUnfold,
                         strides=1,
                         paddings=int((self.intUnfold - 1) / 2))  # [b, c, h * w]
    tenScaleY = L.reshape(tenScaleY, (b, -1, h, w))  # [b, c, h, w]
    tenScaleY = self.moduleScaleY(tenDist * tenScaleY) * tenDivisor

    return L.concat([tenScaleX, tenScaleY], 1)
def decode(conv_output, anchors, stride, num_class, grid_offset):
    conv_shape = P.shape(conv_output)
    batch_size = conv_shape[0]
    output_size = conv_shape[1]
    anchor_per_scale = len(anchors)
    conv_output = P.reshape(conv_output,
                            (batch_size, output_size, output_size,
                             anchor_per_scale, 5 + num_class))
    conv_raw_dxdy = conv_output[:, :, :, :, 0:2]
    conv_raw_dwdh = conv_output[:, :, :, :, 2:4]
    conv_raw_conf = conv_output[:, :, :, :, 4:5]
    conv_raw_prob = conv_output[:, :, :, :, 5:]

    pred_xy = (P.sigmoid(conv_raw_dxdy) + grid_offset) * stride
    anchor_t = fluid.layers.assign(np.copy(anchors).astype(np.float32))
    pred_wh = (P.exp(conv_raw_dwdh) * anchor_t) * stride
    pred_xywh = P.concat([pred_xy, pred_wh], axis=-1)
    pred_conf = P.sigmoid(conv_raw_conf)
    pred_prob = P.sigmoid(conv_raw_prob)

    return P.concat([pred_xywh, pred_conf, pred_prob], axis=-1)
def elu(x, alpha):
    return layers.relu(x) + alpha * (layers.exp(-1 * layers.relu(-1 * x)) - 1)
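# Hedged NumPy check of the identity used above:
# relu(x) + alpha * (exp(-relu(-x)) - 1) equals the usual piecewise ELU,
# since at most one of relu(x) and relu(-x) is nonzero for any x.
import numpy as np

def elu_reference(x, alpha):
    return np.where(x > 0, x, alpha * (np.exp(x) - 1))

def elu_via_relu(x, alpha):
    relu = lambda v: np.maximum(v, 0)
    return relu(x) + alpha * (np.exp(-relu(-x)) - 1)

x = np.linspace(-3, 3, 7)
print(np.allclose(elu_reference(x, 1.0), elu_via_relu(x, 1.0)))  # True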
def log_softmax(x):
    """log softmax"""
    t1 = layers.exp(x)
    t1 = layers.reduce_sum(t1, dim=-1)
    t1 = layers.log(t1)
    return layers.elementwise_sub(x, t1, axis=0)
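# Hedged note: the log_softmax above computes x - log(sum(exp(x))) directly,
# which can overflow for large logits. A NumPy sketch of the usual stable
# variant, which subtracts the row max first:
import numpy as np

def log_softmax_stable_sketch(x):
    shifted = x - x.max(axis=-1, keepdims=True)
    return shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))

print(log_softmax_stable_sketch(np.array([[1000.0, 1001.0, 1002.0]])))
# finite values, where the naive form would overflow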
def __softmax(x, eps=1e-9):
    exp_out = layers.exp(x=x)
    sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
    return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
def _bbox_transform(self, dcx, dcy, dw, dh, anchors, downsample_ratio,
                    batch_size, is_gt, scale_x_y, eps):
    shape_fmp = dcx.shape
    # batch_size = shape_fmp[0]
    anchor_per_scale = shape_fmp[1]
    output_size = shape_fmp[2]
    rows = L.range(0, output_size, 1., dtype='float32')
    cols = L.range(0, output_size, 1., dtype='float32')
    rows = L.reshape(rows, (1, 1, 1, -1))  # [1, 1, 1, w]
    cols = L.reshape(cols, (1, 1, -1, 1))  # [1, 1, h, 1]
    rows = L.expand(rows, [batch_size, anchor_per_scale, output_size, 1])  # [b, 3, h, w]
    cols = L.expand(cols, [batch_size, anchor_per_scale, 1, output_size])  # [b, 3, h, w]
    if is_gt:
        cx = (dcx + rows) / output_size
        cy = (dcy + cols) / output_size
    else:
        dcx_sig = L.sigmoid(dcx)
        dcy_sig = L.sigmoid(dcy)
        if abs(scale_x_y - 1.0) > eps:
            dcx_sig = scale_x_y * dcx_sig - 0.5 * (scale_x_y - 1)
            dcy_sig = scale_x_y * dcy_sig - 0.5 * (scale_x_y - 1)
        cx = (dcx_sig + rows) / output_size
        cy = (dcy_sig + cols) / output_size

    anchor_w_ = [anchors[i] for i in range(0, len(anchors)) if i % 2 == 0]
    anchor_w_np = np.array(anchor_w_)
    # anchor_w_ = paddle.to_tensor(anchor_w_np, place=paddle.CUDAPlace(0))
    anchor_w_ = paddle.to_tensor(anchor_w_np)
    anchor_w = L.reshape(anchor_w_, (1, -1, 1, 1))  # [1, 3, 1, 1]
    anchor_w = L.expand(anchor_w, [batch_size, 1, output_size, output_size])  # [b, 3, h, w]
    anchor_h_ = [anchors[i] for i in range(0, len(anchors)) if i % 2 == 1]
    anchor_h_np = np.array(anchor_h_)
    # anchor_h_ = paddle.to_tensor(anchor_h_np, place=paddle.CUDAPlace(0))
    anchor_h_ = paddle.to_tensor(anchor_h_np)
    anchor_h = L.reshape(anchor_h_, (1, -1, 1, 1))  # [1, 3, 1, 1]
    anchor_h = L.expand(anchor_h, [batch_size, 1, output_size, output_size])  # [b, 3, h, w]

    # e^tw, e^th
    exp_dw = L.exp(dw)
    exp_dh = L.exp(dh)
    pw = (exp_dw * anchor_w) / (output_size * downsample_ratio)
    ph = (exp_dh * anchor_h) / (output_size * downsample_ratio)
    if is_gt:
        exp_dw.stop_gradient = True
        exp_dh.stop_gradient = True
        pw.stop_gradient = True
        ph.stop_gradient = True

    x1 = cx - 0.5 * pw
    y1 = cy - 0.5 * ph
    x2 = cx + 0.5 * pw
    y2 = cy + 0.5 * ph
    if is_gt:
        x1.stop_gradient = True
        y1.stop_gradient = True
        x2.stop_gradient = True
        y2.stop_gradient = True

    return x1, y1, x2, y2
def network(self, for_test=False):
    """Define the network structure of train_model.

    :return:
    """
    if not for_test:
        before = fluid.data(name='before_train', shape=[-1, self.sent_len], dtype='int64')
        target = fluid.data(name='target_train', shape=[-1, self.sent_len], dtype='int64')
        after = fluid.data(name='after_train', shape=[-1, self.sent_len], dtype='int64')
        # set up the data loader
        reader = fluid.io.DataLoader.from_generator(
            feed_list=[before, target, after], capacity=64, iterable=True)
        # forward pass
        rnn_out, encode_hidden = self.forward(target)
        pred_before = self.sent_pred(target, dir='before', encode_hidden=encode_hidden, for_test=False)
        pred_after = self.sent_pred(target, dir='after', encode_hidden=encode_hidden, for_test=False)
    else:
        before = fluid.data(name='before_test', shape=[-1, self.sent_len], dtype='int64')
        target = fluid.data(name='target_test', shape=[-1, self.sent_len], dtype='int64')
        after = fluid.data(name='after_test', shape=[-1, self.sent_len], dtype='int64')
        # set up the data loader
        reader = fluid.io.DataLoader.from_generator(
            feed_list=[before, target, after], capacity=64, iterable=True)
        # forward pass
        rnn_out, encode_hidden = self.forward(target)
        pred_before = self.sent_pred(target, dir='before', encode_hidden=encode_hidden, for_test=True)
        pred_after = self.sent_pred(target, dir='after', encode_hidden=encode_hidden, for_test=True)

    # Move batch_size to dim 1. Why not dim 0? Dim 0 is num_layers.
    pred_before = layers.transpose(pred_before, perm=[0, 2, 1, 3])
    pred_after = layers.transpose(pred_after, perm=[0, 2, 1, 3])

    if not for_test:
        before_emb = self.embedding(before)
        after_emb = self.embedding(after)
        vocab_emb = self.embedding.parameters()[0]
    else:
        before_emb = self.test_embedding(before)
        after_emb = self.test_embedding(after)
        vocab_emb = self.test_embedding.parameters()[0]

    # loss_before = layers.cross_entropy(pred_before, before, soft_label=False)
    # loss_after = layers.cross_entropy(pred_after, after, soft_label=False)
    vocab_emb = layers.reshape(
        vocab_emb, shape=[1, 1, 1, vocab_emb.shape[0], vocab_emb.shape[1]])
    new_shape = pred_before.shape[:-1] + (1, ) + pred_before.shape[-1:]
    pred_before = layers.reshape(pred_before, shape=new_shape)
    pred_after = layers.reshape(pred_after, shape=new_shape)

    prob_w_before = layers.reduce_sum(layers.elementwise_mul(pred_before, vocab_emb), dim=[0, 4])
    prob_w_after = layers.reduce_sum(layers.elementwise_mul(pred_after, vocab_emb), dim=[0, 4])
    prob_w_before = layers.reduce_sum(layers.exp(prob_w_before), dim=-1)
    prob_w_after = layers.reduce_sum(layers.exp(prob_w_after), dim=-1)

    new_shape = before_emb.shape[:-1] + (1, ) + before_emb.shape[-1:]
    before_emb = layers.reshape(before_emb, shape=new_shape)
    after_emb = layers.reshape(after_emb, shape=new_shape)
    pred_before = layers.reduce_sum(layers.elementwise_mul(pred_before, before_emb), dim=[0, 3, 4])
    pred_after = layers.reduce_sum(layers.elementwise_mul(pred_after, after_emb), dim=[0, 3, 4])

    prob_before = layers.elementwise_div(layers.exp(pred_before), prob_w_before + 1e-6)
    prob_after = layers.elementwise_div(layers.exp(pred_after), prob_w_after + 1e-6)
    loss = -layers.reduce_mean(
        (layers.log(prob_after) + layers.log(prob_before)) / 2.0)

    return loss, reader
def ohem_conf_loss(self, pred_allboxes_conf, batch_size, labels_neg_mask,
                   labels_pos_mask, labels_pos_index, class_vectors, labels_pos_cid):
    batch_conf = P.reshape(pred_allboxes_conf, (-1, self.num_classes))
    loss_c = log_sum_exp(batch_conf) - batch_conf[:, 0]
    loss_c = P.reshape(loss_c, (batch_size, -1))  # (batch_size, 19248)
    labels_neg_mask = P.concat(labels_neg_mask, axis=0)  # (batch_size*19248, 1)
    labels_neg_mask = P.reshape(labels_neg_mask, (batch_size, -1))  # (batch_size, 19248)
    loss_c = labels_neg_mask * loss_c  # keep only negative-sample losses, (batch_size, 19248)
    sorted_loss_c, loss_idx = P.argsort(loss_c, axis=-1, descending=True)

    labels_pos_mask = P.concat(labels_pos_mask, axis=0)  # (batch_size*19248, 1)
    labels_pos_mask = P.reshape(labels_pos_mask, (batch_size, -1))  # (batch_size, 19248)
    num_pos = P.cast(P.reduce_sum(labels_pos_mask, dim=1), 'int32')  # (batch_size, )
    num_neg = self.negpos_ratio * num_pos  # (batch_size, )

    neg_topk_mask = []
    for idx in range(batch_size):
        desc = P.range(num_neg[idx], num_neg[idx] - P.shape(labels_pos_mask)[1], -1, 'int32')
        neg_topk_mask.append(desc)
    neg_topk_mask = P.concat(neg_topk_mask, axis=0)  # (batch_size*19248, )
    neg_topk_mask = P.reshape(neg_topk_mask, (batch_size, -1))  # (batch_size, 19248)
    neg_topk_mask = P.cast(neg_topk_mask > 0, 'float32')  # (batch_size, 19248)
    sorted_loss_c = neg_topk_mask * sorted_loss_c

    selected_poss = []
    selected_negs = []
    selected_pos_class_vectors = []
    selected_neg_class_vectors = []
    for idx in range(batch_size):
        selected_neg_idx_idx = P.where(sorted_loss_c[idx] > 0)
        selected_neg_idx_idx.stop_gradient = True
        selected_neg_idx = P.gather(loss_idx[idx], selected_neg_idx_idx)
        selected_neg_idx.stop_gradient = True
        selected_neg = P.gather(pred_allboxes_conf[idx], selected_neg_idx)
        selected_neg.stop_gradient = True
        selected_negs.append(selected_neg)
        selected_pos = P.gather(pred_allboxes_conf[idx], labels_pos_index[idx])
        selected_pos.stop_gradient = True
        selected_poss.append(selected_pos)

        zeros = P.fill_constant(shape=[P.shape(selected_neg)[0], ], value=0, dtype='int32')
        zeros.stop_gradient = True
        selected_neg_class_vector = P.gather(class_vectors, zeros)
        selected_neg_class_vector.stop_gradient = True
        selected_neg_class_vectors.append(selected_neg_class_vector)

        labels_pos_cid.stop_gradient = True
        labels_pos_index[idx].stop_gradient = True
        selected_pos_cid = P.gather(labels_pos_cid[idx], labels_pos_index[idx])
        selected_pos_cid.stop_gradient = True
        selected_pos_class_vector = P.gather(class_vectors, selected_pos_cid)
        selected_pos_class_vector.stop_gradient = True
        selected_pos_class_vectors.append(selected_pos_class_vector)

    selected_negs = P.concat(selected_negs, axis=0)  # (?, 1+80)
    selected_poss = P.concat(selected_poss, axis=0)  # (?, 1+80)
    pred_ = P.concat([selected_negs, selected_poss], axis=0)  # (?, 1+80)
    selected_neg_class_vectors = P.concat(selected_neg_class_vectors, axis=0)  # (?, 1+80)
    selected_pos_class_vectors = P.concat(selected_pos_class_vectors, axis=0)  # (?, 1+80)
    labels_ = P.concat([selected_neg_class_vectors, selected_pos_class_vectors], axis=0)  # (?, 1+80)

    # softmax cross-entropy
    fenzi = P.exp(pred_)  # numerator: exp of the logits
    fenmu = P.reduce_sum(fenzi, dim=1, keep_dim=True)  # denominator: row sums
    pred_prob = fenzi / P.expand_as(fenmu, target_tensor=fenzi)
    conf_loss = labels_ * (0 - P.log(pred_prob + 1e-9))  # cross-entropy; a tiny constant avoids NaN
    conf_loss = P.reduce_sum(conf_loss)
    return conf_loss