def forward(self, x):
    # x: (batch_size, channels, height, width)
    batch_size = x.size()[0]
    h_x = x.size()[2]
    w_x = x.size()[3]
    count_h = self.tensor_size(x[:, :, 1:, :])
    count_w = self.tensor_size(x[:, :, :, 1:])
    # squared differences between vertically / horizontally adjacent pixels
    h_tv = flow.pow((x[:, :, 1:, :] - x[:, :, :h_x - 1, :]), 2).sum()
    w_tv = flow.pow((x[:, :, :, 1:] - x[:, :, :, :w_x - 1]), 2).sum()
    return self.tv_loss_weight * 2 * (h_tv / count_h + w_tv / count_w) / batch_size

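# Context sketch (assumption): a minimal module the total-variation forward
# pass above could live in. The class name TVLoss, the tensor_size helper,
# and the tv_loss_weight argument are inferred from the attributes it uses.
import oneflow as flow
import oneflow.nn as nn


class TVLoss(nn.Module):
    def __init__(self, tv_loss_weight=1.0):
        super().__init__()
        self.tv_loss_weight = tv_loss_weight

    @staticmethod
    def tensor_size(t):
        # elements per sample: channels * height * width
        return t.size()[1] * t.size()[2] * t.size()[3]


# usage sketch (with the forward method above completing the class):
# tv = TVLoss()(flow.randn(4, 3, 32, 32))
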
def forward(self, inputs, targets):
    """
    Args:
        inputs (flow.Tensor): feature matrix with shape (batch_size, feat_dim).
        targets (flow.LongTensor): ground truth labels with shape (batch_size).
    """
    n = inputs.size(0)

    # Compute pairwise distance; replace by the official op when merged
    dist = flow.pow(inputs, 2).sum(dim=1).expand(n, n)
    dist = dist + flow.transpose(dist, dim0=1, dim1=0)
    temp1 = -2 * flow.matmul(inputs, flow.transpose(inputs, dim0=1, dim1=0))
    dist = flow.add(dist, temp1)
    dist = flow.sqrt(flow.clamp(dist, min=1e-12))

    # For each anchor, find the hardest positive and negative
    mask = targets.expand(n, n).eq(
        flow.transpose(targets.expand(n, n), dim0=1, dim1=0))
    dist_ap, dist_an = [], []
    y1 = flow.zeros((1, n), dtype=flow.float32).to("cuda")
    y2 = flow.Tensor(np.exp(100 * np.ones((1, n)))).to("cuda")
    for i in range(n):
        temp_dist = flow.slice(dist, [(i, i + 1, 1)])
        temp_mask = flow.slice(mask, [(i, i + 1, 1)])
        temp_mask_rev = flow.slice(1 - mask, [(i, i + 1, 1)])
        dist_ap.append(temp_mask.where(temp_dist, y1).max().unsqueeze(0))
        dist_an.append(
            temp_mask_rev.where(temp_dist, y2).min().unsqueeze(0))
    dist_ap = flow.cat(dist_ap)
    dist_an = flow.cat(dist_an)

    # Compute ranking hinge loss
    y = flow.ones_like(dist_an)
    return self.ranking_loss(dist_an, dist_ap, y)

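# Context sketch (assumption): the enclosing loss module. The class name,
# the margin default, and the use of nn.MarginRankingLoss are inferred from
# the self.ranking_loss(dist_an, dist_ap, y) call above.
import numpy as np
import oneflow as flow
import oneflow.nn as nn


class TripletLoss(nn.Module):
    def __init__(self, margin=0.3):
        super().__init__()
        self.margin = margin
        # hinge loss encouraging dist_an >= dist_ap + margin
        self.ranking_loss = nn.MarginRankingLoss(margin=margin)


# usage sketch
# criterion = TripletLoss(margin=0.3)
# loss = criterion(features.to("cuda"), labels.to("cuda"))  # features: (N, D), labels: (N,)
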
def gelu(x):
    """
    Implementation of the GELU activation function currently in the Google BERT
    repo (identical to OpenAI GPT). Also see the Gaussian Error Linear Units
    paper: https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + flow.tanh(
        math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0))))

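# Usage sketch for the gelu helper above; assumes the enclosing file already
# imports math and oneflow as flow.
import math
import oneflow as flow

x = flow.tensor([-1.0, 0.0, 1.0])
print(gelu(x))  # roughly [-0.1588, 0.0, 0.8412] with the tanh approximation
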
def _pow(self, b):
    return flow.pow(self, b)

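# Usage sketch (assumption): _pow looks like the function backing Tensor.pow;
# the registration mechanism is not shown here. flow.pow accepts either a
# scalar or a tensor exponent.
import oneflow as flow

t = flow.tensor([1.0, 2.0, 3.0])
print(flow.pow(t, 2.0))                            # [1., 4., 9.]
print(flow.pow(t, flow.tensor([3.0, 2.0, 1.0])))   # [1., 4., 3.]
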
def forward(self, x):
    # element-wise square of the input
    return flow.pow(x, 2.0)

def recognize(self, inputs, inputs_mask):
    cache = {"fronend": None, "encoder": None, "decoder": None, "lm": None}

    self.attn_weights = {}
    memory, memory_mask, _, enc_attn_weights = self.encode(
        inputs, inputs_mask)
    self.attn_weights["encoder"] = enc_attn_weights
    self.attn_weights["decoder"] = []

    b, t, v = memory.size()

    # tile the encoder outputs and mask across the beam dimension
    beam_memory = (memory.unsqueeze(1).repeat(
        [1, self.beam_width, 1, 1]).view(b * self.beam_width, t, v))
    beam_memory_mask = (memory_mask.unsqueeze(1).repeat(
        [1, self.beam_width, 1]).view(b * self.beam_width, t))

    # every beam starts from the BOS token
    preds = (flow.ones(
        [b * self.beam_width, 1], dtype=flow.int64, device=memory.device) * BOS)

    # only the first hypothesis per utterance starts with a finite score
    scores = flow.tensor([0.0] + [-float("inf")] * (self.beam_width - 1),
                         dtype=flow.float32)
    scores = scores.to(memory.device).repeat([b]).unsqueeze(1)

    ending_flag = flow.zeros_like(scores).to(dtype=flow.uint8)

    with flow.no_grad():
        for _ in range(1, self.max_len + 1):
            preds, cache, scores, ending_flag = self.decode_step(
                preds, beam_memory, beam_memory_mask, cache, scores,
                ending_flag)

            # stop once every beam in the batch has finished
            if ending_flag.sum() == b * self.beam_width:
                break

        scores = scores.view(b, self.beam_width)
        preds = preds.view(b, self.beam_width, -1)

        lengths = flow.sum(flow.ne(preds, EOS).float(), dim=-1)

        # length penalty
        if self.penalty:
            lp = flow.pow((self.lamda + lengths) / (self.lamda + 1),
                          self.penalty)
            scores /= lp

        sorted_scores, offset_indices = flow.sort(scores, dim=-1,
                                                  descending=True)

        base_indices = (flow.arange(
            b, dtype=flow.int64, device=offset_indices.device) *
            self.beam_width)
        base_indices = (base_indices.unsqueeze(1).repeat(
            [1, self.beam_width]).view(-1))

        preds = preds.view(b * self.beam_width, -1)

        indices = offset_indices.view(-1) + base_indices

        # remove BOS and keep the n-best hypotheses per utterance
        sorted_preds = preds[indices].view(b, self.beam_width, -1)
        nbest_preds = sorted_preds[:, :min(self.beam_width, self.nbest), 1:]
        nbest_scores = sorted_scores[:, :min(self.beam_width, self.nbest)]

        return self.nbest_translate(nbest_preds), nbest_scores
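# Worked example of the length penalty used above (GNMT-style); the lamda and
# penalty values here are illustrative, not the model's defaults.
import oneflow as flow

lengths = flow.tensor([[10.0, 20.0]])  # hypothesis lengths per beam
lamda, penalty = 5.0, 0.6
lp = flow.pow((lamda + lengths) / (lamda + 1), penalty)
# dividing the summed log-probability scores by lp compensates for longer
# hypotheses accumulating more (negative) per-token terms.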