Example #1
    def pack_x(self, x, x_lens):
        # With no lengths (or when masking is disabled) the input is already
        # dense and can be returned unchanged.
        if x_lens is None or self.ignore_masked:
            return x
        else:
            # (B,) lengths -> (B, T, 1, 1) boolean mask over valid frames.
            mask = get_mask1d(x_lens.to(
                x.device)).unsqueeze(-1).unsqueeze(-1) > 0
            # Keep only valid frames; x is assumed (B, T, 1, C), so the
            # selection reshapes cleanly to (num_valid_frames, C).
            x_sel = torch.masked_select(x, mask)
            x_sel = x_sel.view(mask.sum(), x.size(-1))
            return x_sel
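
Every example here leans on get_mask1d, whose implementation is not shown. A minimal sketch of such a helper, assuming it turns per-example lengths into a binary (batch, time) mask, might look like this:

    import torch

    def get_mask1d(lengths, mask_length=None):
        # Hypothetical sketch: 1.0 at valid positions (t < length), 0.0 at
        # padding; mask_length overrides the padded time dimension.
        if mask_length is None:
            mask_length = int(lengths.max())
        positions = torch.arange(mask_length, device=lengths.device)
        return (positions.unsqueeze(0) < lengths.unsqueeze(1)).float()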
Example #2
    def apply_mask(_loss):
        # Per-frame weights: 1 for valid frames, 0 for padding, normalized
        # so the weights sum to 1 over the whole batch.
        mask = utils.get_mask1d(feat_lens,
                                mask_length=_loss.size(1))
        mask = mask / mask.sum()
        _loss = _loss * mask.unsqueeze(-1).unsqueeze(-1)
        height_x_channels = _loss.size(2) * _loss.size(3)
        _loss = _loss.sum()
        # The mask broadcasts over dims 2 and 3, hence we need to manually
        # normalize to obtain a per-pixel loss.
        _loss_per_pix = _loss / height_x_channels
        return _loss, _loss_per_pix
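
To see the normalization at work, here is a toy run using the get_mask1d sketch from Example #1; the shapes (batch, time, height, channels) are illustrative assumptions:

    import torch

    batch, time, height, channels = 2, 5, 3, 4
    feat_lens = torch.tensor([5, 3])
    loss = torch.ones(batch, time, height, channels)

    mask = get_mask1d(feat_lens, mask_length=time)  # (2, 5) with eight ones
    mask = mask / mask.sum()                        # weights now sum to 1
    loss = loss * mask.unsqueeze(-1).unsqueeze(-1)  # broadcast over dims 2, 3
    print(loss.sum().item())                        # 12.0, one per (h, c) cell
    print((loss.sum() / (height * channels)).item())  # 1.0, per-pixel mean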
Example #3
    def forward(self, vq_output, features_len, targets_len=None):
        # Flatten trailing dims and move time last: (B, T, ...) -> (B, C, T).
        vq_output = (vq_output.contiguous().view(vq_output.size(0),
                                                 vq_output.size(1),
                                                 -1).permute(0, 2, 1))
        # Right-pad the time axis so the conv output keeps length T.
        vq_output = F.pad(vq_output, (0, self.pred.kernel_size[0] - 1, 0, 0))
        out_conv = self.pred(vq_output)
        mask = utils.get_mask1d(features_len, mask_length=out_conv.size(2))
        mask.unsqueeze_(1)

        avg_mask = mask / mask.sum()

        if self.time_reduce == "avg":
            # Masked average over time.
            return (out_conv * avg_mask).sum(dim=2)
        elif self.time_reduce == "max":
            # Masked max over time: padded positions are filled with the
            # global minimum so they can never win the max.
            return torch.where(mask == 1, out_conv,
                               out_conv.min()).max(dim=2)[0]
        else:
            raise NotImplementedError(
                "GlobalPredictor: not a valid reduction: " + self.time_reduce)
Example #4
    def loss(self, features, targets, features_len=None, targets_len=None):
        # the features may be padded
        if features_len is None:
            assert targets_len is None
            assert features.shape[1] == targets.shape[1], (
                f"The lengths of the targets and the inputs should "
                f"be the same for a framewise prediction. "
                f"Currently: {targets.shape[1]} and {features.shape[1]} respectively."
            )
        else:
            assert (torch.all(features_len == targets_len)
                    and (features.shape[1] >= targets.shape[1]))
        lens = features_len

        if lens is None:
            # No lengths given: every sequence spans the full padded length.
            lens = torch.full((features.shape[0], ),
                              fill_value=features.shape[1],
                              device=targets.device)

        # Predictions come from the stored input; hidden keeps a singleton
        # dim 2 and class scores in dim 3 (see the squeeze/argmax below).
        hidden = self(self.input)
        feat_aligned_len = features.shape[1]
        hidden_aligned_len = hidden.shape[1]

        assert feat_aligned_len >= lens.max(), (
            f"Incompatible shapes for features, hidden, targets: "
            f"{(features.shape, hidden.shape, targets.shape)}")
        targets = targets.long()

        rate_factor = feat_aligned_len // hidden_aligned_len
        assert (feat_aligned_len % hidden_aligned_len) == 0, (
            "The hidden (captured) representation should evenly divide the "
            "features length")
        # Upsample the downsampled predictions back to frame rate.
        hidden = hidden.repeat_interleave(rate_factor, dim=1)
        assert lens.max() <= hidden.shape[1], (
            f"Incompatible shapes for lens, hidden.shape[1]: "
            f"{(lens.max(), hidden.shape[1])}")
        hidden = hidden[:, :targets.shape[1]].contiguous()

        pred_labels = utils.safe_squeeze(hidden.argmax(dim=3), 2)
        accs = (pred_labels == targets).float()

        losses = F.cross_entropy(utils.safe_squeeze(hidden,
                                                    2).permute(0, 2, 1),
                                 targets,
                                 reduction="none")

        mask = utils.get_mask1d(lens, mask_length=losses.size(1))
        mask = mask / mask.sum()

        if not self.ignore_padding:
            # Weight every frame equally, padding included: plain sums.
            mask[:] = 1

        acc = (accs * mask).sum()
        loss = (losses * mask).sum()

        if logger.is_currently_logging():
            logger.log_mpl_figure(
                "framewise_debug",
                self.plot(features, F.softmax(hidden.detach(), dim=-1)))
        details = {"loss": loss, "acc": acc, "out_seq": pred_labels.detach()}
        return loss, details
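
utils.safe_squeeze is also left undefined in these snippets; presumably it squeezes a dimension only after checking that it has size 1. A hypothetical sketch:

    import torch

    def safe_squeeze(t, dim):
        # Hypothetical: refuse to squeeze a dim that is not of size 1, so
        # shape bugs fail loudly instead of silently reshaping.
        assert t.size(dim) == 1, f"dim {dim} has size {t.size(dim)}, not 1"
        return t.squeeze(dim)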
Example #5
    def loss(self, features, targets, features_len=None, targets_len=None):
        # the features may be padded
        if features_len is None:
            assert targets_len is None
            assert features.shape[1] == targets.shape[1], (
                f"The lengths of the targets and the inputs should "
                f"be the same for a framewise prediction. "
                f"Currently: {targets.shape[1]} and {features.shape[1]} respectively."
            )
        else:
            assert (torch.all(features_len == targets_len)
                    and (features.shape[1] >= targets.shape[1]))
        features_len = self.calculateFeatureLens(features, features_len)
        inputs_len, rate_factor = self.calculateInputLengths(
            self.input, features, features_len)
        feat_aligned_len = features.shape[1]
        assert feat_aligned_len >= features_len.max(), (
            f"Incompatible shapes for features and targets: "
            f"{(features.shape, targets.shape)}")
        targets = targets.long()

        details = {}
        total_loss = 0
        # One prediction head per name; per-head losses are summed below.
        for pred_name, pred in self(self.input, inputs_len).items():
            hidden_aligned_len = pred.shape[1]

            assert (feat_aligned_len % hidden_aligned_len) == 0, (
                "The hidden (captured) representation should evenly divide the "
                "features length")
            # Upsample this head's predictions back to frame rate.
            pred = pred.repeat_interleave(rate_factor, dim=1)
            assert features_len.max() <= pred.shape[1], (
                f"Incompatible shapes for features_len, pred.shape[1]: "
                f"{(features_len.max(), pred.shape[1])}")
            pred = pred[:, :targets.shape[1]].contiguous()

            pred_labels = utils.safe_squeeze(pred.argmax(dim=3), 2)
            accs = (pred_labels == targets).float()

            losses = F.cross_entropy(utils.safe_squeeze(pred,
                                                        2).permute(0, 2, 1),
                                     targets,
                                     reduction="none")

            mask = utils.get_mask1d(features_len.to(losses.device),
                                    mask_length=losses.size(1))
            mask = mask / mask.sum()

            if not self.ignore_padding:
                # Weight every frame equally, padding included: plain sums.
                mask[:] = 1

            acc = (accs * mask).sum()
            loss = (losses * mask).sum()

            if logger.is_currently_logging():
                logger.log_mpl_figure(
                    "framewise_debug_" + pred_name,
                    self.plot(features, F.softmax(pred.detach(), dim=-1)))
            total_loss = total_loss + loss
            details.update({
                "loss_" + pred_name: loss,
                "acc_" + pred_name: acc,
                "out_seq_" + pred_name: pred_labels.detach()
            })
        return total_loss, details
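
The alignment step shared by Examples #4 and #5 is just nearest-neighbour repetition along time; seen in isolation, with illustrative shapes:

    import torch

    # A prediction made every 4 frames is repeated back to frame rate before
    # the framewise cross-entropy, mirroring the repeat_interleave calls above.
    pred = torch.randn(2, 10, 1, 7)          # (batch, T // 4, 1, n_classes)
    pred = pred.repeat_interleave(4, dim=1)  # (batch, 40, 1, n_classes)
    print(pred.shape)                        # torch.Size([2, 40, 1, 7])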