Example #1
    def __call__(self, batch):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor)

        """
        # batch should be located in list
        xs, ys = batch
        ys = list(ys)
        if len(xs) != len(ys):
            # input and target utterance counts disagree; log the first input
            print("error: utterance count mismatch between xs and ys")
            print(xs[0])

        # perform subsampling
        if self.subsampling_factor > 1:
            xs = [x[::self.subsampling_factor, :] for x in xs]

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs])
        ilens = torch.from_numpy(ilens).to(self.device)
        xs_pad = pad_list([torch.from_numpy(x).float() for x in xs],
                          0).to(self.device, dtype=self.dtype)

        ys_pad = pad_list([torch.from_numpy(y[2]) for y in ys],
                          self.ignore_id).long().to(self.device)

        return xs_pad, ilens, ys_pad
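
All of these batch converters lean on a shared pad_list helper. As a point of reference, here is a minimal sketch consistent with how the examples call it; ESPnet ships its own in espnet.nets.pytorch_backend.nets_utils, so this reimplementation only illustrates the contract:

import torch

def pad_list(xs, pad_value):
    """Pad a list of (T_i, ...) tensors into a single (B, Tmax, ...) tensor."""
    n_batch = len(xs)
    max_len = max(x.size(0) for x in xs)
    # allocate on the same device/dtype as the inputs, pre-filled with pad_value
    pad = xs[0].new_full((n_batch, max_len) + tuple(xs[0].size()[1:]), pad_value)
    for i, x in enumerate(xs):
        pad[i, :x.size(0)] = x
    return pad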
Example #2
    def __call__(self, batch, device):
        """Transforms a batch and send it to a device

        :param list batch: The batch to transform
        :param torch.device device: The device to send to
        :return: a tuple xs_pad, ilens, ys_pad
        :rtype: (torch.Tensor, torch.Tensor, torch.Tensor)
        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys = batch[0]

        # perform subsampling
        if self.subsampling_factor > 1:
            xs = [x[::self.subsampling_factor, :] for x in xs]

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs])

        # perform padding and convert to tensor
        xs_pad = pad_list([torch.from_numpy(x).float() for x in xs],
                          0).to(device)
        ilens = torch.from_numpy(ilens).to(device)
        ys_pad = pad_list([torch.from_numpy(y).long() for y in ys],
                          self.ignore_id).to(device)

        return xs_pad, ilens, ys_pad
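
A hypothetical standalone run of the subsample/pad/length pattern above, reusing the pad_list sketch from Example #1 (array sizes invented for illustration):

import numpy as np
import torch

xs = [np.random.randn(6, 4).astype(np.float32),
      np.random.randn(3, 4).astype(np.float32)]
xs = [x[::2, :] for x in xs]                    # subsampling_factor = 2
ilens = torch.from_numpy(np.array([x.shape[0] for x in xs]))
xs_pad = pad_list([torch.from_numpy(x) for x in xs], 0)
print(xs_pad.shape, ilens)                      # torch.Size([2, 3, 4]) tensor([3, 2])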
Example #3
    def __call__(self, batch, device):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(list(torch.Tensor), list(torch.Tensor), torch.Tensor)

        """
        # batch should be located in list
        assert len(batch) == 1
        xs_list = batch[0][:self.num_encs]
        ys = batch[0][-1]

        # perform subsampling
        if np.sum(self.subsampling_factors) > self.num_encs:
            xs_list = [[x[::self.subsampling_factors[i], :] for x in xs_list[i]] for i in range(self.num_encs)]

        # get batch of lengths of input sequences
        ilens_list = [np.array([x.shape[0] for x in xs_list[i]]) for i in range(self.num_encs)]

        # perform padding and convert to tensor
        # currently only support real number
        xs_list_pad = [pad_list([torch.from_numpy(x).float() for x in xs_list[i]], 0).to(device, dtype=self.dtype) for i
                       in range(self.num_encs)]

        ilens_list = [torch.from_numpy(ilens_list[i]).to(device) for i in range(self.num_encs)]
        # NOTE: this is for multi-task learning (e.g., speech translation)
        ys_pad = pad_list([torch.from_numpy(np.array(y[0]) if isinstance(y, tuple) else y).long()
                           for y in ys], self.ignore_id).to(device)

        return xs_list_pad, ilens_list, ys_pad
Example #4
    def __call__(self, batch, device=torch.device("cpu")):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor)

        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys = batch[0]

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs])

        # perform padding and convert to tensor
        xs_pad = pad_list([torch.from_numpy(x).long() for x in xs],
                          self.pad).to(device)
        ilens = torch.from_numpy(ilens).to(device)
        ys_pad = pad_list([torch.from_numpy(y).long() for y in ys],
                          self.ignore_id).to(device)

        return xs_pad, ilens, ys_pad
Example #5
    def __call__(self, batch):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor):
                For the "st" task, ys_pad_asr is returned as a fourth tensor.

        """
        # batch should be located in list
        xs, ys = batch
        ys = list(ys)

        # perform subsampling
        if self.subsampling_factor > 1:
            xs = [x[::self.subsampling_factor, :] for x in xs]

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs])

        # perform padding and convert to tensor
        # currently only support real number
        if xs[0].dtype.kind == 'c':
            xs_pad_real = pad_list(
                [torch.from_numpy(x.real).float() for x in xs],
                0).to(self.dtype).cuda(self.device, non_blocking=True)
            xs_pad_imag = pad_list(
                [torch.from_numpy(x.imag).float() for x in xs],
                0).to(self.dtype).cuda(self.device, non_blocking=True)
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it E2E here
            # because torch.nn.DataParallel can't handle it.
            xs_pad = {'real': xs_pad_real, 'imag': xs_pad_imag}
        else:
            xs_pad = pad_list([torch.from_numpy(x).float() for x in xs],
                              0).to(self.dtype).cuda(self.device,
                                                     non_blocking=True)

        ilens = torch.from_numpy(ilens).cuda(self.device, non_blocking=True)
        # NOTE: this is for multi-task learning (e.g., speech translation)
        ys_pad = pad_list([
            torch.from_numpy(
                np.array(y[0]) if isinstance(y, tuple) else y).long()
            for y in ys
        ], self.ignore_id).cuda(self.device, non_blocking=True)
        if self.task == "asr":
            return xs_pad, ilens, ys_pad
        elif self.task == "st":
            ys_pad_asr = pad_list(
                [torch.from_numpy(np.array(y[1])).long() for y in ys],
                0).cuda(self.device, non_blocking=True)
            return xs_pad, ilens, ys_pad, ys_pad_asr
        else:
            raise ValueError("Only 'asr' and 'st' tasks are supported")
Example #6
    def __call__(self, batch, device=torch.device("cpu")):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor)

        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys = batch[0]

        # perform subsampling
        if self.subsampling_factor > 1:
            xs = [x[:: self.subsampling_factor, :] for x in xs]

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs])

        # perform padding and convert to tensor
        # currently only support real number
        if xs[0].dtype.kind == "c":
            xs_pad_real = pad_list(
                [torch.from_numpy(x.real).float() for x in xs], 0
            ).to(device, dtype=self.dtype)
            xs_pad_imag = pad_list(
                [torch.from_numpy(x.imag).float() for x in xs], 0
            ).to(device, dtype=self.dtype)
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it E2E here
            # because torch.nn.DataParallel can't handle it.
            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
        else:
            xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0).to(
                device, dtype=self.dtype
            )

        ilens = torch.from_numpy(ilens).to(device)
        # NOTE: this is for multi-output (e.g., speech translation)
        ys_pad = pad_list(
            [
                torch.from_numpy(
                    np.array(y[0][:]) if isinstance(y, tuple) else y
                ).long()
                for y in ys
            ],
            self.ignore_id,
        ).to(device)

        return xs_pad, ilens, ys_pad
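
Downstream, the {'real': ..., 'imag': ...} dict is recombined into a single complex tensor inside the E2E model, per the note above. A sketch of what that conversion presumably looks like, assuming the torch_complex package:

from torch_complex.tensor import ComplexTensor

def to_complex(xs_pad):
    # Rebuild a complex tensor from the dict produced by the converter;
    # pass real tensors through unchanged.
    if isinstance(xs_pad, dict):
        return ComplexTensor(xs_pad["real"], xs_pad["imag"])
    return xs_pad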
Example #7
    def __call__(self, batch, device, evaluation=False):
        """Transforms a batch and send it to a device

        :param list batch: The batch to transform
        :param torch.device device: The device to send to
        :return: a tuple xs_pad, ilens, ys_pad
        :rtype (torch.Tensor, torch.Tensor, torch.Tensor)
        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys = batch[0]

        # perform subsampling
        if self.subsampling_factor > 1:
            xs = [x[::self.subsampling_factor, :] for x in xs]

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs])

        # perform padding and convert to tensor
        # currently only support real number
        if xs[0].dtype.kind == 'c':
            xs_pad_real = pad_list(
                [torch.from_numpy(x.real).float() for x in xs], 0).to(device)
            xs_pad_imag = pad_list(
                [torch.from_numpy(x.imag).float() for x in xs], 0).to(device)
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it E2E here
            # because torch.nn.DataParallel can't handle it.
            xs_pad = {'real': xs_pad_real, 'imag': xs_pad_imag}
        else:
            if self.use_specaug and not evaluation:
                xs_pad = pad_list(
                    [specaug(torch.from_numpy(x).float()) for x in xs],
                    0).to(device)
            else:
                xs_pad = pad_list([torch.from_numpy(x).float() for x in xs],
                                  0).to(device)

        ilens = torch.from_numpy(ilens).to(device)
        # NOTE: this is for multi-task learning (e.g., speech translation)
        ys_pad = pad_list([
            torch.from_numpy(
                np.array(y[0]) if isinstance(y, tuple) else y).long()
            for y in ys
        ], self.ignore_id).to(device)

        return xs_pad, ilens, ys_pad
Example #8
    def __call__(self, batch, device):
        """Transform a batch and send it to a device.

        Args:
            batch (list(tuple(str, dict[str, dict[str, Any]]))): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor): Transformed batch.

        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys = batch[0]
        # Convert zip object to list in python 3.x
        ys = list(ys)

        # perform subsampling
        if self.subsampling_factor > 1:
            xs = [x[::self.subsampling_factor, :] for x in xs]

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs])

        # perform padding and convert to tensor
        # currently only support real number
        if xs[0].dtype.kind == 'c':
            xs_pad_real = pad_list(
                [torch.from_numpy(x.real).float() for x in xs], 0).to(device, dtype=self.dtype)
            xs_pad_imag = pad_list(
                [torch.from_numpy(x.imag).float() for x in xs], 0).to(device, dtype=self.dtype)
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it to E2E here
            # because torch.nn.DataParallel can't handle it.
            xs_pad = {'real': xs_pad_real, 'imag': xs_pad_imag}
        else:
            xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0).to(device, dtype=self.dtype)

        ilens = torch.from_numpy(ilens).to(device)
        # TODO(Xuankai): try to make this neat
        if not isinstance(ys[0], np.ndarray):
            ys_pad = [torch.from_numpy(y[0]).long() for y in ys] + [torch.from_numpy(y[1]).long() for y in ys]
            ys_pad = pad_list(ys_pad, self.ignore_id)
            ys_pad = ys_pad.view(2, -1, ys_pad.size(1)).transpose(0, 1).to(device)  # (B, num_spkrs, Tmax)
        else:
            ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], self.ignore_id).to(device)

        return xs_pad, ilens, ys_pad
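
A quick shape walk-through of the two-speaker branch above, with invented sizes:

import torch

B, Tmax = 3, 4
ys_pad = torch.arange(2 * B * Tmax).view(2 * B, Tmax)   # [spk1 utts; spk2 utts]
ys_pad = ys_pad.view(2, -1, ys_pad.size(1)).transpose(0, 1)
print(ys_pad.shape)                                     # torch.Size([3, 2, 4]) -> (B, num_spkrs, Tmax)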
Example #9
    def __call__(self, batch, device=torch.device('cpu')):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor or None)

        """
        _, ys = batch[0]
        ys_asr = copy.deepcopy(ys)
        xs_pad, ilens, ys_pad = super().__call__(batch, device)
        if self.asr_task:
            ys_pad_asr = pad_list([
                torch.from_numpy(
                    np.insert(y[1][1], 0, y[1][0]) if isinstance(y[1], tuple)
                    else np.array(y[1][:]) if isinstance(y, tuple)
                    else y).long()
                for y in ys_asr
            ], self.ignore_id).to(device)
        else:
            ys_pad_asr = None

        return xs_pad, ilens, ys_pad, ys_pad_asr
Example #10
    def forward(self, xs, labels=None):
        ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)
        xs = [to_device(self.slu, to_torch_tensor(xx).float()) for xx in xs]
        xs_pad = pad_list(xs, 0.0)
        embeddings = self.slu(xs_pad, ilens, None)
        outputs = self.classifier(embeddings, labels)
        return outputs
Example #11
def test_train_acc():
    n_out = 7
    _eos = n_out - 1
    n_batch = 3
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    np_pred = numpy.random.rand(n_batch,
                                max(label_length) + 1,
                                n_out).astype(numpy.float32)
    # NOTE: 0 is only used for CTC, never appeared in attn target
    np_target = [
        numpy.random.randint(1, n_out - 1, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]

    eos = numpy.array([_eos], 'i')
    ys_out = [F.concat([y, eos], axis=0) for y in np_target]

    # padding for ys with -1
    # pys: utt x olen
    # NOTE: -1 is default ignore index for chainer
    pad_ys_out = F.pad_sequence(ys_out, padding=-1)
    y_all = F.reshape(np_pred, (n_batch * (max(label_length) + 1), n_out))
    ch_acc = F.accuracy(y_all, F.concat(pad_ys_out, axis=0), ignore_label=-1)

    # NOTE: this index 0 is only for CTC not attn. so it can be ignored
    # unfortunately, torch cross_entropy does not accept out-of-bound ids
    th_ignore = 0
    th_pred = torch.from_numpy(y_all.data)
    th_ys = [torch.from_numpy(numpy.append(t, eos)).long() for t in np_target]
    th_target = pad_list(th_ys, th_ignore)
    th_acc = th_accuracy(th_pred, th_target, th_ignore)

    numpy.testing.assert_allclose(ch_acc.data, th_acc)
Example #12
    def __call__(self, batch, device):
        # batch should be located in list
        assert len(batch) == 1
        inputs_and_targets = batch[0]

        # parse inputs and targets
        xs, ys, spembs, spcs = inputs_and_targets

        # get list of lengths (must be tensor for DataParallel)
        ilens = torch.from_numpy(np.array([x.shape[0]
                                           for x in xs])).long().to(device)
        olens = torch.from_numpy(np.array([y.shape[0]
                                           for y in ys])).long().to(device)

        # perform padding and conversion to tensor
        xs = pad_list([torch.from_numpy(x).long() for x in xs], 0).to(device)
        ys = pad_list([torch.from_numpy(y).float() for y in ys], 0).to(device)

        # make labels for stop prediction
        labels = ys.new_zeros(ys.size(0), ys.size(1))
        for i, l in enumerate(olens):
            labels[i, l - 1:] = 1.0

        # prepare dict
        new_batch = {
            "xs": xs,
            "ilens": ilens,
            "ys": ys,
            "labels": labels,
            "olens": olens,
        }

        # load second target
        if spcs is not None:
            spcs = pad_list([torch.from_numpy(spc).float() for spc in spcs],
                            0).to(device)
            new_batch["spcs"] = spcs

        # load speaker embedding
        if spembs is not None:
            spembs = torch.from_numpy(np.array(spembs)).float().to(device)
            new_batch["spembs"] = spembs

        return new_batch
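
The stop-label loop marks every frame from the final real output frame onward with 1.0, which is what the stop-token predictor trains against. A tiny worked example with invented sizes:

import torch

ys = torch.zeros(2, 5, 80)      # (B, Tmax, odim) padded targets
olens = torch.tensor([5, 3])    # true output lengths
labels = ys.new_zeros(ys.size(0), ys.size(1))
for i, l in enumerate(olens):
    labels[i, l - 1:] = 1.0
print(labels)
# tensor([[0., 0., 0., 0., 1.],
#         [0., 0., 1., 1., 1.]])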
Example #13
    def loss_fn_asr(self, best_x):
        loss_nll = torch.nn.NLLLoss()
        asr_loss = []
        ys = [torch.tensor(y['yseq']).long() for x in best_x for y in x]
        ys_asr = pad_list([y for y in ys], -1).to(self.device)
        batch = int(ys_asr.size(0) / self.nbest)
        ys_asr = ys_asr.view(batch, self.nbest, -1)
        char_scores = torch.stack(best_x[0][0]['char_score'])
        score = char_scores.mean(0).view(-1)
        return score
Example #14
    def __call__(self, batch, device=torch.device("cpu")):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor or None)

        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys, ys_src = batch[0]

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs])
        ilens = torch.from_numpy(ilens).to(device)

        xs_pad = pad_list([torch.from_numpy(x).float() for x in xs],
                          0).to(device, dtype=self.dtype)

        ys_pad = pad_list(
            [torch.from_numpy(np.array(y, dtype=np.int64)) for y in ys],
            self.ignore_id,
        ).to(device)

        if self.use_source_text:
            ys_pad_src = pad_list(
                [
                    torch.from_numpy(np.array(y, dtype=np.int64))
                    for y in ys_src
                ],
                self.ignore_id,
            ).to(device)
        else:
            ys_pad_src = None

        return xs_pad, ilens, ys_pad, ys_pad_src
Example #15
def test_attn_loss():
    n_out = 7
    _eos = n_out - 1
    n_batch = 3
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    np_pred = numpy.random.rand(n_batch,
                                max(label_length) + 1,
                                n_out).astype(numpy.float32)
    # NOTE: 0 is only used for CTC, never appeared in attn target
    np_target = [
        numpy.random.randint(1, n_out - 1, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]

    eos = numpy.array([_eos], 'i')
    ys_out = [F.concat([y, eos], axis=0) for y in np_target]

    # padding for ys with -1
    # pys: utt x olen
    # NOTE: -1 is default ignore index for chainer
    pad_ys_out = F.pad_sequence(ys_out, padding=-1)
    y_all = F.reshape(np_pred, (n_batch * (max(label_length) + 1), n_out))
    ch_loss = F.softmax_cross_entropy(y_all, F.concat(pad_ys_out, axis=0))

    # NOTE: this index 0 is only for CTC not attn. so it can be ignored
    # unfortunately, torch cross_entropy does not accept out-of-bound ids
    th_ignore = 0
    th_pred = torch.from_numpy(y_all.data)
    th_target = pad_list([torch.from_numpy(t.data).long() for t in ys_out],
                         th_ignore)
    if LooseVersion(torch.__version__) < LooseVersion('1.0'):
        reduction_str = 'elementwise_mean'
    else:
        reduction_str = 'mean'
    th_loss = torch.nn.functional.cross_entropy(th_pred,
                                                th_target.view(-1),
                                                ignore_index=th_ignore,
                                                reduction=reduction_str)
    print(ch_loss)
    print(th_loss)

    # NOTE: warpctc_pytorch.CTCLoss does not normalize itself by batch-size
    # while chainer's default setting does
    loss_data = float(th_loss)
    numpy.testing.assert_allclose(loss_data, ch_loss.data, 0.05)
Example #16
def test_ctc_loss(in_length, out_length, use_warpctc):
    pytest.importorskip("torch")
    if use_warpctc:
        pytest.importorskip("warpctc_pytorch")
        import warpctc_pytorch

        torch_ctcloss = warpctc_pytorch.CTCLoss(size_average=True)
    else:
        if LooseVersion(torch.__version__) < LooseVersion("1.0"):
            pytest.skip("pytorch < 1.0 doesn't support CTCLoss")
        _ctcloss_sum = torch.nn.CTCLoss(reduction="sum")

        def torch_ctcloss(th_pred, th_target, th_ilen, th_olen):
            th_pred = th_pred.log_softmax(2)
            loss = _ctcloss_sum(th_pred, th_target, th_ilen, th_olen)
            # Batch-size average
            loss = loss / th_pred.size(1)
            return loss

    n_out = 7
    input_length = numpy.array(in_length, dtype=numpy.int32)
    label_length = numpy.array(out_length, dtype=numpy.int32)
    np_pred = [
        numpy.random.rand(il, n_out).astype(numpy.float32)
        for il in input_length
    ]
    np_target = [
        numpy.random.randint(0, n_out, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]

    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(ch_pred, ch_target, 0,
                                                      input_length,
                                                      label_length).data

    th_pred = pad_list([torch.from_numpy(x) for x in np_pred],
                       0.0).transpose(0, 1)
    th_target = torch.from_numpy(numpy.concatenate(np_target))
    th_ilen = torch.from_numpy(input_length)
    th_olen = torch.from_numpy(label_length)
    th_loss = torch_ctcloss(th_pred, th_target, th_ilen, th_olen).numpy()
    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
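
The transpose after pad_list is there because torch.nn.CTCLoss consumes predictions shaped (Tmax, B, n_out). A minimal shape check, reusing the pad_list sketch from Example #1 (sizes invented):

import numpy
import torch

np_pred = [numpy.random.rand(il, 7).astype(numpy.float32) for il in (11, 17)]
th_pred = pad_list([torch.from_numpy(x) for x in np_pred], 0.0).transpose(0, 1)
print(th_pred.shape)            # torch.Size([17, 2, 7]) -> (Tmax, B, n_out)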
Example #17
    def __call__(self, batch, device):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor or None)

        """
        _, ys = batch[0]
        xs_pad, ilens, ys_pad = super().__call__(batch, device)
        if self.asr_task:
            ys_pad_asr = pad_list(
                [torch.from_numpy(np.array(y[1])).long() for y in ys],
                self.ignore_id).to(device)
        else:
            ys_pad_asr = None

        return xs_pad, ilens, ys_pad, ys_pad_asr
Example #18
    def random_sampler(self, hyps, xlens, xs, spembs):
        # convert hyps to xs, xlens to ylens, ys to xs
        # separate yseq from dictionary of nbest_hyps
        ys = hyps
        # for i, y_hat in enumerate(ys):
        #     seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
        #     seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
        #     logging.info("prediction[%d]: " % i + seq_hat_text)
        xlens_tts = torch.from_numpy(np.array([y.shape[0] for y in ys])).long().to(self.device)
        xlens_tts = sorted(xlens_tts, reverse=True)
        xs_tts = pad_list([y.long() for y in ys], 0).to(self.device)
        xs, xlens = mask_by_length_and_multiply(xs, xlens, 0, self.nbest)
        onelens = np.fromiter((1 for xx in spembs), dtype=np.int64)
        spembs, _ = mask_by_length_and_multiply(spembs.unsqueeze(1), torch.tensor(onelens), 0, self.nbest)
        spembs = spembs.squeeze(1)
        ylens_tts = torch.Tensor([torch.max(xlens) for _ in range(len(xlens))]).type(xlens.dtype)
        ys_tts = xs
        labels = ys_tts.new_zeros(ys_tts.size(0), ys_tts.size(1))
        for i, l in enumerate(ylens_tts):
            labels[i, l - 1:] = 1.0

        return xs_tts, xlens_tts, ys_tts, labels, ylens_tts, spembs
Example #19
    def asr_to_tts(self, hyps, xlens, xs):
        # convert hyps to xs, xlens to ylens, ys to xs
        # separate yseq from dictionary of nbest_hyps
        ys = [torch.tensor(y['yseq']) for x in hyps for y in x]
        for i, y_hat in enumerate(ys):
            seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
            seq_hat_text = "".join(seq_hat).replace('<space>', ' ')
            logging.info("prediction[%d]: " % i + seq_hat_text)

        xlens_tts = torch.from_numpy(np.array([y.shape[0] for y in ys])).long().to(self.device)
        xlens_tts = sorted(xlens_tts, reverse=True)
        xs_tts = pad_list([y.long() for y in ys], 0).to(self.device)
        reduced_best = len(hyps[0])
        logging.info("nbest is %d", reduced_best)
        xs, xlens = mask_by_length_and_multiply(xs, xlens, 0, reduced_best)
        ylens_tts = xlens
        ys_tts = xs
        labels = ys_tts.new_zeros(ys_tts.size(0), ys_tts.size(1))
        for i, l in enumerate(ylens_tts):
            labels[i, l - 1:] = 1.0

        return xs_tts, xlens_tts, ys_tts, labels, ylens_tts
Example #20
    def __call__(self, batch, device=torch.device("cpu")):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor or None)

        """
        _, ys = batch[0]
        ys_asr = copy.deepcopy(ys)
        xs_pad, ilens, ys_pad = super().__call__(batch, device)
        if self.use_source_text:
            ys_pad_asr = pad_list(
                [torch.from_numpy(np.array(y[1])).long() for y in ys_asr],
                self.ignore_id,
            ).to(device)
        else:
            ys_pad_asr = None

        return xs_pad, ilens, ys_pad, ys_pad_asr
Example #21
def test_ctc_loss(in_length, out_length, ctc_type):
    pytest.importorskip("torch")
    if ctc_type == "warpctc":
        pytest.importorskip("warpctc_pytorch")
        import warpctc_pytorch

        torch_ctcloss = warpctc_pytorch.CTCLoss(size_average=True)
    elif ctc_type == "builtin" or ctc_type == "cudnnctc":
        if LooseVersion(torch.__version__) < LooseVersion("1.0"):
            pytest.skip("pytorch < 1.0 doesn't support CTCLoss")
        _ctcloss_sum = torch.nn.CTCLoss(reduction="sum")

        def torch_ctcloss(th_pred, th_target, th_ilen, th_olen):
            th_pred = th_pred.log_softmax(2)
            loss = _ctcloss_sum(th_pred, th_target, th_ilen, th_olen)
            # Batch-size average
            loss = loss / th_pred.size(1)
            return loss

    elif ctc_type == "gtnctc":
        pytest.importorskip("gtn")
        from espnet.nets.pytorch_backend.gtn_ctc import GTNCTCLossFunction

        _ctcloss_sum = GTNCTCLossFunction.apply

        def torch_ctcloss(th_pred, th_target, th_ilen, th_olen):
            targets = [t.tolist() for t in th_target]
            log_probs = torch.nn.functional.log_softmax(th_pred, dim=2)
            loss = _ctcloss_sum(log_probs, targets, th_ilen, 0, "none")
            return loss

    n_out = 7
    input_length = numpy.array(in_length, dtype=numpy.int32)
    label_length = numpy.array(out_length, dtype=numpy.int32)
    np_pred = [
        numpy.random.rand(il, n_out).astype(numpy.float32)
        for il in input_length
    ]
    np_target = [
        numpy.random.randint(0, n_out, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]

    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(ch_pred, ch_target, 0,
                                                      input_length,
                                                      label_length).data

    th_pred = pad_list([torch.from_numpy(x) for x in np_pred],
                       0.0).transpose(0, 1)
    if ctc_type == "gtnctc":
        # gtn implementation expects targets as list
        th_target = np_target
        # keep as B x T x H for gtn
        th_pred = th_pred.transpose(0, 1)
    else:
        th_target = torch.from_numpy(numpy.concatenate(np_target))
    th_ilen = torch.from_numpy(input_length)
    th_olen = torch.from_numpy(label_length)
    th_loss = torch_ctcloss(th_pred, th_target, th_ilen, th_olen).numpy()

    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
Example #22
    def __call__(self, batch, device=torch.device("cpu")):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor)

        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys = batch[0]

        # perform subsampling
        if self.subsampling_factor > 1:
            xs = [x[::self.subsampling_factor, :] for x in xs]

        if len(xs) == 2:
            logging.info("input and target are different form by transform")
            xs_in = xs[0]
            xs_out = xs[1]
        else:
            xs_in = xs
            xs_out = xs

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs_in])
        if self.tnum > 0:
            xs_pad_in = pad_list(
                [torch.from_numpy(x[:-self.tnum]).float() for x in xs_in],
                0).to(device, dtype=self.dtype)

            xs_pad_out = pad_list([
                torch.stack([
                    torch.from_numpy(x[i + 1:-self.tnum + i + 1]).float() if
                    (-self.tnum + i + 1) != 0 else torch.from_numpy(
                        x[i + 1:]).float() for i in range(self.tnum)
                ],
                            dim=1) for x in xs_out
            ], 0).to(device, dtype=self.dtype)
        else:
            xs_pad_in = pad_list([torch.from_numpy(x).float() for x in xs_in],
                                 0).to(device, dtype=self.dtype)
            xs_pad_out = pad_list(
                [torch.from_numpy(x).float().unsqueeze(1) for x in xs_out],
                0).to(device, dtype=self.dtype)

        ilens = torch.from_numpy(ilens).to(device) - self.tnum
        # NOTE: this is for multi-output (e.g., speech translation)
        ys_pad = pad_list(
            [
                torch.from_numpy(
                    np.array(y[0][:]) if isinstance(y, tuple) else y).long()
                for y in ys
            ],
            self.ignore_id,
        ).to(device)

        return xs_pad_in, xs_pad_out, ilens, ys_pad
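
The tnum branch pairs each input frame with its next tnum frames as stacked prediction targets. A small walk-through with invented sizes:

import numpy as np
import torch

tnum, T, D = 2, 5, 3
x = np.arange(T * D, dtype=np.float32).reshape(T, D)
x_in = torch.from_numpy(x[:-tnum])                 # frames 0 .. T-tnum-1
x_out = torch.stack(
    [torch.from_numpy(x[i + 1:-tnum + i + 1]) if (-tnum + i + 1) != 0
     else torch.from_numpy(x[i + 1:]) for i in range(tnum)],
    dim=1)
print(x_in.shape, x_out.shape)   # torch.Size([3, 3]) torch.Size([3, 2, 3])
# x_out[t] stacks x[t+1] and x[t+2]: the tnum future frames for frame t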
Example #23
def recog(args):
    """Decode with the given args.

    Args:
        args (namespace): The program arguments.
    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args
    model.eval()

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), rnnlm_args.layer,
                             rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), rnnlm_args.layer,
                             rnnlm_args.unit))
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor, word_dict,
                                              char_dict))

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']
    new_js = {}

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr',
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={'train': False})

    if args.batchsize == 0:
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                logging.info('(%d/%d) decoding ' + name, idx, len(js.keys()))
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)[0][0]
                if args.streaming_mode == 'window':
                    logging.info(
                        'Using streaming recognizer with window size %d frames',
                        args.streaming_window)
                    se2e = WindowStreamingE2E(e2e=model,
                                              recog_args=args,
                                              rnnlm=rnnlm)
                    for i in range(0, feat.shape[0], args.streaming_window):
                        logging.info('Feeding frames %d - %d', i,
                                     i + args.streaming_window)
                        se2e.accept_input(feat[i:i + args.streaming_window])
                    logging.info('Running offline attention decoder')
                    se2e.decode_with_attention_offline()
                    logging.info('Offline attention decoder finished')
                    nbest_hyps = se2e.retrieve_recognition()
                elif args.streaming_mode == 'segment':
                    logging.info(
                        'Using streaming recognizer with threshold value %d',
                        args.streaming_min_blank_dur)
                    nbest_hyps = []
                    for n in range(args.nbest):
                        nbest_hyps.append({'yseq': [], 'score': 0.0})
                    se2e = SegmentStreamingE2E(e2e=model,
                                               recog_args=args,
                                               rnnlm=rnnlm)
                    r = np.prod(model.subsample)
                    for i in range(0, feat.shape[0], r):
                        hyps = se2e.accept_input(feat[i:i + r])
                        if hyps is not None:
                            text = ''.join([
                                train_args.char_list[int(x)]
                                for x in hyps[0]['yseq'][1:-1] if int(x) != -1
                            ])
                            text = text.replace(
                                '\u2581', ' ').strip()  # for SentencePiece
                            text = text.replace(model.space, ' ')
                            text = text.replace(model.blank, '')
                            logging.info(text)
                            for n in range(args.nbest):
                                nbest_hyps[n]['yseq'].extend(hyps[n]['yseq'])
                                nbest_hyps[n]['score'] += hyps[n]['score']
                else:
                    nbest_hyps = model.recognize(feat, args,
                                                 train_args.char_list, rnnlm)
                new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                                   train_args.char_list)

    else:

        def grouper(n, iterable, fillvalue=None):
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data
        keys = list(js.keys())
        feat_lens = [js[key]['input'][0]['shape'][0] for key in keys]
        sorted_index = sorted(range(len(feat_lens)),
                              key=lambda i: -feat_lens[i])
        keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = load_inputs_and_targets(batch)[0]
                if train_args.slu_model:
                    xs = feats
                    ilens = np.fromiter((xx.shape[0] for xx in xs),
                                        dtype=np.int64)
                    xs = [
                        to_device(model,
                                  to_torch_tensor(xx).float()) for xx in xs
                    ]
                    xs_pad = pad_list(xs, 0.0)
                    embeddings = model(xs_pad, ilens, None).cpu().numpy()
                    for i in range(len(batch)):
                        new_js[batch[i][0]] = embeddings[i].tolist()
                else:
                    nbest_hyps = model.recognize_batch(feats,
                                                       args,
                                                       train_args.char_list,
                                                       rnnlm=rnnlm)

                    for i, nbest_hyp in enumerate(nbest_hyps):
                        name = names[i]
                        new_js[name] = add_results_to_json(
                            js[name], nbest_hyp, train_args.char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            },
                       indent=4,
                       ensure_ascii=False,
                       sort_keys=True).encode('utf_8'))