def prepare_inputs(bs, idim, odim, maxin_len, maxout_len,
                   spk_embed_dim=None, spc_dim=None, device=torch.device('cpu')):
    ilens = np.sort(np.random.randint(1, maxin_len, bs))[::-1].tolist()
    olens = np.sort(np.random.randint(1, maxout_len, bs))[::-1].tolist()
    xs = [np.random.randint(0, idim, l) for l in ilens]
    ys = [np.random.randn(l, odim) for l in olens]
    ilens = torch.LongTensor(ilens).to(device)
    olens = torch.LongTensor(olens).to(device)
    xs = pad_list([torch.from_numpy(x).long() for x in xs], 0).to(device)
    ys = pad_list([torch.from_numpy(y).float() for y in ys], 0).to(device)
    labels = ys.new_zeros(ys.size(0), ys.size(1))
    for i, l in enumerate(olens):
        labels[i, l - 1:] = 1

    batch = {
        "xs": xs,
        "ilens": ilens,
        "ys": ys,
        "labels": labels,
        "olens": olens,
    }

    if spk_embed_dim is not None:
        spembs = torch.from_numpy(np.random.randn(bs, spk_embed_dim)).float().to(device)
        batch["spembs"] = spembs
    if spc_dim is not None:
        spcs = [np.random.randn(l, spc_dim) for l in olens]
        spcs = pad_list([torch.from_numpy(spc).float() for spc in spcs], 0).to(device)
        batch["spcs"] = spcs

    return batch
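A quick smoke test for the helper above; a minimal sketch that assumes numpy, torch, and espnet's pad_list are importable in the surrounding module:

# Hypothetical usage sketch; the imports are assumptions about the module.
import numpy as np
import torch
from espnet.nets.pytorch_backend.nets_utils import pad_list

batch = prepare_inputs(bs=2, idim=10, odim=5, maxin_len=8, maxout_len=6)
assert batch["xs"].size(0) == 2                         # batch size
assert batch["ys"].size(2) == 5                         # odim feature dimension
assert batch["labels"].shape == batch["ys"].shape[:2]   # one stop label per frame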
Example #2
    def calculate_all_attentions(self, hs_pad, hlen, ys_pad, strm_idx=0):
        """Calculate all of attentions

        :param torch.Tensor hs_pad: batch of padded hidden state sequences (B, Tmax, D)
        :param torch.Tensor hlen: batch of lengths of hidden state sequences (B)
        :param torch.Tensor ys_pad: batch of padded character id sequence tensor (B, Lmax)
        :param int strm_idx: stream index for parallel speaker attention in multi-speaker case
        :return: attention weights with the following shape,
            1) multi-head case => attention weights (B, H, Lmax, Tmax),
            2) other case => attention weights (B, Lmax, Tmax).
        :rtype: float ndarray
        """
        # TODO(kan-bayashi): find a smarter way to do this
        ys = [y[y != self.ignore_id] for y in ys_pad]  # parse padded ys
        att_idx = min(strm_idx, len(self.att) - 1)

        # hlen should be a list of integers
        hlen = list(map(int, hlen))

        self.loss = None
        # prepare input and output word sequences with sos/eos IDs
        eos = ys[0].new([self.eos])
        sos = ys[0].new([self.sos])
        ys_in = [torch.cat([sos, y], dim=0) for y in ys]
        ys_out = [torch.cat([y, eos], dim=0) for y in ys]

        # pad ys_in with <eos> and ys_out with -1 (ignore_id)
        # padded ys: utt x olen
        ys_in_pad = pad_list(ys_in, self.eos)
        ys_out_pad = pad_list(ys_out, self.ignore_id)

        # get length info
        olength = ys_out_pad.size(1)

        # initialization
        c_list = [self.zero_state(hs_pad)]
        z_list = [self.zero_state(hs_pad)]
        for _ in six.moves.range(1, self.dlayers):
            c_list.append(self.zero_state(hs_pad))
            z_list.append(self.zero_state(hs_pad))
        att_w = None
        att_ws = []
        self.att[att_idx].reset()  # reset pre-computation of h

        # pre-computation of embedding
        eys = self.dropout_emb(self.embed(ys_in_pad))  # utt x olen x zdim

        # loop for an output sequence
        for i in six.moves.range(olength):
            att_c, att_w = self.att[att_idx](hs_pad, hlen,
                                             self.dropout_dec[0](z_list[0]),
                                             att_w)
            ey = torch.cat((eys[:, i, :], att_c), dim=1)  # utt x (zdim + hdim)
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list,
                                              c_list)
            att_ws.append(att_w)

        # convert to a numpy array with shape (B, Lmax, Tmax),
        # or (B, H, Lmax, Tmax) in the multi-head case
        att_ws = att_to_numpy(att_ws, self.att[att_idx])
        return att_ws
Example #3
def mask_uniform(ys_pad, mask_token, eos, ignore_id):
    """Replace random tokens with <mask> label and add <eos> label.

    The number of <mask> is chosen from a uniform distribution
    between one and the target sequence's length.
    :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
    :param int mask_token: index of <mask>
    :param int eos: index of <eos>
    :param int ignore_id: index of padding
    :return: padded tensor (B, Lmax)
    :rtype: torch.Tensor
    :return: padded tensor (B, Lmax)
    :rtype: torch.Tensor
    """
    from espnet.nets.pytorch_backend.nets_utils import pad_list

    ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
    ys_out = [y.new(y.size()).fill_(ignore_id) for y in ys]
    ys_in = [y.clone() for y in ys]
    for i in range(len(ys)):
        num_samples = numpy.random.randint(1, len(ys[i]) + 1)
        idx = numpy.random.choice(len(ys[i]), num_samples)

        ys_in[i][idx] = mask_token
        ys_out[i][idx] = ys[i][idx]

    return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
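A small, self-contained demonstration of the masking behavior; the token ids are made up, and module-level imports of numpy and torch are assumed for the function above:

# Hypothetical demo of mask_uniform; ids are illustrative only.
import torch

ys_pad = torch.tensor([[3, 5, 7, -1], [2, 4, 6, 8]])  # -1 marks padding
ys_in, ys_out = mask_uniform(ys_pad, mask_token=9, eos=1, ignore_id=-1)
# ys_in: original tokens with a random subset replaced by <mask> (9)
# ys_out: ignore_id everywhere except the masked positions, which keep
#         the original tokens, so a loss is computed only on masked tokens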
Example #4
def convert_batch(batch,
                  backend="pytorch",
                  is_cuda=False,
                  idim=2,
                  odim=2,
                  num_inputs=2):
    ilens_list = [
        np.array([x[1]["input"][idx]["shape"][0] for x in batch])
        for idx in range(num_inputs)
    ]
    olens = np.array([x[1]["output"][0]["shape"][0] for x in batch])
    xs_list = [[
        np.random.randn(ilen, idim).astype(np.float32)
        for ilen in ilens_list[idx]
    ] for idx in range(num_inputs)]
    ys = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens]
    is_pytorch = backend == "pytorch"
    if is_pytorch:
        xs_list = [
            pad_list([torch.from_numpy(x).float() for x in xs_list[idx]], 0)
            for idx in range(num_inputs)
        ]
        ilens_list = [
            torch.from_numpy(ilens_list[idx]).long()
            for idx in range(num_inputs)
        ]
        ys = pad_list([torch.from_numpy(y).long() for y in ys], -1)

        if is_cuda:
            xs_list = [xs_list[idx].cuda() for idx in range(num_inputs)]
            ilens_list = [ilens_list[idx].cuda() for idx in range(num_inputs)]
            ys = ys.cuda()

    return xs_list, ilens_list, ys
Example #5
def prepare_inputs(
    idim, odim, ilens, olens, spk_embed_dim=None, device=torch.device("cpu")
):
    xs = [np.random.randint(0, idim, l) for l in ilens]
    ys = [np.random.randn(l, odim) for l in olens]
    ilens = torch.LongTensor(ilens).to(device)
    olens = torch.LongTensor(olens).to(device)
    xs = pad_list([torch.from_numpy(x).long() for x in xs], 0).to(device)
    ys = pad_list([torch.from_numpy(y).float() for y in ys], 0).to(device)
    labels = ys.new_zeros(ys.size(0), ys.size(1))
    for i, l in enumerate(olens):
        labels[i, l - 1 :] = 1
    batch = {
        "xs": xs,
        "ilens": ilens,
        "ys": ys,
        "labels": labels,
        "olens": olens,
    }

    if spk_embed_dim is not None:
        batch["spembs"] = torch.FloatTensor(
            np.random.randn(len(ilens), spk_embed_dim)
        ).to(device)

    return batch
Example #6
    def decoder_forward(self, hs_pad, hlens, ys_pad):
        ys = [y[y != self.ignore_id] for y in ys_pad]

        hlens = list(map(int, hlens))

        blank = ys[0].new([self.blank])

        ys_in = [torch.cat([blank, y], dim=0) for y in ys]
        ys_in_pad = pad_list(ys_in, self.blank)

        olength = ys_in_pad.size(1)

        z_list, c_list = self.zero_state(hs_pad)
        eys = self.dropout_embed(self.embed(ys_in_pad))

        z_all = []
        for i in six.moves.range(olength):
            y, (z_list, c_list) = self.rnn_forward(eys[:, i, :],
                                                   (z_list, c_list))
            z_all.append(y)
        h_dec = torch.stack(z_all, dim=1)

        h_enc = hs_pad.unsqueeze(2)
        h_dec = h_dec.unsqueeze(1)

        z = self.joint(h_enc, h_dec)
        y = pad_list(ys, self.blank).type(torch.int32)

        z_len = to_device(self, torch.IntTensor(hlens))
        y_len = to_device(self, torch.IntTensor([_y.size(0) for _y in ys]))

        return z, y, z_len, y_len
Example #7
def convert_batch(batch, backend="pytorch", is_cuda=False, idim=40, odim=5):
    ilens = np.array([x[1]['input'][0]['shape'][0] for x in batch])
    olens = np.array([x[1]['output'][0]['shape'][0] for x in batch])
    xs = [np.random.randn(ilen, idim).astype(np.float32) for ilen in ilens]
    ys = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens]
    is_pytorch = backend == "pytorch"
    if is_pytorch:
        xs = pad_list([torch.from_numpy(x).float() for x in xs], 0)
        ilens = torch.from_numpy(ilens).long()
        ys = pad_list([torch.from_numpy(y).long() for y in ys], -1)

        if is_cuda:
            xs = xs.cuda()
            ilens = ilens.cuda()
            ys = ys.cuda()
    else:
        if is_cuda:
            xp = importlib.import_module('cupy')
            xs = [chainer.Variable(xp.array(x)) for x in xs]
            ys = [chainer.Variable(xp.array(y)) for y in ys]
            ilens = xp.array(ilens)
        else:
            xs = [chainer.Variable(x) for x in xs]
            ys = [chainer.Variable(y) for y in ys]

    return xs, ilens, ys
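The batch argument mirrors an espnet data json entry. A minimal hand-built stand-in (only the "shape" fields are actually read above) might look like this:

# Hypothetical minimal batch; field values are illustrative.
batch = [
    ("utt1", {"input": [{"shape": [30, 40]}], "output": [{"shape": [5, 5]}]}),
    ("utt2", {"input": [{"shape": [20, 40]}], "output": [{"shape": [3, 5]}]}),
]
xs, ilens, ys = convert_batch(batch, backend="pytorch")
# xs: (2, 30, 40) zero-padded features
# ilens: tensor([30, 20])
# ys: (2, 5) token ids padded with -1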
Example #8
def prepare_inputs(
    mode, ilens=[20, 15], olens_tgt=[4, 3], olens_src=[3, 2], is_cuda=False
):
    np.random.seed(1)
    assert len(ilens) == len(olens_tgt)
    xs = [np.random.randn(ilen, 40).astype(np.float32) for ilen in ilens]
    ys_tgt = [np.random.randint(1, 5, olen).astype(np.int32) for olen in olens_tgt]
    ys_src = [np.random.randint(1, 5, olen).astype(np.int32) for olen in olens_src]
    ilens = np.array([x.shape[0] for x in xs], dtype=np.int32)

    if mode == "chainer":
        raise NotImplementedError

    elif mode == "pytorch":
        ilens = torch.from_numpy(ilens).long()
        xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0)
        ys_pad_tgt = pad_list([torch.from_numpy(y).long() for y in ys_tgt], -1)
        ys_pad_src = pad_list([torch.from_numpy(y).long() for y in ys_src], -1)
        if is_cuda:
            xs_pad = xs_pad.cuda()
            ilens = ilens.cuda()
            ys_pad_tgt = ys_pad_tgt.cuda()
            ys_pad_src = ys_pad_src.cuda()

        return xs_pad, ilens, ys_pad_tgt, ys_pad_src
    else:
        raise ValueError("Invalid mode")
Example #9
def prepare_inputs(mode, ilens=[150, 100], olens=[4, 3], is_cuda=False):
    np.random.seed(1)
    assert len(ilens) == len(olens)
    xs = [np.random.randn(ilen, 40).astype(np.float32) for ilen in ilens]
    ys = [np.random.randint(1, 5, olen).astype(np.int32) for olen in olens]
    ilens = np.array([x.shape[0] for x in xs], dtype=np.int32)

    if mode == "chainer":
        if is_cuda:
            xp = importlib.import_module('cupy')
            xs = [chainer.Variable(xp.array(x)) for x in xs]
            ys = [chainer.Variable(xp.array(y)) for y in ys]
            ilens = xp.array(ilens)
        else:
            xs = [chainer.Variable(x) for x in xs]
            ys = [chainer.Variable(y) for y in ys]
        return xs, ilens, ys

    elif mode == "pytorch":
        ilens = torch.from_numpy(ilens).long()
        xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0)
        ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], -1)
        if is_cuda:
            xs_pad = xs_pad.cuda()
            ilens = ilens.cuda()
            ys_pad = ys_pad.cuda()

        return xs_pad, ilens, ys_pad
    else:
        raise ValueError("Invalid mode")
Example #10
def prepare_inputs(mode, num_encs=2, is_cuda=False):
    ilens_list = [[3, 2] for _ in range(num_encs)]
    olens = [2, 1]
    np.random.seed(1)
    assert len(ilens_list[0]) == len(ilens_list[1]) == len(olens)
    xs_list = [[np.random.randn(ilen, 2).astype(np.float32) for ilen in ilens]
               for ilens in ilens_list]
    ys = [np.random.randint(1, 2, olen).astype(np.int32) for olen in olens]
    ilens_list = [
        np.array([x.shape[0] for x in xs], dtype=np.int32) for xs in xs_list
    ]

    if mode == "pytorch":
        ilens_list = [torch.from_numpy(ilens).long() for ilens in ilens_list]
        xs_pad_list = [
            pad_list([torch.from_numpy(x).float() for x in xs], 0)
            for xs in xs_list
        ]
        ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], -1)
        if is_cuda:
            xs_pad_list = [xs_pad.cuda() for xs_pad in xs_pad_list]
            ilens_list = [ilens.cuda() for ilens in ilens_list]
            ys_pad = ys_pad.cuda()

        return xs_pad_list, ilens_list, ys_pad
    else:
        raise ValueError("Invalid mode")
Example #11
def convert_batch(batch, backend="pytorch", is_cuda=False, idim=40, odim=5):
    ilens = np.array([x[1]["input"][0]["shape"][0] for x in batch])
    olens_tgt = np.array([x[1]["output"][0]["shape"][0] for x in batch])
    olens_src = np.array([x[1]["output"][1]["shape"][0] for x in batch])
    xs = [np.random.randn(ilen, idim).astype(np.float32) for ilen in ilens]
    ys_tgt = [
        np.random.randint(1, odim, olen).astype(np.int32) for olen in olens_tgt
    ]
    ys_src = [
        np.random.randint(1, odim, olen).astype(np.int32) for olen in olens_src
    ]
    is_pytorch = backend == "pytorch"
    if is_pytorch:
        xs = pad_list([torch.from_numpy(x).float() for x in xs], 0)
        ilens = torch.from_numpy(ilens).long()
        ys_tgt = pad_list([torch.from_numpy(y).long() for y in ys_tgt], -1)
        ys_src = pad_list([torch.from_numpy(y).long() for y in ys_src], -1)

        if is_cuda:
            xs = xs.cuda()
            ilens = ilens.cuda()
            ys_tgt = ys_tgt.cuda()
            ys_src = ys_src.cuda()
    else:
        raise NotImplementedError

    return xs, ilens, ys_tgt, ys_src
Example #12
 def add_sos_eos(self, ys_pad):
     from espnet.nets.pytorch_backend.nets_utils import pad_list
     eos = ys_pad.new([self.eos])
     sos = ys_pad.new([self.sos])
     ys = [y[y != self.ignore_id] for y in ys_pad]  # parse padded ys
     ys_in = [torch.cat([sos, y], dim=0) for y in ys]
     ys_out = [torch.cat([y, eos], dim=0) for y in ys]
     return pad_list(ys_in, self.eos), pad_list(ys_out, self.ignore_id)
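The same sos/eos bookkeeping written out with concrete values; the ids 1 (sos/eos) and -1 (ignore_id) are placeholders, not taken from any real config:

# Standalone sketch of the sos/eos transform; ids are placeholder values.
import torch
from espnet.nets.pytorch_backend.nets_utils import pad_list

sos_id, eos_id, ignore_id = 1, 1, -1
ys_pad = torch.tensor([[4, 5, -1], [6, 7, 8]])
ys = [y[y != ignore_id] for y in ys_pad]
ys_in = pad_list([torch.cat([y.new([sos_id]), y]) for y in ys], eos_id)
ys_out = pad_list([torch.cat([y, y.new([eos_id])]) for y in ys], ignore_id)
# ys_in:  [[1, 4, 5, 1], [1, 6, 7, 8]]   decoder input, padded with eos
# ys_out: [[4, 5, 1, -1], [6, 7, 8, 1]]  decoder target, padded with ignore_id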
Example #13
    def forward(self, hs_pad, hlens, ys_pad):
        """Decoder forward

        Args:
            hs_pad (torch.Tensor): batch of padded hidden state sequences (B, Tmax, D)
            hlens (torch.Tensor): batch of lengths of hidden state sequences (B)
            ys_pad (torch.Tensor): batch of padded character id sequence tensor (B, Lmax)

        Returns:
           loss (torch.Tensor): rnnt-att loss value
        """

        ys = [y[y != self.ignore_id] for y in ys_pad]

        hlens = list(map(int, hlens))

        blank = ys[0].new([self.blank])

        ys_in = [torch.cat([blank, y], dim=0) for y in ys]
        ys_in_pad = pad_list(ys_in, self.blank)

        olength = ys_in_pad.size(1)

        c_list = [self.zero_state(hs_pad)]
        z_list = [self.zero_state(hs_pad)]
        for _ in six.moves.range(1, self.dlayers):
            c_list.append(self.zero_state(hs_pad))
            z_list.append(self.zero_state(hs_pad))

        att_w = None
        self.att[0].reset()

        eys = self.dropout_emb(self.embed(ys_in_pad))

        z_all = []
        for i in six.moves.range(olength):
            att_c, att_w = self.att[0](hs_pad, hlens, self.dropout_dec[0](z_list[0]), att_w)

            ey = torch.cat((eys[:, i, :], att_c), dim=1)
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)

            z_all.append(self.dropout_dec[-1](z_list[-1]))

        h_dec = torch.stack(z_all, dim=1)

        h_enc = hs_pad.unsqueeze(2)
        h_dec = h_dec.unsqueeze(1)

        z = self.joint(h_enc, h_dec)
        y = pad_list(ys, self.blank).type(torch.int32)

        z_len = to_device(self, torch.IntTensor(hlens))
        y_len = to_device(self, torch.IntTensor([_y.size(0) for _y in ys]))

        loss = to_device(self, self.rnnt_loss(z, y, z_len, y_len))

        return loss
Example #14
    def forward(self, xs, ilens, ys, olens, spembs=None):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of the padded sequences of character ids (B, Tmax).
            ilens (Tensor): Batch of lengths of each input sequence (B,).
            ys (Tensor): Batch of the padded sequence of target features (B, Lmax, odim).
            olens (Tensor): Batch of lengths of each output sequence (B,).
            spembs (Tensor, optional): Batch of speaker embedding vectors (B, spk_embed_dim).

        Returns:
            Tensor: Batch of durations (B, Tmax).

        """
        att_ws = self._calculate_encoder_decoder_attentions(xs,
                                                            ilens,
                                                            ys,
                                                            olens,
                                                            spembs=spembs)
        # TODO(kan-bayashi): fix this issue
        # this does not work in the multi-GPU case; the registered buffer is not saved.
        if int(self.diag_head_idx) == -1:
            self._init_diagonal_head(att_ws)
        att_ws = att_ws[:, self.diag_head_idx]
        durations = [
            self._calculate_duration(att_w, ilen, olen)
            for att_w, ilen, olen in zip(att_ws, ilens, olens)
        ]

        return pad_list(durations, 0)
Example #15
    def _generate_pseudo_label(self, encoder_out, text, text_lengths):
        from espnet.nets.pytorch_backend.nets_utils import pad_list
        from itertools import groupby

        # sample from ASR model
        ys_hat = self.ctc.argmax(encoder_out).data

        # remove the blank and pad
        text_hat = []
        text_hat_lengths = []
        for y in ys_hat.cpu():
            y_hat = [x[0] for x in groupby(y)]
            y_hat_beta = []
            for idx in y_hat:
                idx = int(idx)
                if idx != self.ignore_id and idx != self.error_calculator.idx_blank:
                    y_hat_beta.append(int(idx))
            text_hat.append(torch.tensor(y_hat_beta).to(text.device)[:-1])
            text_hat_lengths.append(len(y_hat_beta) - 1)

        # make the list to tensor
        text_hat = pad_list(text_hat, self.ignore_id)
        text_hat_lengths = torch.tensor(text_hat_lengths).to(
            text_lengths.device)

        # replace the labels and safely detach the tensors
        text = text_hat.detach()
        text_lengths = text_hat_lengths.detach()

        return text, text_lengths
Example #16
def test_pad_list():
    xs = [[1, 2, 3], [1, 2], [1, 2, 3, 4]]
    xs = list(map(lambda x: torch.LongTensor(x), xs))
    xpad = pad_list(xs, -1)

    es = [[1, 2, 3, -1], [1, 2, -1, -1], [1, 2, 3, 4]]
    assert xpad.data.tolist() == es
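For reference, a minimal sketch of what a pad_list-style helper does; this is an illustrative re-implementation, not the actual espnet source:

# Illustrative pad_list-style helper; the real implementation may differ.
import torch

def pad_list_sketch(xs, pad_value):
    """Pad a list of variable-length tensors into one (B, Lmax, ...) tensor."""
    max_len = max(x.size(0) for x in xs)
    pad = xs[0].new_full((len(xs), max_len, *xs[0].size()[1:]), pad_value)
    for i, x in enumerate(xs):
        pad[i, : x.size(0)] = x  # copy each sequence into the padded buffer
    return pad

assert pad_list_sketch(
    [torch.LongTensor([1, 2, 3]), torch.LongTensor([1, 2])], -1
).tolist() == [[1, 2, 3], [1, 2, -1]]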
Example #17
    def recognize_batch(self, xs, recog_args, char_list, rnnlm=None):
        """E2E beam search.

        :param list xs: list of input acoustic feature arrays [(T_1, D), (T_2, D), ...]
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        prev = self.training
        self.eval()
        ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

        # subsample frame
        xs = [xx[:: self.subsample[0], :] for xx in xs]
        xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
        xs_pad = pad_list(xs, 0.0)

        # 0. Frontend
        if self.frontend is not None:
            enhanced, hlens, mask = self.frontend(xs_pad, ilens)
            hs_pad, hlens = self.feature_transform(enhanced, hlens)
        else:
            hs_pad, hlens = xs_pad, ilens

        batchsize = hs_pad.size(0)

        # 1. Encoder
        hyps, hlens, _ = self.enc(hs_pad, hlens)
        hyps = hyps.view(batchsize, -1, self.odim)

        return hyps
Example #18
    def translate_batch(self, xs, trans_args, char_list, rnnlm=None):
        """E2E batch beam search.

        :param list xs: list of input acoustic feature arrays [(T_1, D), (T_2, D), ...]
        :param Namespace trans_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        prev = self.training
        self.eval()
        ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

        # subsample frame
        xs = [xx[::self.subsample[0], :] for xx in xs]
        xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
        xs_pad = pad_list(xs, 0.0)

        # 1. Encoder
        hs_pad, hlens, _ = self.enc(xs_pad, ilens)

        # 2. Decoder
        hlens = torch.tensor(list(map(int, hlens)))  # make sure hlens is a tensor
        y = self.dec.recognize_beam_batch(hs_pad, hlens, None, trans_args,
                                          char_list, rnnlm)

        if prev:
            self.train()
        return y
Example #19
    def translate_batch(self, xs, trans_args, char_list, rnnlm=None):
        """E2E batch beam search.

        :param list xs:
            list of input source text feature arrays [(T_1, D), (T_2, D), ...]
        :param Namespace trans_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        prev = self.training
        self.eval()

        # 1. Encoder
        if self.multilingual:
            ilens = np.fromiter((len(xx[1:]) for xx in xs), dtype=np.int64)
            hs = [to_device(self, torch.from_numpy(xx[1:])) for xx in xs]
        else:
            ilens = np.fromiter((len(xx) for xx in xs), dtype=np.int64)
            hs = [to_device(self, torch.from_numpy(xx)) for xx in xs]
        xpad = pad_list(hs, self.pad)
        hs_pad, hlens, _ = self.enc(self.dropout(self.embed(xpad)), ilens)

        # 2. Decoder
        hlens = torch.tensor(list(map(int, hlens)))  # make sure hlens is a tensor
        y = self.dec.recognize_beam_batch(hs_pad, hlens, None, trans_args,
                                          char_list, rnnlm)

        if prev:
            self.train()
        return y
Example #20
def test_length_regulator():
    # prepare inputs
    idim = 5
    ilens = [10, 5, 3]
    xs = pad_list([torch.randn((ilen, idim)) for ilen in ilens], 0.0)
    ds = pad_list([torch.arange(ilen) for ilen in ilens], 0)

    # test with non-zero durations
    length_regulator = LengthRegulator()
    xs_expand = length_regulator(xs, ds, ilens)
    assert int(xs_expand.shape[1]) == int(ds.sum(dim=-1).max())

    # test with duration including zero
    ds[:, 2] = 0
    xs_expand = length_regulator(xs, ds, ilens)
    assert int(xs_expand.shape[1]) == int(ds.sum(dim=-1).max())
Example #21
    def forward_mt(self, xs_pad, ys_in_pad, ys_out_pad, ys_mask):
        """Forward pass in the auxiliary MT task.

        :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim)
        :param torch.Tensor ys_in_pad: batch of padded target sequences (B, Lmax)
        :param torch.Tensor ys_out_pad: batch of padded target sequences (B, Lmax)
        :param torch.Tensor ys_mask: batch of input token mask (B, Lmax)
        :return: MT loss value
        :rtype: torch.Tensor
        :return: accuracy in MT decoder
        :rtype: float
        """
        loss, acc = 0.0, None
        if self.mt_weight == 0:
            return loss, acc

        ilens = torch.sum(xs_pad != self.ignore_id, dim=1).cpu().numpy()
        # NOTE: xs_pad is padded with -1
        xs = [x[x != self.ignore_id] for x in xs_pad]  # parse padded xs
        xs_zero_pad = pad_list(xs, self.pad)  # re-pad with zero
        xs_zero_pad = xs_zero_pad[:, : max(ilens)]  # for data parallel
        src_mask = (
            make_non_pad_mask(ilens.tolist()).to(xs_zero_pad.device).unsqueeze(-2)
        )
        hs_pad, hs_mask = self.encoder_mt(xs_zero_pad, src_mask)
        pred_pad, _ = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask)
        loss = self.criterion(pred_pad, ys_out_pad)
        acc = th_accuracy(
            pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id
        )
        return loss, acc
Example #22
    def forward(self, xs, ds, alpha=1.0):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
            ds (LongTensor): Batch of durations of each frame (B, T).
            alpha (float, optional): Alpha value to control speed of speech.

        Returns:
            Tensor: replicated input tensor based on durations (B, T*, D).

        """
        if alpha != 1.0:
            assert alpha > 0
            ds = torch.round(ds.float() * alpha).long()

        if ds.sum() == 0:
            logging.warning("predicted durations include an all-zero sequence; "
                            "filling it with 1.")
            # NOTE(kan-bayashi): This case must not happen in teacher forcing.
            #   It can happen at inference with a bad duration predictor,
            #   so we do not need to care about the padded sequence case here.
            ds[ds.sum(dim=1).eq(0)] = 1

        repeat = [torch.repeat_interleave(x, d, dim=0) for x, d in zip(xs, ds)]
        return pad_list(repeat, self.pad_value)
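The core expansion step in isolation on toy tensors; the pad value 0.0 is an assumption here:

# Toy illustration of duration-based expansion via repeat_interleave.
import torch
from espnet.nets.pytorch_backend.nets_utils import pad_list

xs = torch.arange(6, dtype=torch.float32).view(2, 3, 1)  # (B=2, Tmax=3, D=1)
ds = torch.tensor([[2, 0, 1], [1, 1, 1]])                # per-frame durations
repeat = [torch.repeat_interleave(x, d, dim=0) for x, d in zip(xs, ds)]
out = pad_list(repeat, 0.0)  # (B, max expanded length, D)
# sequence 0: frame 0 repeated twice, frame 1 dropped, frame 2 kept once
# sequence 1: every frame kept once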
Example #23
    def enhance(self, xs):
        """Forward only the frontend stage.

        Args:
            xs (ndarray): input acoustic feature (T, C, F)

        Returns:
            enhanced (ndarray): enhanced features
            mask (ndarray): estimated masks
            ilens (ndarray): batch of lengths of input sequences (B)

        """
        if self.frontend is None:
            raise RuntimeError("Frontend doesn't exist")
        prev = self.training
        self.eval()
        ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

        # subsample frame
        xs = [xx[::self.subsample[0], :] for xx in xs]
        xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
        xs_pad = pad_list(xs, 0.0)
        enhanced, hlensm, mask = self.frontend(xs_pad, ilens)

        if prev:
            self.train()

        return enhanced.cpu().numpy(), mask.cpu().numpy(), ilens
Example #24
    def enhance(self, xs):
        """Forward only the frontend stage.

        :param ndarray xs: input acoustic feature (T, C, F)
        """
        if self.frontend is None:
            raise RuntimeError('Frontend doesn\'t exist')
        prev = self.training
        self.eval()
        ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

        # subsample frame
        xs = [xx[::self.subsample[0], :] for xx in xs]
        xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
        xs_pad = pad_list(xs, 0.0)
        enhanced, hlensm, mask = self.frontend(xs_pad, ilens)
        if prev:
            self.train()

        if isinstance(enhanced, (tuple, list)):
            enhanced = list(enhanced)
            mask = list(mask)
            for idx in range(len(enhanced)):  # number of speakers
                enhanced[idx] = enhanced[idx].cpu().numpy()
                mask[idx] = mask[idx].cpu().numpy()
            return enhanced, mask, ilens
        return enhanced.cpu().numpy(), mask.cpu().numpy(), ilens
Example #25
def common_collate_fn(
    data: Collection[Tuple[str, Dict[str, np.ndarray]]],
    float_pad_value: Union[float, int] = 0.0,
    int_pad_value: int = -32768,
    not_sequence: Collection[str] = (),
) -> Tuple[List[str], Dict[str, torch.Tensor]]:
    """Concatenate ndarray-list to an array and convert to torch.Tensor.

    Examples:
        >>> from espnet2.samplers.constant_batch_sampler import ConstantBatchSampler
        >>> import espnet2.tasks.abs_task
        >>> from espnet2.train.dataset import ESPnetDataset
        >>> sampler = ConstantBatchSampler(...)
        >>> dataset = ESPnetDataset(...)
        >>> keys = next(iter(sampler))
        >>> batch = [dataset[key] for key in keys]
        >>> uttids, batch = common_collate_fn(batch)
        >>> model(**batch)

        Note that the dict-keys of batch are propagated from
        that of the dataset as they are.

    """
    assert check_argument_types()
    uttids = [u for u, _ in data]
    data = [d for _, d in data]

    assert all(set(data[0]) == set(d) for d in data), "dict-keys mismatching"
    assert all(not k.endswith("_lengths")
               for k in data[0]), f"*_lengths is reserved: {list(data[0])}"

    output = {}
    for key in data[0]:
        # NOTE(kamo):
        # Each model that finally consumes these values is responsible
        # for repainting the pad_value with the value its task requires.
        if data[0][key].dtype.kind == "i":
            pad_value = int_pad_value
        else:
            pad_value = float_pad_value

        array_list = [d[key] for d in data]

        # Assume the first axis is length:
        # tensor_list: Batch x (Length, ...)
        tensor_list = [torch.from_numpy(a) for a in array_list]
        # tensor: (Batch, Length, ...)
        tensor = pad_list(tensor_list, pad_value)
        output[key] = tensor

        # lens: (Batch,)
        if key not in not_sequence:
            lens = torch.tensor([d[key].shape[0] for d in data],
                                dtype=torch.long)
            output[key + "_lengths"] = lens

    output = (uttids, output)
    assert check_return_type(output)
    return output
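A toy invocation with hand-built numpy arrays; the key names "speech" and "text" are placeholders, not required by the function:

# Hypothetical toy batch for common_collate_fn.
import numpy as np

data = [
    ("utt1", {"speech": np.zeros((5, 3), dtype=np.float32),
              "text": np.array([1, 2, 3], dtype=np.int64)}),
    ("utt2", {"speech": np.zeros((4, 3), dtype=np.float32),
              "text": np.array([4, 5], dtype=np.int64)}),
]
uttids, batch = common_collate_fn(data)
# batch["speech"]: (2, 5, 3) float tensor padded with 0.0
# batch["text"]:   (2, 3) int tensor padded with -32768
# batch["speech_lengths"], batch["text_lengths"]: (2,) LongTensors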
Example #26
def prepare_inputs(idim, odim, ilens, olens, is_cuda=False):
    np.random.seed(1)

    xs = [np.random.randn(ilen, idim).astype(np.float32) for ilen in ilens]
    ys = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens]
    ilens = np.array([x.shape[0] for x in xs], dtype=np.int32)

    xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0)
    ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], -1)
    ilens = torch.from_numpy(ilens).long()

    if is_cuda:
        xs_pad = xs_pad.cuda()
        ys_pad = ys_pad.cuda()
        ilens = ilens.cuda()

    return xs_pad, ilens, ys_pad
Example #27
    def forward(self, hs_pad, hlens, ys_pad):
        """Forward function for transducer.

        Args:
            hs_pad (torch.Tensor): batch of padded hidden state sequences (B, Tmax, D)
            hlens (torch.Tensor): batch of lengths of hidden state sequences (B)
            ys_pad (torch.Tensor): batch of padded character id sequence tensor (B, Lmax)

        Returns:
           loss (torch.Tensor): rnnt loss value

        """
        ys = [y[y != self.ignore_id] for y in ys_pad]

        hlens = list(map(int, hlens))

        blank = ys[0].new([self.blank])

        ys_in = [torch.cat([blank, y], dim=0) for y in ys]
        ys_in_pad = pad_list(ys_in, self.blank)

        olength = ys_in_pad.size(1)

        z_list, c_list = self.zero_state(hs_pad)
        eys = self.dropout_embed(self.embed(ys_in_pad))

        z_all = []
        for i in six.moves.range(olength):
            y, (z_list, c_list) = self.rnn_forward(eys[:, i, :],
                                                   (z_list, c_list))
            z_all.append(y)
        h_dec = torch.stack(z_all, dim=1)

        h_enc = hs_pad.unsqueeze(2)
        h_dec = h_dec.unsqueeze(1)

        z = self.joint(h_enc, h_dec)
        y = pad_list(ys, self.blank).type(torch.int32)

        z_len = to_device(self, torch.IntTensor(hlens))
        y_len = to_device(self, torch.IntTensor([_y.size(0) for _y in ys]))

        loss = to_device(self, self.rnnt_loss(z, y, z_len, y_len))

        return loss
Example #28
def prepare_inputs(idim, odim, ilens, olens, is_cuda=False):
    np.random.seed(1)

    feats = [np.random.randn(ilen, idim).astype(np.float32) for ilen in ilens]
    labels = [np.random.randint(1, odim, olen).astype(np.int32) for olen in olens]
    feats_len = np.array([x.shape[0] for x in feats], dtype=np.int32)

    feats = pad_list([torch.from_numpy(x).float() for x in feats], 0)
    labels = pad_list([torch.from_numpy(y).long() for y in labels], -1)

    feats_len = torch.from_numpy(feats_len).long()

    if is_cuda:
        feats = feats.cuda()
        labels = labels.cuda()
        feats_len = feats_len.cuda()

    return feats, feats_len, labels
Example #29
    def __call__(self, batch, device):
        # batch should be a list containing a single pre-collated batch
        assert len(batch) == 1
        inputs_and_targets = batch[0]

        # parse inputs and targets
        xs, ys, spembs, spcs = inputs_and_targets

        # get list of lengths (must be tensor for DataParallel)
        ilens = torch.from_numpy(np.array([x.shape[0]
                                           for x in xs])).long().to(device)
        olens = torch.from_numpy(np.array([y.shape[0]
                                           for y in ys])).long().to(device)

        # perform padding and conversion to tensor
        xs = pad_list([torch.from_numpy(x).long() for x in xs], 0).to(device)
        ys = pad_list([torch.from_numpy(y).float() for y in ys], 0).to(device)

        # make labels for stop prediction
        labels = ys.new_zeros(ys.size(0), ys.size(1))
        for i, l in enumerate(olens):
            labels[i, l - 1:] = 1.0

        # prepare dict
        new_batch = {
            "xs": xs,
            "ilens": ilens,
            "ys": ys,
            "labels": labels,
            "olens": olens,
        }

        # load second target
        if spcs is not None:
            spcs = pad_list([torch.from_numpy(spc).float() for spc in spcs],
                            0).to(device)
            new_batch["spcs"] = spcs

        # load speaker embedding
        if spembs is not None:
            spembs = torch.from_numpy(np.array(spembs)).float().to(device)
            new_batch["spembs"] = spembs

        return new_batch
Example #30
def prepare_loss_inputs(ys_pad, hlens, blank_id=0, ignore_id=-1):
    """Prepare tensors for transducer loss computation.

    Args:
        ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax)
        hlens (torch.Tensor): batch of hidden sequence lengths (B)
                              or batch of masks (B, 1, Tmax)
        blank_id (int): index of blank label
        ignore_id (int): index of initial padding

    Returns:
        ys_in_pad (torch.Tensor): batch of padded target sequences + blank (B, Lmax + 1)
        target (torch.Tensor): batch of padded target sequences (B, Lmax)
        pred_len (torch.Tensor): batch of hidden sequence lengths (B)
        target_len (torch.Tensor): batch of output sequence lengths (B)

    """
    device = ys_pad.device

    ys = [y[y != ignore_id] for y in ys_pad]

    blank = ys[0].new([blank_id])

    ys_in = [torch.cat([blank, y], dim=0) for y in ys]
    ys_in_pad = pad_list(ys_in, blank_id)

    target = pad_list(ys, blank_id).type(torch.int32)
    target_len = torch.IntTensor([y.size(0) for y in ys])

    if torch.is_tensor(hlens):
        if hlens.dim() > 1:
            hs = [h[h != 0] for h in hlens]
            hlens = list(map(int, [h.size(0) for h in hs]))
        else:
            hlens = list(map(int, hlens))

    pred_len = torch.IntTensor(hlens)

    pred_len = pred_len.to(device)
    target = target.to(device)
    target_len = target_len.to(device)

    return ys_in_pad, target, pred_len, target_len
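A quick shape check for the helper above; token ids and lengths are made up:

# Illustrative call to prepare_loss_inputs.
import torch

ys_pad = torch.tensor([[3, 5, -1], [2, 4, 6]])
hlens = torch.tensor([7, 6])
ys_in_pad, target, pred_len, target_len = prepare_loss_inputs(ys_pad, hlens)
assert ys_in_pad.tolist() == [[0, 3, 5, 0], [0, 2, 4, 6]]  # blank (0) prepended
assert target.tolist() == [[3, 5, 0], [2, 4, 6]]           # targets padded with blank
assert pred_len.tolist() == [7, 6] and target_len.tolist() == [2, 3]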