Example #1
    def append_sos_eos(self, ys, bwd=False, replace_sos=False):
        """Append <sos> and <eos> and return padded sequences.
        Args:
            ys (list): A list of length `[B]`, each element of which is a list of size `[L]`
        Returns:
            ys_in_pad (LongTensor): `[B, L]`
            ys_out_pad (LongTensor): `[B, L]`
            ylens (IntTensor): `[B]`

        """
        w = next(self.parameters())
        eos = w.new_zeros(1).fill_(self.eos).long()
        ys = [
            np2tensor(np.fromiter(y[::-1] if bwd else y, dtype=np.int64),
                      self.device_id) for y in ys
        ]
        if replace_sos:
            ylens = np2tensor(
                np.fromiter([y[1:].size(0) + 1 for y in ys],
                            dtype=np.int32))  # +1 for <eos>
            ys_in_pad = pad_list(ys, self.pad)
            ys_out_pad = pad_list([torch.cat([y[1:], eos], dim=0) for y in ys],
                                  self.pad)
        else:
            ylens = np2tensor(
                np.fromiter([y.size(0) + 1 for y in ys],
                            dtype=np.int32))  # +1 for <eos>
            ys_in_pad = pad_list([torch.cat([eos, y], dim=0) for y in ys],
                                 self.pad)
            ys_out_pad = pad_list([torch.cat([y, eos], dim=0) for y in ys],
                                  self.pad)
        return ys_in_pad, ys_out_pad, ylens
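
Every example in this collection leans on the `np2tensor` and `pad_list` helpers from `neural_sp.models.torch_utils`. A minimal sketch of their presumed behavior is given below so the snippets read standalone; the real helpers handle device ids, dtypes, and empty inputs more carefully.

import numpy as np
import torch

def np2tensor(array, device="cpu"):
    # Convert a numpy array into a torch.Tensor on the given device (sketch).
    return torch.from_numpy(array).to(device)

def pad_list(ys, pad_value=0.0):
    # Right-pad variable-length tensors into one `[B, L_max, ...]` batch (sketch).
    bs = len(ys)
    max_len = max(y.size(0) for y in ys)
    ys_pad = ys[0].new_full((bs, max_len) + tuple(ys[0].size()[1:]), pad_value)
    for b, y in enumerate(ys):
        ys_pad[b, :y.size(0)] = y
    return ys_pad

# e.g. pad_list([torch.ones(2, 3), torch.ones(4, 3)], 0.).shape == torch.Size([2, 4, 3])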
Example #2
def test_forward(args):
    args = make_args(**args)

    batch_size = 4
    xmax = 40
    device = "cpu"

    xs = [
        np.random.randn(xlen, args['input_dim']).astype(np.float32)
        for xlen in range(xmax - batch_size, xmax)
    ]
    xs_pad = pad_list([np2tensor(x, device).float() for x in xs], 0.)

    stack_module = importlib.import_module(
        'neural_sp.models.seq2seq.frontends.frame_stacking')
    splice_module = importlib.import_module(
        'neural_sp.models.seq2seq.frontends.splicing')

    xs = [
        stack_module.stack_frame(x, args['n_stacks'], args['n_stacks'])
        for x in xs
    ]
    out = [
        splice_module.splice(x, args['n_splices'], args['n_stacks'])
        for x in xs
    ]
    out_pad = pad_list([np2tensor(x, device).float() for x in out], 0.)
    assert out_pad.size(0) == xs_pad.size(0)
    assert out_pad.size(1) == math.ceil(xs_pad.size(1) / args['n_stacks'])
    assert out_pad.size(
        2) == xs_pad.size(2) * args['n_splices'] * args['n_stacks']
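
The assertions above pin down the frontend's shape contract: `stack_frame` maps `[T, d]` to `[ceil(T / n_skips), d * n_stacks]`, and `splice` multiplies the feature dimension by `n_splices`. Below is a sketch of frame stacking consistent with those assertions; the edge policy is an assumption, and the real `stack_frame` may pad differently.

import math
import numpy as np

def stack_frame_sketch(x, n_stacks, n_skips):
    # Concatenate n_stacks consecutive frames, hopping n_skips frames at a time:
    # `[T, d]` -> `[ceil(T / n_skips), d * n_stacks]`.
    T, d = x.shape
    n_out = math.ceil(T / n_skips)
    # repeat the last frame so the final window is full (assumed edge policy)
    pad = (n_out - 1) * n_skips + n_stacks - T
    x = np.pad(x, ((0, max(pad, 0)), (0, 0)), mode='edge')
    return np.stack([x[t * n_skips:t * n_skips + n_stacks].reshape(-1)
                     for t in range(n_out)])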
Example #3
    def forward_att(self, eouts, elens, ys):
        """Compute XE loss for the sequence-to-sequence model.

        Args:
            eouts (FloatTensor): `[B, T, d_model]`
            elens (list): A list of length `[B]`
            ys (list): A list of length `[B]`, each element of which is a list of size `[L]`
        Returns:
            loss (FloatTensor): `[1]`
            acc (float):
            ppl (float):

        """
        bs = eouts.size(0)

        # Append <sos> and <eos>
        eos = eouts.new_zeros((1,)).fill_(self.eos).long()
        ylens = [len(y) for y in ys]
        ys = [np2tensor(np.fromiter(y[::-1] if self.backward else y, dtype=np.int64), self.device_id).long()
              for y in ys]
        ys_in = [torch.cat([eos, y], dim=0) for y in ys]
        ys_out = [torch.cat([y, eos], dim=0) for y in ys]
        ys_in_pad = pad_list(ys_in, self.pad)
        ys_out_pad = pad_list(ys_out, self.pad)

        # Add positional embedding
        ys_emb = self.embed(ys_in_pad) * (self.d_model ** 0.5)
        if self.pe_type:
            ys_emb = self.pos_emb_out(ys_emb)

        for l in range(self.n_layers):
            ys_emb, yy_aw, xy_aw = self.layers[l](eouts, elens, ys_emb, ylens)

        logits = self.norm_top(ys_emb)
        if self.adaptive_softmax is None:
            logits = self.output(logits)

        # Compute XE sequence loss
        if self.adaptive_softmax is None:
            if self.lsm_prob > 0:
                # Label smoothing
                loss = cross_entropy_lsm(logits, ys_out_pad,
                                         ylens=[y.size(0) for y in ys_out],
                                         lsm_prob=self.lsm_prob, size_average=False) / bs
            else:
                loss = F.cross_entropy(logits.view((-1, logits.size(2))), ys_out_pad.view(-1),
                                       ignore_index=self.pad, reduction='sum') / bs
        else:
            loss = self.adaptive_softmax(logits.view((-1, logits.size(2))),
                                         ys_out_pad.view(-1)).loss

        # Compute token-level accuracy in teacher-forcing
        if self.adaptive_softmax is None:
            acc = compute_accuracy(logits, ys_out_pad, pad=self.pad)
        else:
            acc = compute_accuracy(self.adaptive_softmax.log_prob(
                logits.view((-1, logits.size(2)))), ys_out_pad, pad=self.pad)
        ppl = min(np.exp(loss.item()), np.inf)

        return loss, acc, ppl
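
`cross_entropy_lsm` above is a label-smoothed cross entropy. A common formulation interpolates the one-hot target with a uniform distribution over the vocabulary; the sketch below follows that formulation and is an assumption rather than neural_sp's exact implementation (note that the caller divides the summed loss by the batch size).

import torch
import torch.nn.functional as F

def cross_entropy_lsm_sketch(logits, ys_pad, lsm_prob, pad_id):
    # Label-smoothed XE, summed over non-padded tokens.
    vocab = logits.size(-1)
    log_probs = F.log_softmax(logits.view(-1, vocab), dim=-1)  # `[B*L, vocab]`
    ys = ys_pad.view(-1)
    mask = ys != pad_id
    nll = -log_probs.gather(1, ys.clamp(min=0).unsqueeze(1)).squeeze(1)
    uniform = -log_probs.mean(dim=-1)  # XE against a uniform target
    loss = (1 - lsm_prob) * nll + lsm_prob * uniform
    return loss.masked_select(mask).sum()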
Example #4
    def lm_rescoring(hyps, lm, lm_weight, reverse=False, length_norm=False, tag=''):
        if lm is None:
            return hyps
        for i in range(len(hyps)):
            ys = hyps[i]['hyp']  # include <sos>
            if reverse:
                ys = ys[::-1]

            ys = [np2tensor(np.fromiter(ys, dtype=np.int64), lm.device)]
            ys_in = pad_list([y[:-1] for y in ys], -1)  # `[1, L-1]`
            ys_out = pad_list([y[1:] for y in ys], -1)  # `[1, L-1]`

            if ys_in.size(1) > 0:
                _, _, scores_lm = lm.predict(ys_in, None)
                score_lm = sum([scores_lm[0, t, ys_out[0, t]] for t in range(ys_out.size(1))])
                if length_norm:
                    score_lm /= ys_out.size(1)  # normalize by length
            else:
                score_lm = 0

            hyps[i]['score'] += score_lm * lm_weight
            hyps[i]['score_lm_' + tag] = score_lm

        # DO NOT sort here !!!
        return hyps
Example #5
    def encode(self, xs, task='all', flip=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, each element of which is a Tensor of size `[T, input_dim]`
            task (str): all or ys* or ys_sub1* or ys_sub2* or ys_sub3*
            flip (bool): if True, flip acoustic features in the time-dimension
        Returns:
            enc_outs (dict):

        """
        if 'lmobj' in task:
            eouts = {'ys': {'xs': None, 'xlens': None},
                     'ys_sub1': {'xs': None, 'xlens': None},
                     'ys_sub2': {'xs': None, 'xlens': None},
                     'ys_sub3': {'xs': None, 'xlens': None}}
            return eouts
        else:
            if self.input_type == 'speech':
                # Frame stacking
                if self.n_stacks > 1:
                    xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]

                # Splicing
                if self.n_splices > 1:
                    xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]

                xlens = [len(x) for x in xs]
                # Flip acoustic features along the time dimension
                if flip:
                    xs = [torch.from_numpy(np.flip(x, axis=0).copy()).float().cuda(self.device_id) for x in xs]
                else:
                    xs = [np2tensor(x, self.device_id).float() for x in xs]
                xs = pad_list(xs, 0.0)

            elif self.input_type == 'text':
                xlens = [len(x) for x in xs]
                xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id).long() for x in xs]
                xs = pad_list(xs, self.pad)
                xs = self.embed_in(xs)

            enc_outs = self.enc(xs, xlens, task.split('.')[0])

            if self.main_weight < 1 and self.enc_type in ['cnn', 'tds']:
                for sub in ['sub1', 'sub2', 'sub3']:
                    enc_outs['ys_' + sub]['xs'] = enc_outs['ys']['xs'].clone()
                    enc_outs['ys_' + sub]['xlens'] = enc_outs['ys']['xlens'][:]

            # Bridge between the encoder and decoder
            if self.main_weight > 0 and self.is_bridge and (task in ['all', 'ys']):
                enc_outs['ys']['xs'] = self.bridge(enc_outs['ys']['xs'])
            if self.sub1_weight > 0 and self.is_bridge and (task in ['all', 'ys_sub1']):
                enc_outs['ys_sub1']['xs'] = self.bridge_sub1(enc_outs['ys_sub1']['xs'])
            if self.sub2_weight > 0 and self.is_bridge and (task in ['all', 'ys_sub2']):
                enc_outs['ys_sub2']['xs'] = self.bridge_sub2(enc_outs['ys_sub2']['xs'])
            if self.sub3_weight > 0 and self.is_bridge and (task in ['all', 'ys_sub3']):
                enc_outs['ys_sub3']['xs'] = self.bridge_sub3(enc_outs['ys_sub3']['xs'])

            return enc_outs
Example #6
    def encode(self, xs, task='all', flip=False, use_cache=False, streaming=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, each element of which is a Tensor of size `[T, input_dim]`
            task (str): all/ys*/ys_sub1*/ys_sub2*
            flip (bool): if True, flip acoustic features in the time-dimension
            use_cache (bool): use the cached forward encoder state in the previous chunk as the initial state
            streaming (bool): streaming encoding
        Returns:
            eout_dict (dict):

        """
        if self.input_type == 'speech':
            # Frame stacking
            if self.n_stacks > 1:
                xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]

            # Splicing
            if self.n_splices > 1:
                xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
            xlens = torch.IntTensor([len(x) for x in xs])

            # Flip acoustic features along the time dimension
            if flip:
                xs = [torch.from_numpy(np.flip(x, axis=0).copy()).float().cuda(self.device_id) for x in xs]
            else:
                xs = [np2tensor(x, self.device_id).float() for x in xs]
            xs = pad_list(xs, 0.)

            # SpecAugment
            if self.use_specaug and self.training:
                xs = self.specaug(xs)

            # Gaussian noise injection
            if self.gaussian_noise:
                xs = add_gaussian_noise(xs)

            # Sequence summary network
            if self.ssn is not None:
                xs += self.ssn(xs, xlens)

        elif self.input_type == 'text':
            xlens = torch.IntTensor([len(x) for x in xs])
            xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id) for x in xs]
            xs = pad_list(xs, self.pad)
            xs = self.dropout_emb(self.embed(xs))
            # TODO(hirofumi): fix for Transformer

        # encoder
        eout_dict = self.enc(xs, xlens, task.split('.')[0], use_cache, streaming)

        if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer']:
            for sub in ['sub1', 'sub2']:
                eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
                eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]

        return eout_dict
Example #7
    def forward_rnnt(self, eouts, elens, ys):
        """Compute XE loss for the attention-based sequence-to-sequence model.

        Args:
            eouts (FloatTensor): `[B, T, dec_n_units]`
            elens (IntTensor): `[B]`
            ys (list): A list of length `[B]`, each element of which is a list of size `[L]`
        Returns:
            loss (FloatTensor): `[1]`

        """
        # Append <sos> and <eos>
        eos = eouts.new_zeros(1).fill_(self.eos).long()
        if self.end_pointing:
            _ys = [
                np2tensor(np.fromiter(y + [self.eos], dtype=np.int64),
                          self.device_id) for y in ys
            ]
        else:
            _ys = [
                np2tensor(np.fromiter(y, dtype=np.int64), self.device_id)
                for y in ys
            ]
        ylens = np2tensor(np.fromiter([y.size(0) for y in _ys],
                                      dtype=np.int32))
        ys_in_pad = pad_list([torch.cat([eos, y], dim=0) for y in _ys],
                             self.pad)
        ys_out_pad = pad_list(_ys, 0).int()  # int for warprnnt_loss

        # Update prediction network
        dout, _ = self.recurrency(self.embed(ys_in_pad), None)

        # Compute output distribution
        logits = self.joint(eouts, dout)

        # Compute Transducer loss
        log_probs = F.log_softmax(logits, dim=-1)
        if self.device_id >= 0:
            ys_out_pad = ys_out_pad.cuda(self.device_id)
            elens = elens.cuda(self.device_id)
            ylens = ylens.cuda(self.device_id)

        assert log_probs.size(2) == ys_out_pad.size(1) + 1
        loss = self.warprnnt_loss(log_probs, ys_out_pad.int(), elens, ylens)
        # NOTE: Transducer loss has already been normalized by bs
        # NOTE: index 0 is reserved for blank in warprnnt_pytorch
        # if self.device_id >= 0:
        #     loss = loss.cuda(self.device_id)

        # Label smoothing for Transducer
        # if self.lsm_prob > 0:
        #     loss = loss * (1 - self.lsm_prob) + kldiv_lsm_ctc(logits,
        #                                                       ylens=elens,
        #                                                       size_average=True) * self.lsm_prob
        # TODO(hirofumi): this leads to out of memory

        return loss
Example #8
    def encode(self, xs, task='all', streaming=False, lookback=False, lookahead=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, each element of which is a Tensor of size `[T, input_dim]`
            task (str): all/ys*/ys_sub1*/ys_sub2*
            streaming (bool): streaming encoding
            lookback (bool): truncate leftmost frames for lookback in CNN context
            lookahead (bool): truncate rightmost frames for lookahead in CNN context
        Returns:
            eout_dict (dict):

        """
        if self.input_type == 'speech':
            # Frame stacking
            if self.n_stacks > 1:
                xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]

            # Splicing
            if self.n_splices > 1:
                xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]

            xlens = torch.IntTensor([len(x) for x in xs])
            xs = pad_list([np2tensor(x, self.device).float() for x in xs], 0.)

            # SpecAugment
            if self.specaug is not None and self.training:
                xs = self.specaug(xs)

            # Weight noise injection
            if self.weight_noise_std > 0 and self.training:
                self.add_weight_noise(std=self.weight_noise_std)

            # Input Gaussian noise injection
            if self.input_noise_std > 0 and self.training:
                xs = add_input_noise(xs, std=self.input_noise_std)

            # Sequence summary network
            if self.ssn is not None:
                xs = self.ssn(xs, xlens)

        elif self.input_type == 'text':
            xlens = torch.IntTensor([len(x) for x in xs])
            xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device) for x in xs]
            xs = pad_list(xs, self.pad)
            xs = self.dropout_emb(self.embed(xs))
            # TODO(hirofumi): fix for Transformer

        # encoder
        eout_dict = self.enc(xs, xlens, task.split('.')[0], streaming, lookback, lookahead)

        if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv']:
            for sub in ['sub1', 'sub2']:
                eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
                eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]

        return eout_dict
Example #9
    def forward_transducer(self, eouts, elens, ys):
        """Compute Transducer loss.

        Args:
            eouts (FloatTensor): `[B, T, enc_n_units]`
            elens (IntTensor): `[B]`
            ys (list): length `B`, each element of which is a list of size `[L]`
        Returns:
            loss (FloatTensor): `[1]`

        """
        # Append <sos> and <eos>
        _ys = [
            np2tensor(np.fromiter(y, dtype=np.int64), eouts.device) for y in ys
        ]
        ylens = np2tensor(np.fromiter([y.size(0) for y in _ys],
                                      dtype=np.int32))
        eos = eouts.new_zeros((1, ), dtype=torch.int64).fill_(self.eos)
        ys_in = pad_list([torch.cat([eos, y], dim=0) for y in _ys],
                         self.pad)  # `[B, L+1]`
        ys_out = pad_list(_ys, self.blank)  # `[B, L]`

        # Update prediction network
        ys_emb = self.dropout_emb(self.embed(ys_in))
        dout, _ = self.recurrency(ys_emb, None)

        # Compute output distribution
        logits = self.joint(eouts, dout)  # `[B, T, L+1, vocab]`

        # Compute Transducer loss
        log_probs = torch.log_softmax(logits, dim=-1)
        assert log_probs.size(2) == ys_out.size(1) + 1
        if self.device_id >= 0:
            ys_out = ys_out.to(eouts.device)
            elens = elens.to(eouts.device)
            ylens = ylens.to(eouts.device)
            import warp_rnnt
            loss = warp_rnnt.rnnt_loss(log_probs,
                                       ys_out.int(),
                                       elens,
                                       ylens,
                                       average_frames=False,
                                       reduction='mean',
                                       gather=False)
        else:
            import warprnnt_pytorch
            self.warprnnt_loss = warprnnt_pytorch.RNNTLoss()
            loss = self.warprnnt_loss(log_probs, ys_out.int(), elens, ylens)
            # NOTE: Transducer loss has already been normalized by bs
            # NOTE: index 0 is reserved for blank in warprnnt_pytorch

        return loss
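
The assertion `log_probs.size(2) == ys_out.size(1) + 1` comes from how the joint network broadcasts T encoder frames against L+1 prediction-network states (the extra state corresponds to the prepended <sos>). Below is a minimal additive joint network illustrating those shapes; this is an assumption, and the actual `self.joint` may use different projections and activations.

import torch
import torch.nn as nn

class JointNetworkSketch(nn.Module):
    # `[B, T, H_enc]` x `[B, L+1, H_dec]` -> logits `[B, T, L+1, vocab]`
    def __init__(self, enc_n_units, dec_n_units, joint_dim, vocab):
        super().__init__()
        self.w_enc = nn.Linear(enc_n_units, joint_dim)
        self.w_dec = nn.Linear(dec_n_units, joint_dim)
        self.output = nn.Linear(joint_dim, vocab)

    def forward(self, eouts, douts):
        # broadcast-add over the time (T) and label (L+1) axes
        out = torch.tanh(self.w_enc(eouts).unsqueeze(2) + self.w_dec(douts).unsqueeze(1))
        return self.output(out)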
Example #10
    def encode(self, xs, task='all', flip=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, each element of which is a Tensor of size `[T, input_dim]`
            task (str): all or ys* or ys_sub1* or ys_sub2*
            flip (bool): if True, flip acoustic features in the time-dimension
        Returns:
            enc_outs (dict):

        """
            
        if self.input_type == 'speech':

            xlens = torch.IntTensor([len(x) for x in xs])
            # Flip acoustic features along the time dimension
            if flip:
                xs = [torch.from_numpy(np.flip(x, axis=0).copy()).float().cuda(self.device_id) for x in xs]
            else:
                xs = [np2tensor(x, self.device_id).float() for x in xs]
            xs = pad_list(xs, 0.0)

            # SpecAugment
            if self.is_specaug and self.training:
                xs = self.specaug(xs)

            # Gaussian noise injection
            if self.gaussian_noise:
                xs = add_gaussian_noise(xs)

            # Sequence summary network
            if self.ssn is not None:
                xs += self.ssn(xs, xlens)

        elif self.input_type == 'text':
            xlens = torch.IntTensor([len(x) for x in xs])
            xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id) for x in xs]
            xs = pad_list(xs, self.pad)
            xs = self.embed(xs)

        # encoder
        enc_outs = self.enc(xs, xlens, task.split('.')[0])
        if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer']:
            for sub in ['sub1', 'sub2']:
                enc_outs['ys_' + sub]['xs'] = enc_outs['ys']['xs'].clone()
                enc_outs['ys_' + sub]['xlens'] = enc_outs['ys']['xlens'][:]
        del xs 
        return enc_outs
Example #11
def test_forward(args):
    args = make_args(**args)

    batch_size = 4
    xmaxs = [40, 45] if args['chunk_size_left'] == -1 else [1600, 1655]
    device_id = -1
    module = importlib.import_module(
        'neural_sp.models.seq2seq.encoders.conformer')
    enc = module.ConformerEncoder(**args)
    for xmax in xmaxs:
        xs = np.random.randn(batch_size, xmax,
                             args['input_dim']).astype(np.float32)
        xlens = torch.IntTensor([len(x) for x in xs])
        xs = pad_list([np2tensor(x, device_id).float() for x in xs], 0.)
        enc_out_dict = enc(xs, xlens, task='all')

        assert enc_out_dict['ys']['xs'].size(0) == batch_size, xs.size()
        assert enc_out_dict['ys']['xs'].size(
            1) == enc_out_dict['ys']['xlens'][0], xs.size()
        if args['n_layers_sub1'] > 0:
            assert enc_out_dict['ys_sub1']['xs'].size(
                0) == batch_size, xs.size()
            assert enc_out_dict['ys_sub1']['xs'].size(
                1) == enc_out_dict['ys_sub1']['xlens'][0], xs.size()
        if args['n_layers_sub2'] > 0:
            assert enc_out_dict['ys_sub2']['xs'].size(
                0) == batch_size, xs.size()
            assert enc_out_dict['ys_sub2']['xs'].size(
                1) == enc_out_dict['ys_sub2']['xlens'][0], xs.size()
Example #12
    def lm_rescoring(self, hyps, lm, lm_weight, reverse=False, tag=''):
        for i in range(len(hyps)):
            ys = hyps[i]['hyp']  # include <sos>
            if reverse:
                ys = ys[::-1]
            ys = [np2tensor(np.fromiter(ys, dtype=np.int64), self.device_id)]
            ys_in = pad_list([y[:-1] for y in ys], -1)  # `[1, L-1]`
            ys_out = pad_list([y[1:] for y in ys], -1)  # `[1, L-1]`

            lmout, lmstate, scores_lm = lm.predict(ys_in, None)
            score_lm = sum(
                [scores_lm[0, t, ys_out[0, t]] for t in range(ys_out.size(1))])
            score_lm /= ys_out.size(1)

            hyps[i]['score'] += score_lm * lm_weight
            hyps[i]['score_lm_' + tag] = score_lm
Example #13
    def __call__(self, logits, elens, ys, ylens):
        """Forced alignment with references.

        Args:
            logits (FloatTensor): `[B, T, vocab]`
            elens (List): length `[B]`
            ys (List): length `[B]`, each element of which is a list of size `[L]`
            ylens (List): length `[B]`
        Returns:
            trigger_points (IntTensor): `[B, L]`

        """
        with torch.no_grad():
            ys = [
                np2tensor(np.fromiter(y, dtype=np.int64), logits.device)
                for y in ys
            ]
            ys_in_pad = pad_list(ys, 0)

            # zero padding
            mask = make_pad_mask(elens.to(logits.device))
            mask = mask.unsqueeze(2).expand_as(logits)
            logits = logits.masked_fill_(mask == 0, self.log0)
            log_probs = torch.log_softmax(logits, dim=-1).transpose(
                0, 1)  # `[T, B, vocab]`

            trigger_points = self.align(log_probs, elens, ys_in_pad, ylens)
        return trigger_points
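
`make_pad_mask` produces a per-frame validity mask; combined with the `mask == 0` test above, padded frames are filled with `self.log0` before the log-softmax. A sketch under the assumption that the helper marks valid frames as True:

import torch

def make_pad_mask_sketch(seq_lens):
    # True for valid frames, False for padding: `[B]` -> `[B, T_max]`.
    max_len = int(seq_lens.max())
    arange = torch.arange(max_len, device=seq_lens.device).unsqueeze(0)  # `[1, T_max]`
    return arange < seq_lens.unsqueeze(1)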
Example #14
def test_forward(args):
    args = make_args(**args)

    batch_size = 4
    xmaxs = [40, 45] if int(args['chunk_size_left'].split('_')[0]) == -1 else [400, 455]
    device = "cpu"

    module = importlib.import_module('neural_sp.models.seq2seq.encoders.rnn')
    enc = module.RNNEncoder(**args)
    enc = enc.to(device)

    for xmax in xmaxs:
        xs = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32)
        xlens = torch.IntTensor([len(x) - i * enc.subsampling_factor for i, x in enumerate(xs)])

        # shuffle
        perm_ids = torch.randperm(batch_size)
        xs = xs[perm_ids]
        xlens = xlens[perm_ids]

        xs = pad_list([np2tensor(x, device).float() for x in xs], 0.)
        enc_out_dict = enc(xs, xlens, task='all')

        assert enc_out_dict['ys']['xs'].size(0) == batch_size
        assert enc_out_dict['ys']['xs'].size(1) == enc_out_dict['ys']['xlens'].max()
        for b in range(batch_size):
            if 'conv' in args['enc_type'] or args['subsample_type'] in ['max_pool', '1dconv', 'drop', 'add']:
                assert enc_out_dict['ys']['xlens'][b].item() == math.ceil(xlens[b].item() / enc.subsampling_factor)
            else:
                assert enc_out_dict['ys']['xlens'][b].item() == xlens[b].item() // enc.subsampling_factor

        if args['n_layers_sub1'] > 0:
            # all outputs
            assert enc_out_dict['ys_sub1']['xs'].size(0) == batch_size
            assert enc_out_dict['ys_sub1']['xs'].size(1) == enc_out_dict['ys_sub1']['xlens'].max()
            for b in range(batch_size):
                if 'conv' in args['enc_type'] or args['subsample_type'] in ['max_pool', '1dconv', 'drop', 'add']:
                    assert enc_out_dict['ys_sub1']['xlens'][b].item() == math.ceil(
                        xlens[b].item() / enc.subsampling_factor_sub1)
                else:
                    assert enc_out_dict['ys_sub1']['xlens'][b].item() == xlens[b].item() // enc.subsampling_factor_sub1
            # single output
            enc_out_dict_sub1 = enc(xs, xlens, task='ys_sub1')
            assert enc_out_dict_sub1['ys_sub1']['xs'].size(0) == batch_size
            assert enc_out_dict_sub1['ys_sub1']['xs'].size(1) == enc_out_dict['ys_sub1']['xlens'].max()

        if args['n_layers_sub2'] > 0:
            # all outputs
            assert enc_out_dict['ys_sub2']['xs'].size(0) == batch_size
            assert enc_out_dict['ys_sub2']['xs'].size(1) == enc_out_dict['ys_sub2']['xlens'].max()
            for b in range(batch_size):
                if 'conv' in args['enc_type'] or args['subsample_type'] in ['max_pool', '1dconv', 'drop', 'add']:
                    assert enc_out_dict['ys_sub2']['xlens'][b].item() == math.ceil(
                        xlens[b].item() / enc.subsampling_factor_sub2)
                else:
                    assert enc_out_dict['ys_sub2']['xlens'][b].item() == xlens[b].item() // enc.subsampling_factor_sub2
            # single output
            enc_out_dict_sub2 = enc(xs, xlens, task='ys_sub2')
            assert enc_out_dict_sub2['ys_sub2']['xs'].size(0) == batch_size
            assert enc_out_dict_sub2['ys_sub2']['xs'].size(1) == enc_out_dict_sub2['ys_sub2']['xlens'].max()
Example #15
def test_blockwise(args):
    args = make_args(**args)

    batch_size = 4
    xmaxs = [1600, 1655]
    device_id = -1
    module = importlib.import_module(
        'neural_sp.models.seq2seq.encoders.transformer')

    N_l = args['chunk_size_left']
    N_c = args['chunk_size_current']
    N_r = args['chunk_size_right']

    for xmax in xmaxs:
        xs = np.random.randn(batch_size, xmax,
                             args['input_dim']).astype(np.float32)
        xs = pad_list([np2tensor(x, device_id).float() for x in xs], 0.)

        xs_block = module.blockwise(xs, N_l, N_c, N_r)

        # Extract the center region
        xs_block = xs_block[:,
                            N_l:N_l + N_c]  # `[B * n_blocks, N_c, input_dim]`
        xs_block = xs_block.contiguous().view(batch_size, -1, xs_block.size(2))
        xs_block = xs_block[:, :xmax]

        assert xs_block.size() == xs.size()
        assert torch.equal(xs_block, xs)
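
The round trip in this test fixes the semantics of `blockwise`: the input is cut into overlapping blocks of `N_l + N_c + N_r` frames with hop `N_c`, stacked along the batch axis, and concatenating the centre `N_c` regions per utterance recovers the input. A sketch of that chunking, where zero-padded boundaries and batch-major block ordering are assumptions:

import math
import torch

def blockwise_sketch(xs, n_left, n_center, n_right):
    # `[B, T, d]` -> `[B * n_blocks, n_left + n_center + n_right, d]`, hop n_center.
    bs, xmax, dim = xs.size()
    n_blocks = math.ceil(xmax / n_center)
    block_len = n_left + n_center + n_right
    xs_pad = xs.new_zeros(bs, n_left + n_blocks * n_center + n_right, dim)
    xs_pad[:, n_left:n_left + xmax] = xs
    blocks = torch.stack([xs_pad[:, b * n_center:b * n_center + block_len]
                          for b in range(n_blocks)], dim=1)  # `[B, n_blocks, block_len, d]`
    return blocks.view(bs * n_blocks, block_len, dim)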
Example #16
def test_transformer_forward(args):
    args = make_transformer_args(**args)

    batch_size = 4
    xmax = 40 if args['chunk_size_left'] == -1 else 1600
    device_id = -1
    xs = np.random.randn(batch_size, xmax,
                         args['input_dim']).astype(np.float32)
    xlens = torch.IntTensor([len(x) for x in xs])
    xs = pad_list([np2tensor(x, device_id).float() for x in xs], 0.)

    transformer = importlib.import_module(
        'neural_sp.models.seq2seq.encoders.transformer')
    enc = transformer.TransformerEncoder(**args)
    enc_out_dict = enc(xs, xlens, task='all')
    assert enc_out_dict['ys']['xs'].size(0) == batch_size
    assert enc_out_dict['ys']['xs'].size(1) == enc_out_dict['ys']['xlens'][0]
    if args['n_layers_sub1'] > 0:
        assert enc_out_dict['ys_sub1']['xs'].size(0) == batch_size
        assert enc_out_dict['ys_sub1']['xs'].size(
            1) == enc_out_dict['ys_sub1']['xlens'][0]
    if args['n_layers_sub2'] > 0:
        assert enc_out_dict['ys_sub2']['xs'].size(0) == batch_size
        assert enc_out_dict['ys_sub2']['xs'].size(
            1) == enc_out_dict['ys_sub2']['xlens'][0]
Example #17
def test_forward_2d(args):
    args = make_args_2d(**args)

    batch_size = 4
    xmaxs = [40, 45]
    device = "cpu"

    module = importlib.import_module('neural_sp.models.seq2seq.encoders.conv')
    (channels, kernel_sizes, strides,
     poolings), is_1dconv = module.parse_cnn_config(args['channels'],
                                                    args['kernel_sizes'],
                                                    args['strides'],
                                                    args['poolings'])
    assert not is_1dconv
    enc = module.ConvEncoder(**args)
    enc = enc.to(device)

    for xmax in xmaxs:
        xs = np.random.randn(batch_size, xmax,
                             args['input_dim']).astype(np.float32)
        xlens = torch.IntTensor([len(x) for x in xs])
        xs = pad_list([np2tensor(x, device).float() for x in xs], 0.)
        xs, xlens = enc(xs, xlens)

        assert xs.size(0) == batch_size, xs.size()
        assert xs.size(1) == xlens[0], xs.size()
Example #18
def test_forward(args):
    args = make_args(**args)

    batch_size = 4
    emax = 40
    device = "cpu"

    eouts = np.random.randn(batch_size, emax, ENC_N_UNITS).astype(np.float32)
    elens = torch.IntTensor([len(x) for x in eouts])
    eouts = pad_list([np2tensor(x, device).float() for x in eouts], 0.)
    ylens = [4, 5, 3, 7]
    ys = [np.random.randint(0, VOCAB, ylen).astype(np.int32) for ylen in ylens]

    if args['lm_init'] or args['lm_fusion']:
        args_lm = make_args_rnnlm()
        module_rnnlm = importlib.import_module('neural_sp.models.lm.rnnlm')
        args['external_lm'] = module_rnnlm.RNNLM(args_lm).to(device)

    module = importlib.import_module('neural_sp.models.seq2seq.decoders.las')
    dec = module.RNNDecoder(**args)
    dec = dec.to(device)

    loss, observation = dec(eouts, elens, ys, task='all')
    assert loss.dim() == 1
    assert loss.size(0) == 1
    assert loss.item() >= 0
    assert isinstance(observation, dict)
Example #19
def test_forward(args):
    args = make_args(**args)

    batch_size = 4
    xmaxs = [40, 45] if args['chunk_size_left'] == -1 else [400, 455]
    device = "cpu"

    module = importlib.import_module('neural_sp.models.seq2seq.encoders.transformer')
    enc = module.TransformerEncoder(**args)
    enc = enc.to(device)

    for xmax in xmaxs:
        xs = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32)
        xlens = torch.IntTensor([len(x) - i * enc.subsampling_factor for i, x in enumerate(xs)])
        xs = pad_list([np2tensor(x, device).float() for x in xs], 0.)

        # for mode in ['train', 'eval']:  # too slow
        for mode in ['train']:
            if mode == 'train':
                enc.train()
                enc_out_dict = enc(xs, xlens, task='all')
            elif mode == 'eval':
                enc.eval()
                with torch.no_grad():
                    enc_out_dict = enc(xs, xlens, task='all')
                    # enc._plot_attention()  # too slow

            assert enc_out_dict['ys']['xs'].size(0) == batch_size, xs.size()
            assert enc_out_dict['ys']['xs'].size(1) == enc_out_dict['ys']['xlens'][0], xs.size()
            if args['n_layers_sub1'] > 0:
                assert enc_out_dict['ys_sub1']['xs'].size(0) == batch_size, xs.size()
                assert enc_out_dict['ys_sub1']['xs'].size(1) == enc_out_dict['ys_sub1']['xlens'][0], xs.size()
            if args['n_layers_sub2'] > 0:
                assert enc_out_dict['ys_sub2']['xs'].size(0) == batch_size, xs.size()
                assert enc_out_dict['ys_sub2']['xs'].size(1) == enc_out_dict['ys_sub2']['xlens'][0], xs.size()
Example #20
def test_fixed_config_forward(args):
    args = make_args(**args)

    batch_size = 4
    xmax = 400
    input_dim = 80
    device = "cpu"

    xs = np.random.randn(batch_size, xmax, input_dim).astype(np.float32)
    xs = pad_list([np2tensor(x, device).float() for x in xs], 0.)

    module = importlib.import_module(
        'neural_sp.models.seq2seq.frontends.spec_augment')
    specaug = module.SpecAugment(**args)

    # fixed setting
    specaug.librispeech_basic()
    out = specaug(xs)
    assert out.size() == xs.size()

    specaug.librispeech_double()
    out = specaug(xs)
    assert out.size() == xs.size()

    specaug.switchboard_mild()
    out = specaug(xs)
    assert out.size() == xs.size()

    specaug.switchboard_strong()
    out = specaug(xs)
    assert out.size() == xs.size()
Example #21
    def generate_probs(self, batch, lm=None, lm_weight=0, temperature=1):
        # Encode input features
        if self.input_type == 'speech':
            enc_outs = self.encode(batch['xs'], task='ys')
        else:
            enc_outs = self.encode(batch['ys_sub1'], task='ys')

        # for the forward decoder in the main task
        logits = self.dec_fwd.forward_att(enc_outs['ys']['xs'],
                                          enc_outs['ys']['xlens'],
                                          batch['ys'],
                                          return_logits=True)
        teacher_probs = torch.softmax(logits / temperature, dim=-1).data

        if lm is not None and lm_weight > 0:
            # Append <sos> and <eos>
            eos = logits.new_zeros(1).fill_(self.eos).long()
            _ys = [
                np2tensor(np.fromiter(y, dtype=np.int64), self.device_id)
                for y in batch['ys']
            ]
            ys_in = [torch.cat([eos, y], dim=0) for y in _ys]
            ys_in_pad = pad_list(ys_in, self.pad)
            lmout, _ = lm.decode(lm.encode(ys_in_pad), None)
            lm_probs = torch.softmax(lm.generate(lmout), dim=-1).data
            teacher_probs = (1 - lm_weight) * teacher_probs + lm_weight * lm_probs

        return teacher_probs
Example #22
    def _forward(self, ys):
        if self.backward:
            ys = [
                np2tensor(np.fromiter(y[::-1], dtype=np.int64),
                          self.device_id).long() for y in ys
            ]
        else:
            ys = [
                np2tensor(np.fromiter(y, dtype=np.int64),
                          self.device_id).long() for y in ys
            ]

        ys = pad_list(ys, self.pad)
        ys_in = ys[:, :-1]
        ys_out = ys[:, 1:]

        # Pass through the embedding layer
        ys_in = self.embed(ys_in)

        if self.fast_impl:
            ys_in, _ = self.rnn(ys_in, hx=None)
            ys_in = self.dropout_top(ys_in)
        else:
            xs_lower = None
            for l in range(self.nlayers):
                # Pass through the RNN
                ys_in, _ = self.rnn[l](ys_in, hx=None)
                ys_in = self.dropout[l](ys_in)

                # Residual connection
                if self.residual and l > 0:
                    ys_in += xs_lower
                    xs_lower = ys_in
                # NOTE: Exclude residual connection from the raw inputs

        logits = self.output(ys_in)

        # Compute XE sequence loss
        loss = F.cross_entropy(logits.view((-1, logits.size(2))),
                               ys_out.contiguous().view(-1),
                               ignore_index=self.pad,
                               reduction='mean')

        # Compute token-level accuracy in teacher-forcing
        pad_pred = logits.view(ys_out.size(0), ys_out.size(1),
                               logits.size(-1)).argmax(2)
        mask = ys_out != self.pad
        numerator = torch.sum(
            pad_pred.masked_select(mask) == ys_out.masked_select(mask))
        denominator = torch.sum(mask)
        acc = float(numerator) * 100 / float(denominator)

        observation = {
            'loss': loss.item(),
            'acc': acc,
            'ppl': math.exp(loss.item())
        }

        return loss, observation
Example #23
    def generate_lm_logits(self, ys, lm, temperature=5.0):
        # Append <sos> and <eos>
        eos = next(lm.parameters()).new_zeros(1).fill_(self.eos).long()
        ys = [np2tensor(np.fromiter(y, dtype=np.int64), self.device) for y in ys]
        ys_in = pad_list([torch.cat([eos, y], dim=0) for y in ys], self.pad)
        lmout, _ = lm.decode(ys_in, None)
        logits = lm.output(lmout)
        return logits
Example #24
    def forward_lmobj(self, ys):
        """Compute XE loss for LM objective.

        Args:
            ys (list): A list of length `[B]`, each element of which is a list of size `[L]`
        Returns:
            loss (FloatTensor): `[1]`
            acc (float): accuracy
            ppl (float): perplexity

        """
        w = next(self.parameters())

        # Append <sos> and <eos>
        eos = w.new_zeros(1).fill_(self.eos).long()
        ys = [
            np2tensor(np.fromiter(y, dtype=np.int64), self.device_id)
            for y in ys
        ]
        ylens = np2tensor(
            np.fromiter([y.size(0) + 1 for y in ys],
                        dtype=np.int32))  # +1 for <eos>
        ys_in_pad = pad_list([torch.cat([eos, y], dim=0) for y in ys],
                             self.pad)
        ys_out_pad = pad_list([torch.cat([y, eos], dim=0) for y in ys],
                              self.pad)

        # Update prediction network
        dout, _ = self.recurrency(self.embed(ys_in_pad), None)
        logits = self.output_lmobj(dout)

        # Compute XE loss for LM objective
        loss = F.cross_entropy(logits.view((-1, logits.size(2))),
                               ys_out_pad.view(-1),
                               ignore_index=self.pad,
                               reduction='mean')

        # Compute token-level accuracy in teacher-forcing
        acc = compute_accuracy(logits, ys_out_pad, self.pad)
        ppl = min(np.exp(loss.item()), np.inf)

        # scale loss for CTC
        loss *= ylens.float().mean()

        return loss, acc, ppl
Example #25
def test_streaming_decoding(params):
    args = make_args(attn_type='mocha')
    params = make_decode_params(**params)

    batch_size = params['recog_batch_size']
    emax = 400
    device = "cpu"

    eouts = np.random.randn(batch_size, emax, ENC_N_UNITS).astype(np.float32)
    eouts = pad_list([np2tensor(x, device).float() for x in eouts], 0.)

    ctc_log_probs = None
    if params['recog_ctc_weight'] > 0:
        ctc_log_probs = torch.zeros(batch_size, emax, VOCAB, device=device)

    args_lm = make_args_rnnlm()
    module_rnnlm = importlib.import_module('neural_sp.models.lm.rnnlm')
    lm = None
    if params['recog_lm_weight'] > 0:
        lm = module_rnnlm.RNNLM(args_lm).to(device)
    if args['lm_fusion']:
        args['external_lm'] = module_rnnlm.RNNLM(args_lm).to(device)

    module = importlib.import_module('neural_sp.models.seq2seq.decoders.las')
    dec = module.RNNDecoder(**args)
    dec = dec.to(device)

    N_l = 5
    n_chunks = math.ceil(emax / N_l)
    hyps = None

    module_bs = importlib.import_module(
        'neural_sp.models.seq2seq.decoders.beam_search')
    helper = module_bs.BeamSearch(params['recog_beam_width'], dec.eos,
                                  params['recog_ctc_weight'],
                                  params['recog_lm_weight'], device)

    dec.eval()
    with torch.no_grad():
        for chunk_idx in range(n_chunks):
            eouts_chunk = eouts[:, N_l * chunk_idx:N_l * (chunk_idx + 1)]
            out = dec.beam_search_block_sync(eouts_chunk,
                                             params,
                                             helper,
                                             idx2token,
                                             hyps,
                                             lm,
                                             ctc_log_probs=ctc_log_probs)
            assert len(out) == 3
            end_hyps, hyps, _ = out
            assert isinstance(end_hyps, list)
            assert isinstance(hyps, list)
Example #26
def test_forward():
    batch_size = 4
    xmax = 40
    input_dim = 80
    device = "cpu"

    xs = np.random.randn(batch_size, xmax, input_dim).astype(np.float32)
    xs = pad_list([np2tensor(x, device).float() for x in xs], 0.)

    out = add_input_noise(xs, std=0.075)
    assert out.size() == xs.size()
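
A plausible one-line implementation of `add_input_noise`, consistent with the size-preserving assertion above (an assumption; the real helper may scale or clip differently):

import torch

def add_input_noise_sketch(xs, std=0.075):
    # Inject zero-mean Gaussian noise with the given standard deviation.
    return xs + torch.randn_like(xs) * std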
Example #27
def test_forward(args):
    args = make_args(**args)

    batch_size = 4
    xmaxs = [40, 45] if args['chunk_size_left'] == -1 else [800, 855]
    device_id = -1
    module = importlib.import_module('neural_sp.models.seq2seq.encoders.rnn')
    enc = module.RNNEncoder(**args)
    for xmax in xmaxs:
        xs = np.random.randn(batch_size, xmax,
                             args['input_dim']).astype(np.float32)
        xlens = torch.IntTensor(
            [len(x) - i * enc.subsampling_factor for i, x in enumerate(xs)])
        xs = pad_list([np2tensor(x, device_id).float() for x in xs], 0.)
        enc_out_dict = enc(xs, xlens, task='all')

        assert enc_out_dict['ys']['xs'].size(0) == batch_size
        assert enc_out_dict['ys']['xs'].size(
            1) == enc_out_dict['ys']['xlens'].max()
        for b in range(batch_size):
            if 'conv' in args['rnn_type'] or args['subsample_type'] in [
                    'max_pool', '1dconv'
            ]:
                assert enc_out_dict['ys']['xlens'][b].item() == math.ceil(
                    xlens[b].item() / enc.subsampling_factor)
            else:
                assert enc_out_dict['ys']['xlens'][b].item() == math.floor(
                    xlens[b].item() / enc.subsampling_factor)
        if args['n_layers_sub1'] > 0:
            assert enc_out_dict['ys_sub1']['xs'].size(0) == batch_size
            assert enc_out_dict['ys_sub1']['xs'].size(
                1) == enc_out_dict['ys_sub1']['xlens'].max()
            for b in range(batch_size):
                if 'conv' in args['rnn_type'] or args['subsample_type'] in [
                        'max_pool', '1dconv'
                ]:
                    assert enc_out_dict['ys_sub1']['xlens'][b].item(
                    ) == math.ceil(xlens[b].item() / enc.subsampling_factor)
                else:
                    assert enc_out_dict['ys_sub1']['xlens'][b].item(
                    ) == math.floor(xlens[b].item() / enc.subsampling_factor)
        if args['n_layers_sub2'] > 0:
            assert enc_out_dict['ys_sub2']['xs'].size(0) == batch_size
            assert enc_out_dict['ys_sub2']['xs'].size(
                1) == enc_out_dict['ys_sub2']['xlens'].max()
            for b in range(batch_size):
                if 'conv' in args['rnn_type'] or args['subsample_type'] in [
                        'max_pool', '1dconv'
                ]:
                    assert enc_out_dict['ys_sub2']['xlens'][b].item(
                    ) == math.ceil(xlens[b].item() / enc.subsampling_factor)
                else:
                    assert enc_out_dict['ys_sub2']['xlens'][b].item(
                    ) == math.floor(xlens[b].item() / enc.subsampling_factor)
Example #28
    def forward(self, eouts, elens, ys, forced_align=False):
        """Compute CTC loss.

        Args:
            eouts (FloatTensor): `[B, T, dec_n_units]`
            elens (list): A list of length `[B]`
            ys (list): A list of length `[B]`, each element of which is a list of size `[L]`
            forced_align (bool): if True, also return trigger points from forced alignment
        Returns:
            loss (FloatTensor): `[1]`
            trigger_points (IntTensor): `[B, L]` if forced_align is True, else None

        """
        # Concatenate all elements in ys for warpctc_pytorch
        ylens = np2tensor(np.fromiter([len(y) for y in ys], dtype=np.int32))
        ys_ctc = torch.cat([
            np2tensor(np.fromiter(y[::-1] if self.bwd else y, dtype=np.int32))
            for y in ys
        ],
                           dim=0)
        # NOTE: do not copy to GPUs here

        # Compute CTC loss
        logits = self.output(eouts)
        loss = self.warpctc_loss(
            logits.transpose(1, 0),  # time-major
            ys_ctc,
            elens.cpu(),
            ylens)
        # NOTE: ctc loss has already been normalized by bs
        # NOTE: index 0 is reserved for blank in warpctc_pytorch
        if self.device_id >= 0:
            loss = loss.cuda(self.device_id)

        # Label smoothing for CTC
        if self.lsm_prob > 0:
            loss = loss * (1 - self.lsm_prob) + kldiv_lsm_ctc(
                logits, elens) * self.lsm_prob

        trigger_points = None
        if forced_align:
            ys = [
                np2tensor(np.fromiter(y, dtype=np.int64), self.device_id)
                for y in ys
            ]
            ys_in_pad = pad_list(ys, 0)  # pad by zero
            trigger_points = self.forced_aligner.align(logits.clone(), elens,
                                                       ys_in_pad, ylens)

        return loss, trigger_points
Example #29
def test_forward(args):
    args = make_args(**args)

    batch_size = 4
    xmax = 400
    input_dim = 80
    device = "cpu"

    xs = np.random.randn(batch_size, xmax, input_dim).astype(np.float32)
    xs = pad_list([np2tensor(x, device).float() for x in xs], 0.)

    module = importlib.import_module(
        'neural_sp.models.seq2seq.frontends.spec_augment')
    specaug = module.SpecAugment(**args)

    out = specaug(xs)
    assert out.size() == xs.size()
Example #30
    def forced_align(self, logits, elens, ys, ylens):
        """Forced alignment with references.

        Args:
            logits (FloatTensor): `[B, T, vocab]`
            elens (List): length `B`
            ys (List): length `B`, each element of which is a list of size `[L]`
            ylens (List): length `B`
        Returns:
            trigger_points (IntTensor): `[B, L]`

        """
        with torch.no_grad():
            ys = [np2tensor(np.fromiter(y, dtype=np.int64), logits.device) for y in ys]
            ys_in_pad = pad_list(ys, 0)
            trigger_points = self.forced_aligner.align(logits.clone(), elens, ys_in_pad, ylens)
        return trigger_points