Example #1
    def encode(self, xs, task='all', flip=False, use_cache=False, streaming=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, which contains Tensors of size `[T, input_dim]`
            task (str): all/ys*/ys_sub1*/ys_sub2*
            flip (bool): if True, flip acoustic features in the time-dimension
            use_cache (bool): use the encoder state cached from the previous chunk as the initial state
            streaming (bool): perform streaming (chunk-wise) encoding
        Returns:
            eout_dict (dict): encoder outputs (`xs`) and lengths (`xlens`) keyed by task
                (`ys`, `ys_sub1`, `ys_sub2`)

        """
        if self.input_type == 'speech':
            # Frame stacking
            if self.n_stacks > 1:
                xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]

            # Splicing
            if self.n_splices > 1:
                xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
            xlens = torch.IntTensor([len(x) for x in xs])

            # Reverse acoustic features along the time axis
            if flip:
                xs = [torch.from_numpy(np.flip(x, axis=0).copy()).float().cuda(self.device_id) for x in xs]
            else:
                xs = [np2tensor(x, self.device_id).float() for x in xs]
            xs = pad_list(xs, 0.)

            # SpecAugment
            if self.use_specaug and self.training:
                xs = self.specaug(xs)

            # Gaussian noise injection
            if self.gaussian_noise:
                xs = add_gaussian_noise(xs)

            # Sequence summary network
            if self.ssn is not None:
                xs += self.ssn(xs, xlens)

        elif self.input_type == 'text':
            xlens = torch.IntTensor([len(x) for x in xs])
            xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id) for x in xs]
            xs = pad_list(xs, self.pad)
            xs = self.dropout_emb(self.embed(xs))
            # TODO(hirofumi): fix for Transformer

        # encoder
        eout_dict = self.enc(xs, xlens, task.split('.')[0], use_cache, streaming)

        if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer']:
            for sub in ['sub1', 'sub2']:
                eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
                eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]

        return eout_dict
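
A minimal usage sketch for this variant. The trained `model` exposing this `encode` and the 80-dim log-mel inputs are assumptions for illustration, not part of the source:

    import numpy as np

    # `model` and the feature dimensions are hypothetical
    xs = [np.random.randn(T, 80).astype(np.float32) for T in (120, 98)]  # [T, input_dim]
    eout_dict = model.encode(xs, task='ys')  # encode the main task only
    eouts, elens = eout_dict['ys']['xs'], eout_dict['ys']['xlens']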
Example #2
    def encode(self, xs, task='all', streaming=False, lookback=False, lookahead=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, which contains Tensors of size `[T, input_dim]`
            task (str): all/ys*/ys_sub1*/ys_sub2*
            streaming (bool): perform streaming (chunk-wise) encoding
            lookback (bool): truncate the leftmost frames used as lookback CNN context
            lookahead (bool): truncate the rightmost frames used as lookahead CNN context
        Returns:
            eout_dict (dict): encoder outputs (`xs`) and lengths (`xlens`) keyed by task
                (`ys`, `ys_sub1`, `ys_sub2`)

        """
        if self.input_type == 'speech':
            # Frame stacking
            if self.n_stacks > 1:
                xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]

            # Splicing
            if self.n_splices > 1:
                xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]

            xlens = torch.IntTensor([len(x) for x in xs])
            xs = pad_list([np2tensor(x, self.device).float() for x in xs], 0.)

            # SpecAugment
            if self.specaug is not None and self.training:
                xs = self.specaug(xs)

            # Weight noise injection
            if self.weight_noise_std > 0 and self.training:
                self.add_weight_noise(std=self.weight_noise_std)

            # Input Gaussian noise injection
            if self.input_noise_std > 0 and self.training:
                xs = add_input_noise(xs, std=self.input_noise_std)

            # Sequence summary network
            if self.ssn is not None:
                xs = self.ssn(xs, xlens)

        elif self.input_type == 'text':
            xlens = torch.IntTensor([len(x) for x in xs])
            xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device) for x in xs]
            xs = pad_list(xs, self.pad)
            xs = self.dropout_emb(self.embed(xs))
            # TODO(hirofumi): fix for Transformer

        # encoder
        eout_dict = self.enc(xs, xlens, task.split('.')[0], streaming, lookback, lookahead)

        if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv']:
            for sub in ['sub1', 'sub2']:
                eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
                eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]

        return eout_dict
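
This variant threads the streaming flags down to the encoder. A chunk-wise sketch, where `model`, the feature matrix, and the fixed 40-frame chunk size are assumptions:

    import numpy as np

    feat = np.random.randn(400, 80).astype(np.float32)  # hypothetical utterance
    chunks = [feat[i:i + 40] for i in range(0, len(feat), 40)]
    for i, chunk in enumerate(chunks):
        eout_dict = model.encode([chunk], task='ys', streaming=True,
                                 lookback=(i > 0),                  # trim left CNN context
                                 lookahead=(i < len(chunks) - 1))   # trim right CNN context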
Example #3
    def collate_fn(self, batch):
        """Merge a list of per-utterance samples into a mini-batch dict,
        keeping variable-length features as plain Python lists."""

        xs = []
        xlens = []
        ys = []
        ys_hist = []
        ys_sub1 = []
        ys_sub2 = []
        utt_ids = []
        speakers = []
        sessions = []
        text = []
        for item in batch:
            xs.append(item['xs'][0])
            xlens.append(item['xlens'][0])
            ys.append(item['ys'][0])
            ys_hist.append(item['ys_hist'][0])
            ys_sub1.append(item['ys_sub1'])
            ys_sub2.append(item['ys_sub2'])
            utt_ids.append(item['utt_ids'][0])
            speakers.append(item['speakers'][0])
            sessions.append(item['sessions'][0])
            text.append(item['text'])

        # Frame stacking
        if self.num_stacks > 1:
            xs = [stack_frame(x, self.num_stacks, self.num_skips) for x in xs]

        # Splicing
        if self.num_splices > 1:
            xs = [splice(x, self.num_splices, self.num_stacks) for x in xs]

        data = {
            'xs': xs,
            'xlens': xlens,
            'ys': ys,
            'ys_hist': ys_hist,
            'ys_sub1': ys_sub1,
            'ys_sub2': ys_sub2,
            'utt_ids': utt_ids,
            'speakers': speakers,
            'sessions': sessions,
            'text': text
        }

        return data
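
`collate_fn` keeps variable-length features as lists instead of padding them, deferring padding to the model. A wiring sketch, where `dataset` (the object providing this method) and the batch size are assumptions:

    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=32, shuffle=True,
                        collate_fn=dataset.collate_fn)  # `dataset` is hypothetical
    for data in loader:
        xs, xlens = data['xs'], data['xlens']  # lists of per-utterance features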
Example #4
    def encode(self, xs, task='all', flip=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, which contains Tensors of size `[T, input_dim]`
            task (str): all or ys* or ys_sub1* or ys_sub2*
            flip (bool): if True, flip acoustic features in the time-dimension
        Returns:
            enc_outs (dict): encoder outputs (`xs`) and lengths (`xlens`) keyed by task
                (`ys`, `ys_sub1`, `ys_sub2`)

        """
        # Skip encoding entirely for LM-objective sub-tasks
        if 'lmobj' in task:
            eouts = {
                'ys': {
                    'xs': None,
                    'xlens': None
                },
                'ys_sub1': {
                    'xs': None,
                    'xlens': None
                },
                'ys_sub2': {
                    'xs': None,
                    'xlens': None
                }
            }
            return eouts
        else:
            if self.input_type == 'speech':
                # Frame stacking
                if self.n_stacks > 1:
                    xs = [
                        stack_frame(x, self.n_stacks, self.n_skips) for x in xs
                    ]

                # Splicing
                if self.n_splices > 1:
                    xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
                xlens = torch.IntTensor([len(x) for x in xs])

                # Reverse acoustic features along the time axis
                if flip:
                    xs = [
                        torch.from_numpy(np.flip(
                            x, axis=0).copy()).float().cuda(self.device_id)
                        for x in xs
                    ]
                else:
                    xs = [np2tensor(x, self.device_id).float() for x in xs]
                xs = pad_list(xs, 0.0)

                # SpecAugment
                if self.is_specaug and self.training:
                    xs = self.specaug(xs)

                # Gaussian noise injection
                if self.gaussian_noise:
                    xs = add_gaussian_noise(xs)

                # Sequence summary network
                if self.ssn is not None:
                    xs += self.ssn(xs, xlens)

            elif self.input_type == 'text':
                xlens = torch.IntTensor([len(x) for x in xs])
                xs = [
                    np2tensor(np.fromiter(x, dtype=np.int64), self.device_id)
                    for x in xs
                ]
                xs = pad_list(xs, self.pad)
                xs = self.embed(xs)

            # encoder
            enc_outs = self.enc(xs, xlens, task.split('.')[0])

            if self.main_weight < 1 and self.enc_type in [
                    'conv', 'tds', 'gated_conv', 'transformer',
                    'conv_transformer'
            ]:
                for sub in ['sub1', 'sub2']:
                    enc_outs['ys_' + sub]['xs'] = enc_outs['ys']['xs'].clone()
                    enc_outs['ys_' + sub]['xlens'] = enc_outs['ys']['xlens'][:]

            return enc_outs
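
A sketch of the two call patterns this variant distinguishes; `model` and the inputs are assumptions:

    import numpy as np

    xs = [np.random.randn(T, 80).astype(np.float32) for T in (120, 98)]  # hypothetical inputs

    # LM-objective path: encoding is skipped and all outputs are None
    enc_outs = model.encode(xs, task='ys.lmobj')
    assert enc_outs['ys']['xs'] is None

    # Regular path with time-reversed features (e.g. for a right-to-left decoder)
    enc_outs = model.encode(xs, task='ys', flip=True)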