Example #1
0
    def convert_linear_spectrogram_to_audio(self, spec, Ts=None):
        """Turn a batch of spectrograms into zero-padded waveforms via Griffin-Lim.

        Args:
            spec: Batched tensor that is sliced as ``spec[i, :, 0:T]``;
                presumably laid out as (batch, frequency bins, frames) --
                TODO confirm against the callers.
            Ts: Optional per-item number of leading entries along axis 2 to
                feed to Griffin-Lim. When ``None``, every item uses the full
                ``spec.shape[2]``.

        Returns:
            A zero-initialised tensor of shape
            ``(batch, (max(Ts) - 1) * self.l_hop)``. Row ``i`` starts with the
            ``griffin_lim`` output for item ``i``; the rest of the row is left
            at zero.
        """
        n_items = spec.shape[0]
        full_length = spec.shape[2]

        # Fall back to the full axis-2 extent for every item.
        lengths = [full_length] * n_items if Ts is None else Ts

        longest = (max(lengths) - 1) * self.l_hop
        audios = torch.zeros(n_items, longest)

        # Griffin-Lim is run item by item on NumPy data; a native PyTorch
        # implementation could replace this loop.
        for idx in range(n_items):
            magnitudes = spec[idx, :, :lengths[idx]].cpu().numpy()
            waveform = griffin_lim(magnitudes,
                                   n_iters=self.n_iters,
                                   n_fft=self.n_fft)
            n_samples = waveform.shape[0]
            audios[idx, :n_samples] = torch.from_numpy(waveform)

        return audios
Example #2
0
    def validation_step(self, batch, batch_idx):
        """Compute the validation losses (and optional audio scores) for a batch.

        Args:
            batch: 7-item sequence. Only item 1 (``y_spec``, the spectrogram
                the prediction is compared against), item 4 (``T_ys``, passed
                through to the loss helpers) and item 6 (``path_speech``, wav
                file paths read when scoring) are used.
            batch_idx: Index of the batch; not used here.

        Returns:
            dict holding ``'val_loss'``, ``'loss_L1'``, ``'loss_reg'``, one
            ``'loss_{k}_{s}'`` entry per ``(k, s)`` pair in ``self.f_specs``
            and, when ``self._cfg.train_params.validate_scores`` is truthy,
            the batch-averaged ``'stoi_real'``, ``'pesq_real'``,
            ``'stoi_est'`` and ``'pesq_est'``.
        """
        _, y_spec, _, _, T_ys, _, path_speech = batch

        # spectrogram -> mel -> model prediction -> mel of the prediction
        mel_in = self.ed_mel2spec.spec_to_mel(y_spec)
        spec_pred = self(mel=mel_in)
        mel_pred = self.ed_mel2spec.spec_to_mel(spec_pred)

        l1_term = self.calc_loss(spec_pred, y_spec, T_ys, self.criterion)
        reg_term = self.calc_loss(mel_in, mel_pred, T_ys, self.criterion)
        val_loss = l1_term + self.lreg_factor * reg_term

        # 'val_loss' is inserted first so it keeps the leading position in the
        # dict; its value is refreshed once the smoothing terms are added.
        output = dict(val_loss=val_loss, loss_L1=l1_term, loss_reg=reg_term)

        if self._cfg.train_params.validate_scores:
            # For validation, estimate the waves with standard Griffin-Lim and
            # score the predicted wave against both the real recording and the
            # Griffin-Lim reconstruction of the reference spectrogram.
            n_items = spec_pred.shape[0]
            pred_np = spec_pred.cpu().numpy()
            ref_np = y_spec.cpu().numpy()
            score_sums = {
                'stoi_real': 0.0,
                'pesq_real': 0.0,
                'stoi_est': 0.0,
                'pesq_est': 0.0,
            }

            for idx in range(n_items):
                wav = sf.read(path_speech[idx])[0].astype(np.float32)

                ref_wav = griffin_lim(ref_np[idx, 0, :, :])
                pred_wav = griffin_lim(pred_np[idx, 0, :, :])

                # Trim all three signals to their common length.
                n_samples = min(wav.shape[0], pred_wav.shape[0],
                                ref_wav.shape[0])
                wav = wav[:n_samples, ...]
                ref_wav = ref_wav[:n_samples, ...]
                pred_wav = pred_wav[:n_samples, ...]

                against_real = eval_tts_scores(pred_wav, wav)
                score_sums['stoi_real'] += torch.tensor(against_real['STOI'])
                score_sums['pesq_real'] += torch.tensor(against_real['PESQ'])

                against_est = eval_tts_scores(pred_wav, ref_wav)
                score_sums['stoi_est'] += torch.tensor(against_est['STOI'])
                score_sums['pesq_est'] += torch.tensor(against_est['PESQ'])

            for score_name, score_sum in score_sums.items():
                output[score_name] = score_sum / n_items

        for k, s in self.f_specs:
            smooth_term = self.calc_loss_smooth(spec_pred, y_spec, T_ys, k, s)
            output[f'loss_{k}_{s}'] = smooth_term
            val_loss = val_loss + smooth_term

        output['val_loss'] = val_loss

        return output