def test_stft_istft_identity(ctx, window_size, stride, fft_size, window_type, center, pad_mode):
    """Check that ``F.istft(F.stft(x))`` reproduces ``x`` within tolerance.

    The case is skipped when the backend name is plain ``cuda`` and when
    ``is_nola_violation`` reports a violation for the given window, stride,
    FFT size, signal length and centering.
    """
    backend = ctx.backend[0].split(":")[0]
    if backend == 'cuda':
        pytest.skip(
            'CUDA Convolution N-D is only supported in CUDNN extension')

    x_shape = create_stft_input_shape(window_size)
    x = np.random.randn(*x_shape)

    # Skip for NOLA condition violation. ``pytest.skip`` raises, so nothing
    # after it in this branch would run and no ``return`` is needed.
    length = x_shape[1]
    if is_nola_violation(window_type, window_size, stride, fft_size, length, center):
        pytest.skip('NOLA condition violation.')

    x = nn.Variable.from_numpy_array(x)
    with nn.context_scope(ctx):
        yr, yi = F.stft(x, window_size, stride, fft_size,
                        window_type, center, pad_mode)
        # The inverse transform is always called with pad_mode="constant",
        # independent of the ``pad_mode`` used for the forward transform.
        z = F.istft(yr, yi, window_size, stride, fft_size,
                    window_type, center, pad_mode="constant")
    z.forward()

    assert np.allclose(x.d, z.d, atol=1e-5, rtol=1e-5)
def istft_backward(inputs, window_size, stride, fft_size, window_type='hanning', center=True, pad_mode='reflect', as_stft_backward=False):
    """
    Compute the iSTFT gradients by running ``F.stft`` on the incoming gradient.

    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward
        function. Only the first element is read here.
      window_size, stride, fft_size, window_type, center, pad_mode:
        Forwarded unchanged to ``F.stft``.
      as_stft_backward (bool): Its negation is passed to ``F.stft`` as
        ``as_istft_backward``.

    Return:
      tuple of Variable: The two outputs of ``F.stft``, used as the gradients
      wrt the real and imaginary inputs.
    """
    grad_out = inputs[0]
    grad_real, grad_imag = F.stft(
        grad_out,
        window_size,
        stride,
        fft_size,
        window_type,
        center,
        pad_mode,
        as_istft_backward=not as_stft_backward,
    )
    return grad_real, grad_imag
def test_istft(ctx, window_size, stride, fft_size, window_type, center):
    """Check that ``F.istft(F.stft(x))`` matches ``x`` away from the edges.

    The first and last ``window_size - stride`` samples are excluded from the
    comparison. The case is skipped when the backend name is plain ``cuda``.
    """
    backend = ctx.backend[0].split(":")[0]
    if backend == 'cuda':
        pytest.skip('CUDA Convolution N-D is only supported in CUDNN extension')

    # clear all previous STFT conv/deconv kernels
    nn.clear_parameters()

    # Make sure that iSTFT(STFT(x)) = x
    x = np.random.randn(1, window_size * 10)
    nx = nn.Variable.from_numpy_array(x)
    with nn.context_scope(ctx):
        nyr, nyi = F.stft(nx,
                          window_size=window_size,
                          stride=stride,
                          fft_size=fft_size,
                          window_type=window_type,
                          center=center)
        nz = F.istft(nyr, nyi,
                     window_size=window_size,
                     stride=stride,
                     fft_size=fft_size,
                     window_type=window_type,
                     center=center)
    nz.forward()

    invalid = window_size - stride
    # When window_size == stride, ``invalid`` is 0 and ``a[:, 0:-0]`` would be
    # an empty slice, making the assertion pass without comparing anything.
    # Use an open-ended stop in that case so the whole signal is compared.
    valid = slice(invalid, -invalid if invalid else None)
    assert np.allclose(nx.d[:, valid], nz.d[:, valid], atol=1e-5, rtol=1e-5)
def test_stft(window_size, stride, fft_size, window_type):
    """Compare ``F.stft`` (with ``center=False``) against ``sig.stft``."""
    # Drop STFT conv/deconv kernels left over from earlier cases.
    nn.clear_parameters()

    # Reference comparison against `scipy.signal.stft`.
    signal_np = np.random.randn(1, window_size * 10)
    signal_var = nn.Variable.from_numpy_array(signal_np)
    real_part, imag_part = F.stft(
        signal_var,
        window_size=window_size,
        stride=stride,
        fft_size=fft_size,
        window_type=window_type,
        center=False,
    )
    nn.forward_all([real_part, imag_part])
    actual = real_part.d + 1j * imag_part.d

    overlap = window_size - stride
    _, _, expected = sig.stft(
        signal_np,
        window=window_type,
        nperseg=window_size,
        noverlap=overlap,
        nfft=fft_size,
        boundary=None,
        padded=False,
    )

    # SciPy scales its output differently, so rescale before comparing.
    # NOTE(review): the ``fft_size // 2`` factor presumably matches SciPy's
    # normalisation only for particular window/size combinations - confirm
    # against the parametrisation of this test.
    actual /= fft_size // 2
    matches = np.allclose(actual, expected, atol=1e-5, rtol=1e-5)
    assert matches
def test_istft(window_size, stride, fft_size, window_type, center):
    """Check that ``F.istft(F.stft(x))`` matches ``x`` away from the edges.

    The first and last ``window_size - stride`` samples are excluded from the
    comparison.
    """
    # clear all previous STFT conv/deconv kernels
    nn.clear_parameters()

    # Make sure that iSTFT(STFT(x)) = x
    x = np.random.randn(1, window_size * 10)
    nx = nn.Variable.from_numpy_array(x)
    nyr, nyi = F.stft(nx,
                      window_size=window_size,
                      stride=stride,
                      fft_size=fft_size,
                      window_type=window_type,
                      center=center)
    nz = F.istft(nyr, nyi,
                 window_size=window_size,
                 stride=stride,
                 fft_size=fft_size,
                 window_type=window_type,
                 center=center)
    nz.forward()

    invalid = window_size - stride
    # When window_size == stride, ``invalid`` is 0 and ``a[:, 0:-0]`` would be
    # an empty slice, making the assertion pass without comparing anything.
    # Use an open-ended stop in that case so the whole signal is compared.
    valid = slice(invalid, -invalid if invalid else None)
    assert np.allclose(nx.d[:, valid], nz.d[:, valid], atol=1e-5, rtol=1e-5)
def compute_mel(self, wave):
    """Return the clipped log of ``self.basis`` applied to the STFT magnitude.

    The STFT uses ``win_length``, ``hop_length`` and ``n_fft`` from
    ``self.hparams``. Values are clipped to at least 1e-5 before the log, and
    the result is marked ``need_grad=False``.
    """
    hp = self.hparams
    real, imag = F.stft(
        wave,
        window_size=hp.win_length,
        stride=hp.hop_length,
        fft_size=hp.n_fft,
    )

    # Magnitude: sqrt(real^2 + imag^2).
    power = F.add2(F.pow_scalar(real, 2), F.pow_scalar(imag, 2))
    magnitude = F.pow_scalar(power, 0.5)

    # presumably ``self.basis`` is a mel filter bank - verify where it is set.
    projected = F.batch_matmul(self.basis, magnitude)

    # Clip from below so the logarithm never sees values under 1e-5.
    log_mel = F.log(F.clip_by_value(projected, 1e-5, np.inf))
    return log_mel.apply(need_grad=False)
def __init__(self, waveglow, hp):
    """Precompute ``self.bias_spec`` from the model output for a constant input.

    ``waveglow.infer`` is run with ``sigma=0`` on an ``F.constant`` input of
    shape ``[1, hp.n_mels, 88]``. The STFT magnitude of the result is
    evaluated, and its first index along the last axis for the first batch
    element is stored with a trailing singleton axis.
    """
    constant_mel = F.constant(shape=[1, hp.n_mels, 88])
    generated = waveglow.infer(constant_mel, sigma=0)
    real, imag = F.stft(
        generated,
        window_size=hp.win_length,
        stride=hp.hop_length,
        fft_size=hp.n_fft,
    )

    magnitude = F.pow_scalar(real**2 + imag**2, 0.5)
    magnitude.forward(clear_buffer=True)

    # Copy out of the variable buffer before slicing.
    first_slice = magnitude.d.copy()[:, :, 0]
    self.bias_spec = first_slice[0, :, None]
    self.hparams = hp
def ref_istft(y_r, y_i, window_size, stride, fft_size, window_type, center, pad_mode, as_stft_backward):
    """Reference implementation for ``F.istft``.

    Args:
        y_r, y_i: Real and imaginary parts of the spectrogram, as arrays whose
            first axis is the batch axis.
        window_size, stride, fft_size, window_type, center, pad_mode:
            Transform settings. ``fft_size`` and ``pad_mode`` are only used
            in the ``as_stft_backward`` branch.
        as_stft_backward (bool): When false, ``librosa.istft`` is the
            reference. When true, the backward pass of ``F.stft`` is.

    Returns:
        The reconstructed signals stacked into one array (forward case), or
        the gradient ``x.g`` accumulated by the ``F.stft`` backward pass.
    """
    if not as_stft_backward:
        # Use librosa.istft as the forward reference.

        # Apply the same 'hanning' -> 'hann' name translation that ref_stft
        # performs before calling librosa, so both references agree.
        librosa_window = 'hann' if window_type == 'hanning' else window_type

        # Convert to librosa.istft input format.
        y = y_r + 1j * y_i

        # Get original signal length.
        length = create_stft_input_shape(window_size)[1]

        # librosa.istft does not support batched input, so process each
        # batch element separately.
        return np.array([
            librosa.istft(spec,
                          hop_length=stride,
                          win_length=window_size,
                          window=librosa_window,
                          center=center,
                          length=length)
            for spec in y
        ])

    # Use F.stft backward as the reference.
    yr_var = nn.Variable.from_numpy_array(y_r)
    yi_var = nn.Variable.from_numpy_array(y_i)

    # Just create stft inputs.
    x = F.istft(yr_var, yi_var, window_size, stride, fft_size,
                window_type, center, pad_mode, True)

    # Execute istft backward: feed the spectrogram as the output gradient of
    # F.stft and read the gradient accumulated on its input.
    x.need_grad = True
    x.grad.zero()
    z_r, z_i = F.stft(x, window_size, stride, fft_size,
                      window_type, center, pad_mode)
    z_r.g = yr_var.d
    z_i.g = yi_var.d
    z = F.sink(z_r, z_i, one_input_grad=False)
    z.forward()
    z.backward()

    return x.g
def ref_stft(x, window_size, stride, fft_size, window_type, center, pad_mode, as_istft_backward):
    """Reference implementation for ``F.stft``.

    When ``as_istft_backward`` is false, ``librosa.stft`` is applied to each
    batch element and the real and imaginary parts are returned. Otherwise
    the backward pass of ``F.istft`` is run and the gradients of its two
    inputs are returned.
    """
    if as_istft_backward:
        # Use F.istft backward as the reference.
        signal = nn.Variable.from_numpy_array(x)

        # Just create istft inputs.
        spec_r, spec_i = F.stft(signal, window_size, stride, fft_size,
                                window_type, center, pad_mode)

        # Execute istft backward with the signal itself as output gradient.
        for spec in (spec_r, spec_i):
            spec.need_grad = True
            spec.grad.zero()
        recon = F.istft(spec_r, spec_i, window_size, stride, fft_size,
                        window_type, center, pad_mode)
        recon.forward()
        recon.backward(signal.data)

        return spec_r.g, spec_i.g

    # Use librosa.stft as the forward reference; it is called with 'hann'
    # in place of 'hanning'.
    librosa_window = 'hann' if window_type == 'hanning' else window_type

    # librosa.stft does not support batched input, so transform each batch
    # element on its own and stack the results.
    spectra = np.array([
        librosa.stft(row,
                     n_fft=fft_size,
                     hop_length=stride,
                     win_length=window_size,
                     window=librosa_window,
                     center=center,
                     pad_mode=pad_mode)
        for row in x
    ])

    # Convert to nnabla stft output format: separate real/imaginary arrays.
    return spectra.real, spectra.imag
def test_stft(ctx, window_size, stride, fft_size, window_type):
    """Compare ``F.stft`` (with ``center=False``) against ``sig.stft``.

    The case is skipped when the backend name is plain ``cuda``.
    """
    if ctx.backend[0].split(":")[0] == 'cuda':
        pytest.skip('CUDA Convolution N-D is only supported in CUDNN extension')

    # Drop STFT conv/deconv kernels left over from earlier cases.
    nn.clear_parameters()

    # Reference comparison against `scipy.signal.stft`.
    signal_np = np.random.randn(1, window_size * 10)
    signal_var = nn.Variable.from_numpy_array(signal_np)
    with nn.context_scope(ctx):
        real_part, imag_part = F.stft(
            signal_var,
            window_size=window_size,
            stride=stride,
            fft_size=fft_size,
            window_type=window_type,
            center=False,
        )
    nn.forward_all([real_part, imag_part])
    actual = real_part.d + 1j * imag_part.d

    # 'rectangular' and None are passed to SciPy under the name 'boxcar'.
    if window_type is None or window_type == 'rectangular':
        scipy_window = 'boxcar'
    else:
        scipy_window = window_type

    overlap = window_size - stride
    _, _, expected = sig.stft(
        signal_np,
        window=scipy_window,
        nperseg=window_size,
        noverlap=overlap,
        nfft=fft_size,
        boundary=None,
        padded=False,
    )

    # SciPy scales its output differently, so rescale before comparing.
    # NOTE(review): the ``fft_size // 2`` factor presumably matches SciPy's
    # normalisation only for particular window/size combinations - confirm
    # against the parametrisation of this test.
    actual /= fft_size // 2
    matches = np.allclose(actual, expected, atol=1e-5, rtol=1e-5)
    assert matches
def stft(x, n_fft=4096, n_hop=1024, center=True, patch_length=None):
    '''
    Multichannel STFT

    Samples and channels are merged into one leading axis for ``F.stft``
    (hanning window of size ``n_fft``, reflect padding) and split again
    afterwards. When ``patch_length`` is given, only the first
    ``patch_length`` entries of the last axis are returned.

    Input: (nb_samples, nb_channels, nb_timesteps)
    Output: (nb_samples, nb_channels, nb_bins, nb_frames),
            (nb_samples, nb_channels, nb_bins, nb_frames)
    '''
    nb_samples, nb_channels, _ = x.shape
    merged = F.reshape(x, (nb_samples * nb_channels, -1))

    real, imag = F.stft(merged, n_fft, n_hop, n_fft,
                        window_type='hanning',
                        center=center,
                        pad_mode='reflect')

    # Restore the separate sample and channel axes; n_fft // 2 + 1 bins.
    out_shape = (nb_samples, nb_channels, n_fft // 2 + 1, -1)
    real = F.reshape(real, out_shape)
    imag = F.reshape(imag, out_shape)

    if patch_length is None:
        return real, imag

    # Keep only the leading ``patch_length`` frames
    # (e.g. slice 256 frames from 259 frames).
    return real[..., :patch_length], imag[..., :patch_length]
def STFT(x, n_fft=4096, n_hop=1024, center=True):
    """Multichannel STFT

    Samples and channels are merged into one leading axis for ``F.stft``
    (hanning window of size ``n_fft``, reflect padding) and split again
    afterwards.

    Input: (nb_samples, nb_channels, nb_timesteps)
    Output: (nb_samples, nb_channels, nb_bins, nb_frames),
            (nb_samples, nb_channels, nb_bins, nb_frames)
    """
    nb_samples, nb_channels, _ = x.shape
    merged = F.reshape(x, (nb_samples * nb_channels, -1))

    real, imag = F.stft(merged, n_fft, n_hop, n_fft,
                        window_type='hanning',
                        center=center,
                        pad_mode='reflect')

    # Restore the separate sample and channel axes; n_fft // 2 + 1 bins.
    out_shape = (nb_samples, nb_channels, n_fft // 2 + 1, -1)
    return F.reshape(real, out_shape), F.reshape(imag, out_shape)