def __init__(self, num_srcs, n_fft, hop_length, win_length, window, center):
    """Store the STFT settings and build the permutation-invariant loss.

    Args:
        num_srcs: Stored unchanged as ``self.num_srcs``.
        n_fft: Stored unchanged as ``self.n_fft``.
        hop_length: Stored unchanged as ``self.hop_length``.
        win_length: Stored unchanged; also the length of the Hann window.
        window: Window name; only ``'hann'`` results in ``self.window``
            being created.
        center: Stored unchanged as ``self.center``.
    """
    self.num_srcs, self.n_fft = num_srcs, n_fft
    self.hop_length, self.win_length = hop_length, win_length
    # NOTE(review): any other window name leaves ``self.window`` unassigned;
    # confirm whether callers ever pass something other than 'hann'.
    if window == 'hann':
        hann = torch.hann_window(win_length)
        # The window is always moved to the current CUDA device.
        self.window = hann.cuda()
    self.center = center
    negative_si_sdr = PairwiseNegSDR("sisdr")
    self.loss = PITLossWrapper(negative_si_sdr, pit_from="pw_mtx")
def kernel_downsample2(zeros=56):
    """Build the windowed kernel used for downsampling by two.

    A non-periodic Hann window of length ``4 * zeros + 1`` is sampled at its
    odd indices (``2 * zeros`` taps) and multiplied by ``sinc`` evaluated on
    ``2 * zeros`` evenly spaced points scaled by pi.

    Args:
        zeros: Controls the kernel length (``2 * zeros`` taps).

    Returns:
        Tensor of shape ``(1, 1, 2 * zeros)``.
    """
    hann = th.hann_window(4 * zeros + 1, periodic=False)
    odd_taps = hann[1::2]
    positions = th.linspace(-zeros + 0.5, zeros - 0.5, 2 * zeros) * math.pi
    return (sinc(positions) * odd_taps).view(1, 1, -1)
def __init__(self, filter_size, block_size):
    """Set up the generator's sizes and its fixed filter window.

    Args:
        filter_size: Length of the Hann window stored in
            ``self.filter_window``.
        block_size: Stored unchanged as ``self.block_size``.
    """
    super(Generator, self).__init__()
    # Runs before ``filter_window`` is attached to the module.
    self.apply(self.init_parameters)

    self.block_size = block_size
    self.filter_size = filter_size
    self.noise_att = 1e-4

    # Hann window rolled by half its length, kept as a non-trainable parameter.
    rolled = torch.hann_window(filter_size).roll(filter_size // 2, -1)
    self.filter_window = nn.Parameter(rolled, requires_grad=False)
    self.filter_coef = None
def hann_filter(kernel_size: int) -> torch.Tensor:
    r"""Create a normalized 2-D Hann kernel.

    Args:
        kernel_size: Side length of the square kernel.

    Returns:
        kernel: Tensor with shape (1, kernel_size, kernel_size) whose entries
        sum to one.
    """
    # A window two samples longer is generated and its end points dropped.
    padded = torch.hann_window(kernel_size + 2, periodic=False)
    taps = padded[1:-1]
    # Outer product of the 1-D window with itself.
    outer = taps.unsqueeze(1) * taps.unsqueeze(0)
    total = outer.sum()
    return outer.view(1, kernel_size, kernel_size) / total
def stft(self, audio):
    '''
    Wrapper around th.stft.

    audio: wave signal as th.Tensor

    Returns the contiguous result of ``th.stft`` computed with a Hann window
    of length ``self.win_length``; ``center`` is the negation of
    ``self.causal``.
    '''
    # Create the window directly on the input's device. Calling ``.cuda()``
    # would place it on the current default GPU, which does not match inputs
    # living on another GPU.
    hann = th.hann_window(self.win_length, device=audio.device)
    spec = th.stft(audio,
                   n_fft=self.fft_bins,
                   hop_length=self.hop_length,
                   win_length=self.win_length,
                   window=hann,
                   center=not self.causal,
                   normalized=self.normalized)
    return spec.contiguous()
def __init__(self, size):
    """Create the analysis window and a second sine-based window.

    Args:
        size: Window length; the hop is a quarter of it.
    """
    super(InstantaneousFrequency, self).__init__()
    self.size = size
    self.hop = size // 4

    # Hann window registered as a parameter.
    self.window = nn.Parameter(torch.hann_window(size))

    # ``size`` angles over [0, 2*pi), end point excluded.
    angles = np.linspace(0, 2 * np.pi, size, endpoint=False)
    sine_window = np.sin(-angles) * np.pi / size
    sine_tensor = torch.from_numpy(sine_window).float()
    self.d_window = nn.Parameter(sine_tensor)
def istft(self, x):
    """Invert a spectrogram with ``torch.istft``.

    Uses a Hann window of length ``self.n_fft`` placed on ``x``'s device and
    a hop of ``self.n_fft // 4``. The requested output length is derived
    from ``x.size(2)``.
    """
    size = self.n_fft
    synthesis_window = torch.hann_window(size).to(x.device)
    n_samples = (x.size(2) - 1) * size // 4
    return torch.istft(
        x,
        n_fft=size,
        hop_length=size // 4,
        win_length=size,
        window=synthesis_window,
        center=True,
        normalized=False,
        onesided=True,
        length=n_samples,
    )
def stft(self, x):
    """Compute ``torch.stft`` of ``x`` with a Hann window.

    The window has length ``self.n_fft`` and is placed on ``x``'s device;
    the hop is ``self.n_fft // 4`` and frames are centred with reflect
    padding.
    """
    size = self.n_fft
    analysis_window = torch.hann_window(size).to(x.device)
    return torch.stft(
        x,
        n_fft=size,
        hop_length=size // 4,
        win_length=size,
        window=analysis_window,
        center=True,
        pad_mode='reflect',
        normalized=False,
        onesided=True,
    )
def file_log_spectrogram(sound, segment_time=20, overlap_time=10):
    r"""Load a sound file and return the log of its spectrogram.

    Args:
        sound: Path passed to ``torchaudio.load``.
        segment_time: Segment length; converted to samples as
            ``segment_time * fs / 1000``.
        overlap_time: Overlap between segments, converted the same way.

    Returns:
        The log-spectrogram with its dimensions 1 and 2 swapped and
        singleton dimensions squeezed out.
    """
    waveform, fs = torchaudio.load(sound)

    # TODO: do not hardcode these
    samples_per_segment = int(segment_time * fs / 1000)
    samples_overlap = int(overlap_time * fs / 1000)
    hop = samples_per_segment - samples_overlap

    hann = torch.hann_window(samples_per_segment)
    spec = F.spectrogram(waveform, 0, hann, samples_per_segment, hop,
                         samples_per_segment, 2, 0)
    # Small offset keeps the logarithm finite for zero-valued bins.
    log_spec = torch.log(spec + 1e-10)
    return torch.squeeze(torch.transpose(log_spec, 1, 2))
def __init__(self, n_fft=4096, n_hop=1024, center=False, window=None):
    """Configure the STFT module.

    Args:
        n_fft: FFT size; also the length of the default Hann window.
        n_hop: Hop size between frames.
        center: Stored unchanged as ``self.center``.
        window: Optional analysis window. When omitted, a non-trainable
            Hann window of length ``n_fft`` is created.
    """
    super(TorchSTFT, self).__init__()
    # The check used to be inverted: the default ``None`` was stored as-is
    # and any caller-supplied window was replaced by a Hann window.
    if window is None:
        self.window = nn.Parameter(torch.hann_window(n_fft),
                                   requires_grad=False)
    else:
        self.window = window

    self.n_fft = n_fft
    self.n_hop = n_hop
    self.center = center
def main():
    """Command-line entry point: separate a mixture file with a trained model.

    Reads channel 0 of the input file, computes its STFT, feeds the padded
    magnitude to the U-Net, applies the predicted mask to the STFT, inverts
    it and writes the result to the output path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input',
                        help='Input mixture .wav file\nIf input contains '
                             'more than one channel, ch0 will be used',
                        type=str)
    parser.add_argument('output',
                        help='Output path of separated .wav file',
                        type=str)
    parser.add_argument('--model', '-m', help='Trained model',
                        type=str, metavar='PATH', required=True)
    parser.add_argument('--gpu', '-g',
                        help='GPU id (Negative number indicates CPU)',
                        type=int, metavar='ID', default=-1)
    args = parser.parse_args()

    if_use_cuda = torch.cuda.is_available() and args.gpu >= 0
    device = torch.device(f'cuda:{args.gpu}' if if_use_cuda else 'cpu')

    with torch.no_grad():
        sound, _ = torchaudio.load(args.input)
        # Keep only channel 0 while preserving the channel dimension.
        sound = sound[[0], :].to(device)

        window = torch.hann_window(N_FFT, device=device)

        # Convert it to power spectrogram, and pad it to make the number of
        # time frames to a multiple of 64 to be fed into U-NET
        sound_stft = torch.stft(sound, N_FFT, window=window)
        sound_spec = sound_stft.pow(2).sum(-1).sqrt()
        sound_spec, (left, right) = padding(sound_spec)

        # Load the model. ``map_location`` remaps the stored tensors onto the
        # selected device, so a checkpoint saved on a GPU also loads when
        # running on the CPU.
        model = UNet(N_PART)
        model.load_state_dict(torch.load(args.model, map_location=device))
        model.to(device)
        model.eval()

        # Strip the padded frames from the mask before applying it.
        right = sound_spec.size(2) - right
        mask = model(sound_spec).squeeze(0)[:, :, left:right]
        separated = mask.unsqueeze(3) * sound_stft
        separated = torch.istft(separated, N_FFT, window=window,
                                length=sound.size(-1))
        separated = separated.cpu().numpy()

    # Save the separated signals
    sf.write(args.output, separated.T, SAMPLING_RATE)
def istft(magnitude, phase, config):
    """Rebuild a waveform from magnitude and phase.

    Args:
        magnitude: Magnitude tensor.
        phase: Phase tensor (radians), combined with ``magnitude`` through
            cosine and sine.
        config: Object providing ``win_size``, ``fft_size``, ``hop_size``
            and ``device``.

    Returns:
        The output of ``torchaudio.functional.istft``.
    """
    real_part = magnitude * torch.cos(phase)
    imag_part = magnitude * torch.sin(phase)
    # Real and imaginary parts are stacked along a new trailing dimension.
    spec = torch.stack((real_part, imag_part), dim=-1)
    hann = torch.hann_window(config.win_size)

    spec, hann = set_device((spec, hann), config.device)
    return torchaudio.functional.istft(
        spec,
        n_fft=config.fft_size,
        hop_length=config.hop_size,
        win_length=config.win_size,
        window=hann,
    )
def __init__(self, n_fft, hop_length, center=True):
    """Store the spectrogram settings and create the Hann window.

    Args:
        n_fft: FFT size (resolution along the frequency axis); also the
            length of the Hann window.
        hop_length: Step between frames (resolution along the time axis).
        center: Stored unchanged; per the original note, when true the
            t-th frame is centred at ``t * hop_length`` using reflected
            padding on both sides.
    """
    self.n_fft, self.hop_length = n_fft, hop_length
    self.center = center
    # The Hann window weights the samples of each frame; window functions
    # control leakage between FFT frequency bins.
    self.window = torch.hann_window(self.n_fft)
def spectrogram(wav, hparams):
    """Compute a normalized log-power spectrogram limited to ``top_db``.

    Args:
        wav: Input waveform tensor.
        hparams: Object providing ``n_fft``, ``hop_size``, ``win_size`` and
            ``top_db``.

    Returns:
        ``10 * log10`` of the power divided by its maximum (clamped at
        1e-10), floored at ``max - hparams.top_db``.
    """
    # Build the window on the waveform's own device. A hard-coded ``.cuda()``
    # fails without a GPU and mismatches inputs on a non-default GPU.
    window = torch.hann_window(hparams.win_size, device=wav.device)
    stft = torch.stft(
        wav,
        n_fft=hparams.n_fft,
        hop_length=hparams.hop_size,
        win_length=hparams.win_size,
        window=window,
    )
    # Sum of squares over the trailing dimension of the STFT result.
    power = (stft ** 2).sum(dim=-1)
    log_spec = 10. * torch.log10(torch.clamp(power / power.max(), 1e-10))
    return torch.max(log_spec, log_spec.max() - hparams.top_db)
def get_power_loss(y, y1, frame_length=1024, hop_length=256):
    """Squared difference of log STFT magnitudes, summed and divided by batch.

    Args:
        y: First signal batch; flattened to ``(batch, -1)``.
        y1: Second signal batch; flattened with the batch size of ``y``.
        frame_length: STFT frame length and Hann window length.
        hop_length: STFT hop.

    Returns:
        Scalar tensor: sum of squared log-magnitude differences divided by
        the batch size.
    """
    batch = y.size(0)
    hann = torch.hann_window(frame_length, periodic=True)
    if use_cuda:
        hann = hann.cuda()

    def log_magnitude(signal):
        # Flatten per example, transform, then take the log of the magnitude
        # (with a small offset under the square root).
        flat = signal.view(batch, -1)
        spec = torch.stft(flat, frame_length=frame_length, hop=hop_length,
                          window=hann)
        return torch.log(torch.sqrt(torch.sum(spec ** 2, -1) + 1e-5))

    diff = log_magnitude(y) - log_magnitude(y1)
    return torch.sum(diff ** 2) / batch
def func(tensor):
    """Run ``F.griffinlim`` on ``tensor`` with fixed settings."""
    # Window created with the input's device and dtype.
    win_size = 400
    hann = torch.hann_window(win_size, device=tensor.device,
                             dtype=tensor.dtype)
    fft_size = 400
    hop_size = 200
    exponent = 2.
    iterations = 32
    mom = 0.99
    out_length = 1000
    random_init = False
    return F.griffinlim(tensor, hann, fft_size, hop_size, win_size, exponent,
                        iterations, mom, out_length, random_init)
def stft(y, scale='linear'):
    """Return the STFT magnitude of ``y`` on a linear or log scale.

    Uses ``n_fft=1024``, ``hop_length=256`` and a 1024-sample Hann window.

    Args:
        y: Input waveform tensor.
        scale: ``'linear'`` for the magnitude, ``'log'`` for twice the
            natural log of the magnitude (clamped below at 1e-10).

    Returns:
        The magnitude tensor for the requested scale, or ``None`` for any
        other ``scale`` value.
    """
    # Build the window on the input's device. A hard-coded ``.cuda()`` fails
    # without a GPU and mismatches inputs on a non-default GPU.
    window = torch.hann_window(1024, device=y.device)
    D = torch.stft(y, n_fft=1024, hop_length=256, win_length=1024,
                   window=window)
    # Small offset keeps the square root well-behaved at zero.
    D = torch.sqrt(D.pow(2).sum(-1) + 1e-10)
    if scale == 'linear':
        return D
    if scale == 'log':
        return 2 * torch.log(torch.clamp(D, 1e-10, float("inf")))
    # Unknown scales yield None, as before.
    return None
def __getitem__(self, index):
    """Compute spectrograms for one target/noise pair and pickle them.

    Loads the target and noise waveforms at ``index``, computes their
    spectrograms with a periodic Hann window, and writes a dictionary with
    the waveforms and the real/imaginary parts to
    ``<save_path>/<index>.pkl``.

    Args:
        index: Position in ``self.tgt_paths`` / ``self.noi_paths``.

    Returns:
        ``index`` unchanged.
    """
    # ``exist_ok=True`` avoids the check-then-create race when several
    # workers reach this point at the same time.
    os.makedirs(self.save_path, exist_ok=True)

    win_len = int(1024 * (fs / 16))
    window = torch.hann_window(window_length=win_len, periodic=True,
                               dtype=None, layout=torch.strided,
                               device=None, requires_grad=False)

    # NOTE(review): when a path list is None the item is None and is still
    # passed to torchaudio.load — confirm this case cannot occur.
    tgt_item = self.tgt_paths[index] if self.tgt_paths is not None else None
    tgt_wav, _ = torchaudio.load(tgt_item)
    noi_item = self.noi_paths[index] if self.noi_paths is not None else None
    noi_wav, _ = torchaudio.load(noi_item)
    tgt_wav_len = tgt_wav.shape[1]

    spec_tgt = torchaudio.functional.spectrogram(
        waveform=tgt_wav, pad=0, window=window, n_fft=win_len,
        hop_length=int(win_len / 4), win_length=win_len, power=None,
        normalized=False)
    spec_noi = torchaudio.functional.spectrogram(
        waveform=noi_wav, pad=0, window=window, n_fft=win_len,
        hop_length=int(win_len / 4), win_length=win_len, power=None,
        normalized=False)

    # First channel only; the trailing dimension is indexed 0 / 1 for the
    # real / imaginary parts.
    tgt_wav_real = spec_tgt[0, :, :, 0]
    tgt_wav_imag = spec_tgt[0, :, :, 1]
    input_wav_real = spec_noi[0, :, :, 0]
    input_wav_imag = spec_noi[0, :, :, 1]

    batch_dict = {
        "id": index,
        "tgt_wav_len": tgt_wav_len,
        "audio_wav": [noi_wav, tgt_wav],
        "audio_data_Real": [input_wav_real, tgt_wav_real],
        "audio_data_Imagine": [input_wav_imag, tgt_wav_imag]
    }
    with open(self.save_path + '/' + str(index) + '.pkl', 'wb') as f:
        pickle.dump(batch_dict, f)
    return index
def __init__(self, sampling_rate: int = 22050, n_fft: int = 1024, window_size: int = 1024, hop_size: int = 256, num_mels: int = 80, fmin: float = 0., fmax: float = 8000.):
    """Store the STFT sizes and register the mel filter and Hann window.

    Args:
        sampling_rate: Passed to ``mel``.
        n_fft: FFT size; stored and passed to ``mel``.
        window_size: Length of the Hann window buffer.
        hop_size: Hop size; stored as ``self.hop_size``.
        num_mels: Passed to ``mel``.
        fmin: Passed to ``mel``.
        fmax: Passed to ``mel``.
    """
    super().__init__()
    self.n_fft = n_fft
    self.hop_size = hop_size
    self.window_size = window_size
    self.pad_size = (n_fft - hop_size) // 2

    # Buffers are registered in this order: mel filter first, then window.
    filter_bank = mel(sampling_rate, n_fft, num_mels, fmin, fmax)
    self.register_buffer('mel_filter', torch.FloatTensor(filter_bank))
    hann = torch.hann_window(window_size)
    self.register_buffer('window', hann)
def func(tensor):
    """Run ``F.spectral_centroid`` on ``tensor`` with fixed settings."""
    # Window created with the input's device and dtype.
    win_size = 400
    hann = torch.hann_window(win_size, device=tensor.device,
                             dtype=tensor.dtype)
    rate = 44100
    padding_amount = 0
    fft_size = 400
    hop_size = 200
    return F.spectral_centroid(tensor, rate, padding_amount, hann, fft_size,
                               hop_size, win_size)
def _feature_window_function(window_type, window_size, blackman_coeff):
    r"""Return the window tensor for the requested type and size.

    Args:
        window_type: One of HANNING, HAMMING, POVEY, RECTANGULAR, BLACKMAN.
        window_size: Number of samples in the window.
        blackman_coeff: Coefficient used only by the BLACKMAN window.

    Raises:
        Exception: If ``window_type`` is none of the supported values.
    """
    if window_type == HANNING:
        return torch.hann_window(window_size, periodic=False)

    if window_type == HAMMING:
        return torch.hamming_window(window_size, periodic=False,
                                    alpha=0.54, beta=0.46)

    if window_type == POVEY:
        # like hanning but goes to zero at edges
        hann = torch.hann_window(window_size, periodic=False)
        return hann.pow(0.85)

    if window_type == RECTANGULAR:
        return torch.ones(window_size)

    if window_type == BLACKMAN:
        # can't use torch.blackman_window as they use different coefficients
        step = 2 * math.pi / (window_size - 1)
        positions = torch.arange(window_size)
        first_term = 0.5 * torch.cos(step * positions)
        second_term = (0.5 - blackman_coeff) * torch.cos(2 * step * positions)
        return blackman_coeff - first_term + second_term

    raise Exception('Invalid window type ' + window_type)
def test_spectrogram(self):
    """Check ``F.spectrogram`` through the TorchScript functional helper."""
    waveform = torch.rand((1, 1000))

    win_size = 400
    hann = torch.hann_window(win_size)
    padding_amount = 0
    fft_size = 400
    hop_size = 200
    exponent = 2
    normalized = False

    _test_torchscript_functional(
        F.spectrogram, waveform, padding_amount, hann, fft_size, hop_size,
        win_size, exponent, normalized)
def func(tensor):
    """Run ``F.spectrogram`` on ``tensor`` with ``power=None``."""
    # Window created with the input's device and dtype.
    win_size = 400
    hann = torch.hann_window(win_size, device=tensor.device,
                             dtype=tensor.dtype)
    padding_amount = 0
    fft_size = 400
    hop_size = 200
    exponent = None
    normalized = False
    return F.spectrogram(tensor, padding_amount, hann, fft_size, hop_size,
                         win_size, exponent, normalized)
def test_linearity_of_istft2(self):
    """Linearity check: Hann window, centred, not normalized, two-sided."""
    istft_kwargs = dict(
        n_fft=12,
        window=torch.hann_window(12),
        center=True,
        pad_mode='reflect',
        normalized=False,
        onesided=False,
    )
    shape = (2, 12, 7, 2)
    self._test_linearity_of_istft(shape, istft_kwargs)
def __init__(self, fft_size=32, win_size=20, hop_size=10, logratio=0.0, device="cuda"):
    """Store the STFT loss settings and create the Hann window.

    Args:
        fft_size: Stored unchanged as ``self.fft_size``.
        win_size: Stored unchanged; also the Hann window length.
        hop_size: Stored unchanged as ``self.hop_size``.
        logratio: Stored unchanged as ``self.logratio``.
        device: Device the window is moved to.
    """
    super(STFTLoss, self).__init__()
    self.fft_size, self.win_size, self.hop_size = fft_size, win_size, hop_size
    self.logratio = logratio
    hann = torch.hann_window(win_size)
    self.window = hann.to(device)
def spectral_ops(self):
    """Call the spectral and window functions once each.

    Returns:
        Tuple with the results of ``stft``, ``istft`` and the Bartlett,
        Blackman, Hamming, Hann and Kaiser window constructors, in that order.
    """
    signal = torch.randn(10)
    frames = torch.randn(10, 8, 4, 2)

    forward = torch.stft(signal, 8)
    inverse = torch.istft(frames, 8)
    bartlett = torch.bartlett_window(2, dtype=torch.float)
    blackman = torch.blackman_window(2, dtype=torch.float)
    hamming = torch.hamming_window(4, dtype=torch.float)
    hann = torch.hann_window(4, dtype=torch.float)
    kaiser = torch.kaiser_window(4, dtype=torch.float)
    return (forward, inverse, bartlett, blackman, hamming, hann, kaiser)
def forward(self, student_hat, y):
    """Two-band STFT magnitude loss between ``student_hat`` and ``y``.

    Bins below the 3000 Hz cut-off are compared with ``n_fft=2048`` and a
    1200-sample Hann window. Bins from the cut-off upwards are compared with
    ``n_fft=512`` and the central 512 samples of that window. The second
    term is weighted by 10.

    Args:
        student_hat: Predicted signal batch; flattened to ``(batch, -1)``.
        y: Reference signal batch; flattened with the same batch size.

    Returns:
        ``mean(low_band, dim=1) + 10 * mean(high_band, dim=1)``.
    """
    n_batch = student_hat.size(0)
    student_hat = student_hat.view(n_batch, -1)
    y = y.view(n_batch, -1)

    WIN_SIZE = 1200
    long_window = torch.hann_window(WIN_SIZE, periodic=True).to(self.device)
    offset = int((WIN_SIZE - 512) / 2)
    short_window = long_window[offset:offset + 512]

    def band_error(window, win_length, n_fft, band):
        # Transform both signals, keep the selected frequency bins and
        # return the squared L2 norm (over dim 1) of the magnitude gap.
        pred_spec = torch.stft(student_hat, win_length=win_length,
                               hop_length=300, n_fft=n_fft,
                               window=window)[:, band, :, :]
        ref_spec = torch.stft(y, win_length=win_length, hop_length=300,
                              n_fft=n_fft, window=window)[:, band, :, :]
        pred_mag = self.get_magnitude(pred_spec)
        ref_mag = self.get_magnitude(ref_spec)
        gap = torch.abs(pred_mag) - torch.abs(ref_mag)
        return torch.pow(torch.norm(gap, p=2, dim=1), 2)

    # we use fft size 2048 for frequencies lower than 3000hz
    low_cut = int(3000 / (self.sample_rate * 0.5) * 1025)
    loss = band_error(long_window, WIN_SIZE, 2048, slice(None, low_cut))

    high_cut = int(3000 / (self.sample_rate * 0.5) * 257)
    loss1 = band_error(short_window, short_window.size(0), 512,
                       slice(high_cut, None))

    return torch.mean(loss, dim=1) + 10 * torch.mean(loss1, dim=1)
def inv_f(self, input, phase):
    """Invert a magnitude/phase pair with ``istft``.

    Args:
        input: Magnitude tensor.
        phase: Phase tensor (radians), combined with ``input`` through
            cosine and sine.

    Returns:
        The output of ``istft`` using a Hann window of ``self.win_length``.
    """
    real_part = input * torch.cos(phase)
    imag_part = input * torch.sin(phase)
    # Real and imaginary parts are stacked along a new trailing dimension.
    spec = torch.stack([real_part, imag_part], dim=-1)
    hann = torch.hann_window(self.win_length, device=spec.device)
    return istft(
        spec,
        n_fft=self.num_fft,
        hop_length=self.hop_length,
        win_length=self.win_length,
        window=hann,
    )
def __init__(self, stft_params, device):
    """Read the STFT settings and create the window on ``device``.

    Args:
        stft_params: Mapping with the keys ``'n_fft'``, ``'hop_length'``
            and ``'win_length'``.
        device: Device the Hann window is moved to.
    """
    self.device = device
    self.dtype = torch.float32

    for key in ('n_fft', 'hop_length', 'win_length'):
        setattr(self, key, stft_params[key])

    hann = torch.hann_window(self.win_length)
    hann = hann.to(self.dtype)
    self.window = hann.to(self.device)

    self.freq_num = self._cal_freq_num()

    # These three start out empty.
    self.pad = None
    self.pad_len = None
    self.sample_len = None
def __init__(self, fft_size, hop_size, win_size):
    """Store the STFT sizes, the Hann window and the two sub-losses.

    Args:
        fft_size: Stored unchanged as ``self.fft_size``.
        hop_size: Stored unchanged as ``self.hop_size``.
        win_size: Stored unchanged; also the Hann window length.
    """
    super(STFTLoss, self).__init__()
    self.fft_size, self.hop_size, self.win_size = fft_size, hop_size, win_size
    # The window is held as a plain tensor attribute.
    self.window = torch.hann_window(win_size)
    self.sc_loss = SpectralConvergence()
    self.mag_loss = LogSTFTMagnitude()