def __init__(
    self,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    n_bins=84,
    sampling_rate=22050,
):
    """Build the CQT front-end: precompute window/CQT-basis buffers and
    instantiate the nnAudio CQT layer.

    Args:
        n_fft: FFT size (stored for downstream use).
        hop_length: Hop between successive frames, in samples.
        win_length: Length of the Hann analysis window.
        n_bins: Number of CQT frequency bins.
        sampling_rate: Audio sampling rate in Hz.
    """
    super().__init__()
    ##############################################
    # FFT Parameters                              #
    ##############################################
    window = torch.hann_window(win_length).float()
    # librosa_cqt_fn returns (basis, lengths); the filter lengths are unused here.
    cqt_basis, _lengths = librosa_cqt_fn(sampling_rate, n_bins=n_bins, filter_scale=0.5)
    # Single conversion to float32 torch tensor (the old astype + .float() was redundant).
    cqt_basis = torch.from_numpy(cqt_basis.astype(np.float32))
    self.register_buffer("cqt_basis", cqt_basis)
    self.register_buffer("window", window)
    self.n_fft = n_fft
    self.n_bins = n_bins
    self.hop_length = hop_length
    self.win_length = win_length
    self.sampling_rate = sampling_rate
    # BUG FIX: n_bins was hard-coded to 84 in this call, silently ignoring the
    # constructor argument (and potentially disagreeing with cqt_basis above).
    self.spec_layer = Spectrogram.CQT1992v2(
        sr=sampling_rate,
        n_bins=n_bins,
        hop_length=hop_length,
        output_format='Magnitude',
        pad_mode='constant',
        device='cuda:0',  # NOTE(review): hard-coded GPU device — confirm this is intended
        verbose=False,
        trainable=False,
    )
def main():
    """Convert every WAV listed in input_audio.txt to a log-magnitude CQT
    spectrogram and save each one as a torch .pt tensor or a PNG image,
    depending on the --save_type argument."""
    args = parse_args()
    save_type = args.save_type
    spec_layer = Spectrogram.CQT1992v2(sr=22050, n_bins=84, hop_length=256,
                                       pad_mode='constant', device='cuda:0',
                                       verbose=False, trainable=False,
                                       output_format='Magnitude')
    transformedSet = AudioDataset('input_audio.txt', 22050 * 4, sampling_rate=22050, augment=False)
    transformedLoader = DataLoader(transformedSet, batch_size=1)
    # FIX: the file handle was opened without ever being closed; use a context
    # manager. Strip newlines and the .wav suffix to get the output basenames.
    with open('input_audio.txt', 'r') as f:
        lines = [line.strip().replace('.wav', '') for line in f]
    print(lines)
    if len(lines) != len(transformedLoader):
        print('Differences in wavs found and whats in input_audio.txt')
        return
    transformedVoc = []
    for x_t in transformedLoader:
        x_t = x_t.cuda()
        s_t = spec_layer(x_t).detach()
        # Clamp before log to avoid log(0) = -inf.
        s_t = torch.log(torch.clamp(s_t, min=1e-5))
        transformedVoc.append(s_t.cuda())
    if save_type == 'torch':
        print('Saving WAVs as torch pt')
        # FIX: replaced index-based range(len(...)) loops with zip.
        for spec, name in zip(transformedVoc, lines):
            torch.save(spec, name + '.pt')
    if save_type == 'png':
        print('Saving WAVs as image via matplotlib')
        for spec, name in zip(transformedVoc, lines):
            save_spec_images(spec, name)
def __init__(self):
    """Assemble the transcription network: a CQT front-end, two
    frequency-reducing convolutions, a bidirectional LSTM, and an
    88-way (piano keys) pitch classifier."""
    super(Model, self).__init__()
    f_kernel = 128 // network_factor
    self.STFT_layer = Spectrogram.CQT1992v2(
        sr=44100,
        fmin=27.5,
        n_bins=n_bins,
        bins_per_octave=bins_per_octave,
        pad_mode='constant',
        hop_length=HOP_LENGTH,
        center=True,
        device=device,
    )
    # Two conv stages that each downsample the frequency axis by 8.
    self.freq_cnn1 = torch.nn.Conv2d(1, 4, (f_kernel, 3), stride=(8, 1), padding=1)
    self.freq_cnn2 = torch.nn.Conv2d(4, 8, (f_kernel, 3), stride=(8, 1), padding=1)
    # Infer the flattened per-frame feature size after the conv stack
    # (must run after the conv layers above are constructed).
    shape = self.shape_inference(f_kernel)
    lstm_size = shape * 8
    self.bilstm = torch.nn.LSTM(lstm_size, lstm_size,
                                batch_first=True, bidirectional=True)
    # Bidirectional LSTM doubles the feature width fed to the classifier.
    self.pitch_classifier = torch.nn.Linear(lstm_size * 2, 88)
def main():
    """Convert every WAV listed in args.input_file to a log-magnitude CQT
    spectrogram, save each as .pt or .png under args.save_path, and write
    per-file min/max values to normalisation_values.json."""
    args = parse_args()
    file_name = args.input_file
    save_path = args.save_path
    spec_layer = Spectrogram.CQT1992v2(sr=22050, n_bins=84, hop_length=256,
                                       pad_mode='constant', device='cuda:0',
                                       verbose=False, trainable=False,
                                       output_format='Magnitude')
    transformedSet = AudioConversionDataset(file_name, 22050 * 4, sampling_rate=22050, augment=False)
    transformedLoader = DataLoader(transformedSet, batch_size=1)
    # FIX: the file handle was opened without ever being closed; use a
    # context manager. Strip newlines, the .wav suffix, and any directory part.
    with open(file_name, 'r') as f:
        lines = [line.strip() for line in f]
    lines = [track.replace('.wav', '') for track in lines]
    lines = [track.split("/")[-1] for track in lines]
    if len(lines) != len(transformedLoader):
        print('Differences in wavs found and whats in input_audio.txt')
        return
    # FIX: dropped the unused enumerate() index.
    for x in tqdm(transformedLoader, ascii=True, desc='Making spectrogram representations'):
        x_t = x[0]
        fname = os.path.basename(x[1][0]).replace('.wav', '')
        x_t = x_t.cuda()
        s_t = spec_layer(x_t).detach()
        # Clamp before log to avoid log(0) = -inf.
        s_t = torch.log(torch.clamp(s_t, min=1e-5))
        # FIX: s_t is already on the GPU — the repeated per-use .cuda() calls
        # were redundant no-ops and have been removed.
        if args.save_type == 'pt':
            torch.save(s_t, save_path + fname + '.pt')
        else:
            save_image(s_t, save_path + fname + '.png', normalize=True)
        # NOTE(review): normalisation_dict is not defined in this function —
        # presumably a module-level dict; confirm.
        normalisation_dict[fname] = {
            "min": torch.min(s_t).item(),
            "max": torch.max(s_t).item(),
        }
    with open(save_path + 'normalisation_values.json', 'w') as outfile:
        json.dump(normalisation_dict, outfile, indent=4)
def __init__(
    self,
    hop_length=256,
    n_bins=84,
    sampling_rate=22050,
):
    """Build a magnitude-CQT front-end layer.

    Args:
        hop_length: Hop between successive frames, in samples.
        n_bins: Number of CQT frequency bins.
        sampling_rate: Audio sampling rate in Hz.
    """
    super().__init__()
    ##############################################
    # FFT Parameters                              #
    ##############################################
    self.n_bins = n_bins
    self.hop_length = hop_length
    self.sampling_rate = sampling_rate
    # BUG FIX: n_bins was hard-coded to 84 in this call, silently ignoring
    # the constructor argument; pass the parameter through instead.
    self.spec_layer = Spectrogram.CQT1992v2(
        sr=sampling_rate,
        n_bins=n_bins,
        hop_length=hop_length,
        output_format='Magnitude',
        pad_mode='constant',
        device='cuda:0',  # NOTE(review): hard-coded GPU device — confirm this is intended
        verbose=False,
        trainable=False,
    )
# Benchmark Mel spectrogram layers across the configured n_mels values.
for n_mels in n_mels_ls:
    # Guard clause: skip configurations where the mel bank would exceed
    # the FFT resolution (flattened from the old if/else-continue).
    if n_mels >= n_fft:
        continue
    layer = Spectrogram.MelSpectrogram(n_fft=n_fft, n_mels=n_mels, hop_length=512,
                                       verbose=False, device=device)
    start = time.time()
    for i in tqdm.tqdm(dataset):
        i = i.to(device)
        layer(i)
    result[f'Mel-{n_fft}-n_bins{n_mels}'] = time.time() - start

# CQT: benchmark at increasing resolution multipliers r (84*r bins, 12*r per octave).
for r in range(1, 11):
    layer = Spectrogram.CQT1992v2(sr=44100, n_bins=84 * r, bins_per_octave=12 * r,
                                  hop_length=512, verbose=False, device=device)
    start = time.time()
    for i in tqdm.tqdm(dataset):
        i = i.to(device)
        layer(i)
    result[f'CQT-r={r}'] = time.time() - start

# FIX: pickle.dump(result, open(...)) leaked the file handle and could drop
# buffered bytes on abnormal exit; use a context manager instead.
with open(Path(__file__).parent / './Pytorch_result', 'wb') as out_file:
    pickle.dump(result, out_file)
# NOTE(review): this chunk begins mid if/elif chain — the leading branch
# (presumably handling args.device == "CPU" and setting `device`) is outside
# this view; confirm before editing the chain.
elif args.device == "GPU":
    device = "cuda:0"
    print("using GPU")
elif args.device == "librosa":
    print("using librosa")

# Load the pre-saved waveform batch that sits next to this script.
print(Path(__file__).parent / './y_list.npy')
y_list = np.load(Path(__file__).parent / './y_list.npy')

if args.device in ["CPU", "GPU"]:
    y_torch = torch.tensor(y_list, device=device).float()
    spec_layer = Spectrogram.CQT1992v2(sr=44100, n_bins=84, bins_per_octave=24, fmin=55, device=device)
    timing = []
    # 20 repetitions; each run transforms the batch in two slices
    # (first 1000 rows, then the remainder) and times the pair together.
    for e in range(20):
        t_start = time.time()
        spec = spec_layer(y_torch[:1000])
        spec = spec_layer(y_torch[1000:])
        time_used = time.time() - t_start
        # print(time_used)
        timing.append(time_used)
    print("mean = ", np.mean(timing))
    print("std = ", np.std(timing))
    data = pd.DataFrame(timing, columns=['t_avg'])