def __init__(self, sr=SR, n_fft=NFFT, hop_length=HOP, n_mels=NMEL, n_bins=NBIN, mode='mel'):
    self.sr = sr
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.n_mels = n_mels
    self.n_bins = n_bins
    if mode == 'mel':
        self.spectrogram = Spectrogram.MelSpectrogram(
            sr=sr, n_fft=n_fft, n_mels=n_mels, hop_length=hop_length,
            fmin=20, fmax=11000)
    elif mode == 'cqt':
        self.spectrogram = Spectrogram.CQT(
            sr=sr, hop_length=hop_length, fmin=22.5, n_bins=n_bins,
            bins_per_octave=24, pad_mode='constant')
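# Usage sketch for the two branches above. The module-level defaults (SR,
# NFFT, HOP, NMEL, NBIN) are not shown in this snippet, so typical values
# are assumed here purely for illustration:
import torch
from nnAudio import Spectrogram

mel_layer = Spectrogram.MelSpectrogram(sr=22050, n_fft=2048, n_mels=128,
                                       hop_length=512, fmin=20, fmax=11000)
cqt_layer = Spectrogram.CQT(sr=22050, hop_length=512, fmin=22.5, n_bins=84,
                            bins_per_octave=24, pad_mode='constant')
wave = torch.randn(1, 22050)          # one second of dummy audio
print(mel_layer(wave).shape)          # -> (1, 128, time_frames)
print(cqt_layer(wave).shape)          # -> (1, 84, time_frames)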
def wav_to_img(path, save_path, signal_length=None):
    audio_list = os.listdir(path)
    for idx, aud in tqdm(enumerate(audio_list), leave=False):
        song, sr = librosa.load(f'{path}{aud}')
        song = sklearn.preprocessing.minmax_scale(song, axis=0)
        x = torch.tensor(song).to("cuda", dtype=torch.float)
        # All three layers must live on the same device as x; the original
        # omitted the device argument for spec0 and spec1.
        spec0 = Spectrogram.MelSpectrogram(n_fft=1024, n_mels=128,
                                           trainable_mel=False, trainable_STFT=False,
                                           fmin=15, fmax=11000, sr=sr,
                                           pad_mode='reflect', device='cuda:0')
        spec1 = Spectrogram.STFT(n_fft=1024, hop_length=512, trainable=False,
                                 device='cuda:0')
        spec2 = Spectrogram.MFCC(sr=22050, n_mfcc=200, norm='ortho',
                                 device='cuda:0', verbose=True)
        x0 = spec0(x).unsqueeze(1)                 # mel spectrogram channel
        x1 = spec1(x)[:, :, :, 0].unsqueeze(1)     # real part of the STFT
        x2 = spec2(x).unsqueeze(1)                 # MFCC channel
        # Resize all three to 224x224 and stack them as an RGB-like image
        x0 = nnf.interpolate(x0, size=(224, 224), mode='bicubic', align_corners=False)
        x1 = nnf.interpolate(x1, size=(224, 224), mode='bicubic', align_corners=False)
        x2 = nnf.interpolate(x2, size=(224, 224), mode='bicubic', align_corners=False)
        img = torch.cat((x0, x1, x2), dim=1)
        aud1 = aud.replace('.wav', '')
        save_image(img, f'{save_path}{aud1}.jpeg')
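# Example invocation of wav_to_img above; the directory names here are
# hypothetical placeholders, not paths from the original project. Note the
# trailing slashes: the function concatenates path strings rather than
# using os.path.join.
wav_to_img('train_audio/', 'train_images/')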
def __init__(
    self,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    n_bins=84,
    sampling_rate=22050,
):
    super().__init__()
    ##############################################
    #               FFT Parameters               #
    ##############################################
    window = torch.hann_window(win_length).float()
    cqt_basis, lengths = librosa_cqt_fn(sampling_rate, n_bins=n_bins, filter_scale=0.5)
    cqt_basis = torch.from_numpy(cqt_basis.astype(np.float32))
    self.register_buffer("cqt_basis", cqt_basis)
    self.register_buffer("window", window)
    self.n_fft = n_fft
    self.n_bins = n_bins
    self.hop_length = hop_length
    self.win_length = win_length
    self.sampling_rate = sampling_rate
    self.spec_layer = Spectrogram.CQT1992v2(sr=sampling_rate,
                                            n_bins=n_bins,  # was hard-coded to 84
                                            hop_length=hop_length,
                                            output_format='Magnitude',
                                            pad_mode='constant',
                                            device='cuda:0',
                                            verbose=False,
                                            trainable=False)
def __init__(self):
    super(Model, self).__init__()
    f_kernal = 128 // network_factor
    self.STFT_layer = Spectrogram.MelSpectrogram(sr=44100, n_fft=n_fft, n_mels=n_mels,
                                                 hop_length=HOP_LENGTH, pad_mode='constant',
                                                 trainable_mel=True, center=True, device=device)
    self.freq_cnn1 = torch.nn.Conv2d(1, 4, (f_kernal, 3), stride=(8, 1), padding=1)
    self.freq_cnn2 = torch.nn.Conv2d(4, 8, (f_kernal, 3), stride=(8, 1), padding=1)
    shape = self.shape_inference(f_kernal)
    self.bilstm = torch.nn.LSTM(shape * 8, shape * 8, batch_first=True, bidirectional=True)
    self.pitch_classifier = torch.nn.Linear(shape * 8 * 2, 88)
def __init__(self, avg=.9998):
    super(Model, self).__init__()
    # Log-frequency spectrogram computed on the fly
    # (this is an STFT with freq_scale='log', not a mel spectrogram)
    self.spec_layer = Spectrogram.STFT(sr=44100, n_fft=n_fft, freq_bins=freq_bins,
                                       fmin=50, fmax=6000, freq_scale='log',
                                       pad_mode='constant', center=True)
    self.n_bins = freq_bins
    # Creating Layers
    self.CNN_freq_kernel_size = (128, 1)
    self.CNN_freq_kernel_stride = (2, 1)
    k_out = 128
    k2_out = 256
    self.CNN_freq = nn.Conv2d(1, k_out,
                              kernel_size=self.CNN_freq_kernel_size,
                              stride=self.CNN_freq_kernel_stride)
    self.CNN_time = nn.Conv2d(k_out, k2_out,
                              kernel_size=(1, regions), stride=(1, 1))
    # Number of frequency positions produced by the frequency convolution
    self.region_v = 1 + (self.n_bins - self.CNN_freq_kernel_size[0]) // self.CNN_freq_kernel_stride[0]
    self.linear = torch.nn.Linear(k2_out * self.region_v, m, bias=False)
    self.avg = avg
def main():
    args = parse_args()
    save_type = args.save_type
    spec_layer = Spectrogram.CQT1992v2(sr=22050, n_bins=84, hop_length=256,
                                       pad_mode='constant', device='cuda:0',
                                       verbose=False, trainable=False,
                                       output_format='Magnitude')
    transformedSet = AudioDataset('input_audio.txt', 22050 * 4,
                                  sampling_rate=22050, augment=False)
    transformedLoader = DataLoader(transformedSet, batch_size=1)
    transformedVoc = []
    with open('input_audio.txt', 'r') as f:
        # strip newlines and the .wav extension
        lines = [line.strip().replace('.wav', '') for line in f]
    print(lines)
    if len(lines) != len(transformedLoader):
        print('Differences in wavs found and whats in input_audio.txt')
        return
    for i, x_t in enumerate(transformedLoader):
        x_t = x_t.cuda()
        s_t = spec_layer(x_t).detach()
        s_t = torch.log(torch.clamp(s_t, min=1e-5))   # log-magnitude CQT
        transformedVoc.append(s_t.cuda())
    if save_type == 'torch':
        print('Saving WAVs as torch pt')
        for name, spec in zip(lines, transformedVoc):
            torch.save(spec, name + '.pt')
    if save_type == 'png':
        print('Saving WAVs as image via matplotlib')
        for name, spec in zip(lines, transformedVoc):
            save_spec_images(spec, name)
def __init__(self):
    super(Model, self).__init__()
    self.spec_layer = Spectrogram.MelSpectrogram(
        sr=44100, n_mels=156, hop_length=441, window='hann', center=True,
        pad_mode='constant', fmin=20, fmax=4200, norm=1,
        trainable_mel=False, trainable_STFT=False).to(device)
    k_out = 64
    self.CNN_freq_kernel_size = (64, 1)
    self.CNN_freq_kernel_stride = (2, 1)
    self.CNN_freq = nn.Conv2d(1, k_out,
                              kernel_size=self.CNN_freq_kernel_size,
                              stride=self.CNN_freq_kernel_stride)
    # Note: nn.LSTM only applies dropout between stacked layers, so with the
    # default num_layers=1 this dropout setting is a no-op.
    self.lstm = torch.nn.LSTM(input_size=3008, hidden_size=1024, bias=False,
                              dropout=0.2, batch_first=True)
    self.linear = torch.nn.Linear(1024, 88, bias=False)
def __init__(self):
    super(Model, self).__init__()
    # Getting Mel Spectrogram on the fly
    self.spec_layer = Spectrogram.MelSpectrogram(
        sr=44100, n_mels=192, hop_length=441, window='hann', center=True,
        pad_mode='reflect', fmin=20, fmax=4200, norm=1,
        trainable_mel=False, trainable_STFT=False).to(device)
    k_out = 8
    k2_out = 16
    self.CNN_freq_kernel_size = (64, 1)
    self.CNN_freq_kernel_stride = (4, 1)
    self.CNN_freq = nn.Conv2d(1, k_out,
                              kernel_size=self.CNN_freq_kernel_size,
                              stride=self.CNN_freq_kernel_stride,
                              padding=(2, 0))
    self.CNN_time = nn.Conv2d(k_out, k2_out,
                              kernel_size=(1, 6), stride=(1, 1), padding=(0, 2))
    self.linear = torch.nn.Linear(2992, 748, bias=False)
    self.linear2 = torch.nn.Linear(748, 88, bias=False)
def __init__(self, n_outputs, kernel_size, sr):
    super().__init__()
    self.spec = Spectrogram.STFT(
        n_fft=2048,
        freq_bins=81,
        hop_length=441,
        window="hann",
        freq_scale="log",
        center=True,
        pad_mode="reflect",
        fmin=30,
        fmax=17000,
        sr=sr,
    )
    self.conv2d_1 = nn.Conv2d(
        in_channels=1,
        out_channels=n_outputs,
        kernel_size=kernel_size,
        stride=1,
        padding=0,
        groups=1,
        bias=True,
        padding_mode="zeros",
    )
    self.maxPool2d_1 = nn.MaxPool2d(kernel_size=(1, 3), padding=0,
                                    return_indices=False, ceil_mode=False)
    self.conv2d_2 = nn.Conv2d(
        in_channels=n_outputs,
        out_channels=n_outputs,
        kernel_size=kernel_size,
        stride=1,
        padding=0,
        groups=1,
        bias=True,
        padding_mode="zeros",
    )
    self.maxPool2d_2 = nn.MaxPool2d(kernel_size=(1, 3), padding=0,
                                    return_indices=False, ceil_mode=False)
    self.conv2d_3 = nn.Conv2d(
        in_channels=n_outputs,
        out_channels=n_outputs,
        kernel_size=(1, 8),
        stride=1,
        padding=0,
        groups=1,
        bias=True,
        padding_mode="zeros",
    )
    self.net = nn.Sequential(
        self.conv2d_1,
        self.maxPool2d_1,
        self.conv2d_2,
        self.maxPool2d_2,
        self.conv2d_3,
    )
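# forward() is not part of this snippet; a minimal sketch of the intended
# data flow, assuming the STFT output keeps the older nnAudio 4-D
# (batch, freq, time, real/imag) layout seen elsewhere in this collection:
def forward(self, x):
    spec = self.spec(x)                        # (batch, 81, time, 2)
    mag = torch.sqrt(spec.pow(2).sum(-1))      # magnitude spectrogram
    return self.net(mag.unsqueeze(1))          # add channel dim for Conv2d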
def __init__(self):
    super(MelnnAudio, self).__init__()
    self.ta = MelSpectrogram(sample_rate=8000)          # torchaudio layer
    self.nna = nnASpectrogram.MelSpectrogram(sr=8000, n_fft=400,
                                             device='cpu', norm=None)  # nnAudio layer
    self.mask = torch.nn.Parameter(torch.ones([128, 81]))
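# forward() is not shown; a plausible sketch given the layers above, where
# the learned 128x81 mask (mel bins x time frames) reweights the nnAudio
# spectrogram. This is an assumption, not the original code:
def forward(self, x):
    ta_spec = self.ta(x)          # torchaudio mel spectrogram
    nna_spec = self.nna(x)        # nnAudio mel spectrogram
    return ta_spec, nna_spec * self.mask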
def __init__(self, sr=32000, n_fft=1600, hop_length=400, n_mels=240,
             fmin=100, fmax=15000, device=None, clip_length=3000):
    self.sr = sr
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.n_mels = n_mels
    self.fmin = fmin
    self.fmax = fmax
    self.clip_length = clip_length
    if device:
        self.device = device
    else:
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    self.spec_layer = Spectrogram.STFT(sr=sr, n_fft=n_fft,
                                       hop_length=hop_length).to(self.device)
    self.spec_mel_layer = Spectrogram.MelSpectrogram(sr=sr, n_fft=n_fft,
                                                     n_mels=n_mels,
                                                     hop_length=hop_length,
                                                     window='hann', center=True,
                                                     pad_mode='reflect', power=2.0,
                                                     htk=False, fmin=fmin, fmax=fmax,
                                                     norm=1, verbose=True).to(self.device)
    self.rainbow_img = torch.tensor([], dtype=torch.float32, device=self.device)
    self.model_path = None
    self.model = None
    self.names = None
    self.soundclasses = None
def __init__(self):
    super(Model, self).__init__()
    f_kernal = 128 // network_factor
    self.STFT_layer = Spectrogram.CQT1992v2(sr=44100, fmin=27.5, n_bins=n_bins,
                                            bins_per_octave=bins_per_octave,
                                            pad_mode='constant', hop_length=HOP_LENGTH,
                                            center=True, device=device)
    self.freq_cnn1 = torch.nn.Conv2d(1, 4, (f_kernal, 3), stride=(8, 1), padding=1)
    self.freq_cnn2 = torch.nn.Conv2d(4, 8, (f_kernal, 3), stride=(8, 1), padding=1)
    shape = self.shape_inference(f_kernal)
    self.bilstm = torch.nn.LSTM(shape * 8, shape * 8, batch_first=True, bidirectional=True)
    self.pitch_classifier = torch.nn.Linear(shape * 8 * 2, 88)
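# shape_inference() is referenced in the two constructors above but not
# included in this collection. A common way to implement it (an assumption,
# not the original code) is a dummy forward pass through the conv stack to
# measure how many frequency bins survive:
def shape_inference(self, f_kernal):
    with torch.no_grad():
        dummy = torch.zeros(1, 1, n_bins, 10)        # (batch, ch, freq, time)
        out = self.freq_cnn2(self.freq_cnn1(dummy))
        return out.shape[2]                          # surviving frequency bins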
def __init__(self, sample_rate, n_mels=128, hop_length=512, eps=1e-6,
             normalize_spectro=True, device=torch.device("cuda:0"), **kwargs):
    store_attr('sample_rate,n_mels,hop_length,eps')
    super().__init__(**kwargs)
    self.spectro = Spectrogram.MelSpectrogram(
        sr=sample_rate, n_mels=n_mels, hop_length=hop_length,
        verbose=False, **kwargs).to(device)
    self.relu = nn.ReLU(inplace=True)
    self.normalize_spectro = normalize_spectro
    self.eps = eps
    self.device = device
def __init__(self):
    config = dict(
        sr=16000,
        n_fft=400,
        n_mels=64,
        hop_length=160,
        window='hann',
        center=False,
        pad_mode='reflect',
        htk=True,
        fmin=125,
        fmax=7500,
    )
    self.to_spec = Spectrogram.MelSpectrogram(**config)
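# Quick shape check for the config above (dummy input; a sketch, not part of
# the original). With center=False, n_fft=400 and hop_length=160, one second
# of 16 kHz audio yields (16000 - 400) // 160 + 1 = 98 frames:
import torch
from nnAudio import Spectrogram

to_spec = Spectrogram.MelSpectrogram(sr=16000, n_fft=400, n_mels=64,
                                     hop_length=160, window='hann', center=False,
                                     pad_mode='reflect', htk=True,
                                     fmin=125, fmax=7500)
print(to_spec(torch.randn(1, 16000)).shape)   # -> torch.Size([1, 64, 98])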
def main():
    args = parse_args()
    file_name = args.input_file
    save_path = args.save_path
    spec_layer = Spectrogram.CQT1992v2(sr=22050, n_bins=84, hop_length=256,
                                       pad_mode='constant', device='cuda:0',
                                       verbose=False, trainable=False,
                                       output_format='Magnitude')
    transformedSet = AudioConversionDataset(file_name, 22050 * 4,
                                            sampling_rate=22050, augment=False)
    transformedLoader = DataLoader(transformedSet, batch_size=1)
    with open(file_name, 'r') as f:
        lines = [line.strip() for line in f]                 # remove newline character
    lines = [track.replace('.wav', '') for track in lines]   # remove .wav
    lines = [track.split("/")[-1] for track in lines]        # drop directories
    if len(lines) != len(transformedLoader):
        print('Differences in wavs found and whats in input_audio.txt')
        return
    normalisation_dict = {}   # missing in the original snippet; used below
    for i, x in tqdm(enumerate(transformedLoader), ascii=True,
                     desc='Making spectrogram representations'):
        x_t = x[0]
        fname = os.path.basename(x[1][0]).replace('.wav', '')
        x_t = x_t.cuda()
        s_t = spec_layer(x_t).detach()
        s_t = torch.log(torch.clamp(s_t, min=1e-5))
        if args.save_type == 'pt':
            torch.save(s_t.cuda(), save_path + fname + '.pt')
        else:
            save_image(s_t.cuda(), save_path + fname + '.png', normalize=True)
        min_value = torch.min(s_t).item()
        max_value = torch.max(s_t).item()
        normalisation_dict[fname] = {"min": min_value, "max": max_value}
    with open(save_path + 'normalisation_values.json', 'w') as outfile:
        json.dump(normalisation_dict, outfile, indent=4)
def __init__(
    self,
    hop_length=256,
    n_bins=84,
    sampling_rate=22050,
):
    super().__init__()
    ##############################################
    #               FFT Parameters               #
    ##############################################
    self.n_bins = n_bins
    self.hop_length = hop_length
    self.sampling_rate = sampling_rate
    self.spec_layer = Spectrogram.CQT1992v2(sr=sampling_rate,
                                            n_bins=n_bins,  # was hard-coded to 84
                                            hop_length=hop_length,
                                            output_format='Magnitude',
                                            pad_mode='constant',
                                            device='cuda:0',
                                            verbose=False,
                                            trainable=False)
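# The class stops at layer construction; the main() scripts in this
# collection apply the same layer and then log-compress the result, so a
# forward() consistent with that usage (an assumption, not the original
# code) would be:
def forward(self, audio):
    s = self.spec_layer(audio)                     # (batch, n_bins, time)
    return torch.log(torch.clamp(s, min=1e-5))     # log-magnitude CQT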
def melspectrogram(wav, sample_rate=22050, n_fft=2048, n_mels=128, hop_length=512,
                   window='hann', center=True, pad_mode='reflect', power=2.0,
                   htk=False, fmin=0.0, fmax=None, norm=1, trainable_mel=False,
                   trainable_STFT=False, verbose=False, eps=1e-6, cuda=False,
                   log=True, **kwargs):
    # Note: the layer is rebuilt on every call; cache it if this is hot code.
    s = Spectrogram.MelSpectrogram(sr=sample_rate, n_fft=n_fft, n_mels=n_mels,
                                   hop_length=hop_length, window=window,
                                   center=center, pad_mode=pad_mode, power=power,
                                   htk=htk, fmin=fmin, fmax=fmax, norm=norm,
                                   trainable_mel=trainable_mel,
                                   trainable_STFT=trainable_STFT,
                                   verbose=verbose, **kwargs)
    if cuda:
        s = s.cuda()
        wav = wav.cuda()   # input must follow the layer onto the GPU
    return torch.log(s(wav) + eps) if log else s(wav)
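# Example call to the helper above (dummy batch; shapes only). With the
# defaults, a 1-second clip at 22050 Hz, hop 512 and center=True gives
# 22050 // 512 + 1 = 44 frames:
import torch
wav = torch.randn(2, 22050)          # batch of two 1-second clips
logmel = melspectrogram(wav)         # -> torch.Size([2, 128, 44])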
def __init__(self, roll=False, return_cqt=False, output=None, drop='last'):
    super(PyTorch, self).__init__()
    self.cqt = Spectrogram.CQT2010v2(
        sr=44100,
        hop_length=64,
        n_bins=84 * 10,            # 840 bins, 10 per semitone
        bins_per_octave=12 * 10,
        norm=1,
        window='hann',
        pad_mode='constant',
        trainable=False,
    )
    self.roll = roll
    self.drop = drop
    self.return_cqt = return_cqt
    self.output = output
    self.bn0 = nn.BatchNorm1d(840)
    self.conv1 = nn.Conv1d(840, 1024, 1)
    self.bn1 = nn.BatchNorm1d(1024)
    self.conv2 = nn.Conv1d(1024, 512, 1)
    self.bn2 = nn.BatchNorm1d(512)
    self.conv3 = nn.Conv1d(512, 256, 1)
    self.bn3 = nn.BatchNorm1d(256)
    self.conv4 = nn.Conv1d(256, 48, 1)
    self.bn4 = nn.BatchNorm1d(48)
    self.LSTM = nn.LSTM(48, 512, batch_first=True, num_layers=2, dropout=0.25)
    self.conv5 = nn.Conv1d(512, 48, 1)
    self.dropout_1 = nn.Dropout(p=0.25)
    self.dropout_2 = nn.Dropout(p=0.25)
    self.dropout_3 = nn.Dropout(p=0.25)
    self.dropout_4 = nn.Dropout(p=0.25)
import nnAudio.Spectrogram as Spec
from plots import plot_cqt
from parameters import *

if USE_CQT:
    cqt_layer = Spec.CQT(sr=FS, hop_length=HOP_LENGTH, fmin=F_MIN, n_bins=N_BINS,
                         bins_per_octave=BINS_PER_OCTAVE, norm=NORM,
                         pad_mode='constant', window=WINDOW)
    cqt_layer.to(DEVICE)

def cqt(signal, numpy=True, db=True):
    # Frame times in seconds for each CQT column
    time_array = np.arange(np.ceil(signal.size / HOP_LENGTH).astype(int)) / (FS / HOP_LENGTH)
    signal_tensor = torch.tensor(signal, device=DEVICE, dtype=torch.float)
    cqt_tensor = cqt_layer(signal_tensor, normalization_type='wrap')
    if db:
        cqt_tensor = 20 * torch.log10(cqt_tensor + EPS)
    if numpy:
        cqt_array = cqt_tensor.cpu().numpy()[0, :, :]
        torch.cuda.empty_cache()
        return cqt_array, time_array
    else:
        # The original snippet is cut off here; returning the tensor is the
        # natural counterpart of the numpy branch above.
        return cqt_tensor, time_array
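# Example call, assuming the constants from parameters.py (FS, N_BINS, ...)
# are in scope; the one-second input length is illustrative only:
sig = np.random.randn(FS).astype(np.float32)    # one second of noise
cqt_db, t = cqt(sig)                            # (N_BINS, frames) in dB, frame times in s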
def __init__(self, input_repr, embedding_size, weight_file):
    super(PytorchOpenl3, self).__init__()
    self.__weights_dict = load_weights(weight_file)
    self.AUDIO_POOLING_SIZES = {
        "mel128": {512: (16, 24), 6144: (4, 8)},
        "mel256": {512: (32, 24), 6144: (8, 8)},
    }
    if input_repr == 'mel128':
        self.speclayer = Spectrogram.MelSpectrogram(sr=48000, n_fft=2048, n_mels=128,
                                                    hop_length=242, power=1.0, htk=True)
    else:
        self.speclayer = Spectrogram.MelSpectrogram(sr=48000, n_fft=2048, n_mels=256,
                                                    hop_length=242, power=1.0, htk=True)
    self.input_repr = input_repr
    self.embedding_size = embedding_size
    self.batch_normalization_1 = self.__batch_normalization(
        2, "batch_normalization_1", num_features=1, eps=0.001, momentum=0.99)
    self.conv2d_1 = self.__conv(
        2, name="conv2d_1", in_channels=1, out_channels=64,
        kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
    self.batch_normalization_2 = self.__batch_normalization(
        2, "batch_normalization_2", num_features=64, eps=0.001, momentum=0.99)
    self.conv2d_2 = self.__conv(
        2, name="conv2d_2", in_channels=64, out_channels=64,
        kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
    self.batch_normalization_3 = self.__batch_normalization(
        2, "batch_normalization_3", num_features=64, eps=0.001, momentum=0.99)
    self.conv2d_3 = self.__conv(
        2, name="conv2d_3", in_channels=64, out_channels=128,
        kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
    self.batch_normalization_4 = self.__batch_normalization(
        2, "batch_normalization_4", num_features=128, eps=0.001, momentum=0.99)
    self.conv2d_4 = self.__conv(
        2, name="conv2d_4", in_channels=128, out_channels=128,
        kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
    self.batch_normalization_5 = self.__batch_normalization(
        2, "batch_normalization_5", num_features=128, eps=0.001, momentum=0.99)
    self.conv2d_5 = self.__conv(
        2, name="conv2d_5", in_channels=128, out_channels=256,
        kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
    self.batch_normalization_6 = self.__batch_normalization(
        2, "batch_normalization_6", num_features=256, eps=0.001, momentum=0.99)
    self.conv2d_6 = self.__conv(
        2, name="conv2d_6", in_channels=256, out_channels=256,
        kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
    self.batch_normalization_7 = self.__batch_normalization(
        2, "batch_normalization_7", num_features=256, eps=0.001, momentum=0.99)
    self.conv2d_7 = self.__conv(
        2, name="conv2d_7", in_channels=256, out_channels=512,
        kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
    self.batch_normalization_8 = self.__batch_normalization(
        2, "batch_normalization_8", num_features=512, eps=0.001, momentum=0.99)
    self.audio_embedding_layer = self.__conv(
        2, name="audio_embedding_layer", in_channels=512, out_channels=512,
        kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
from nnAudio import Spectrogram
from scipy.io import wavfile
import torch

sr, song = wavfile.read('D:/210.wav')   # Loading your audio
x = song.mean(1)                        # Converting Stereo to Mono
x = torch.tensor(x).float()             # casting the array into a PyTorch Tensor

spec_layer = Spectrogram.STFT(n_fft=2048, freq_bins=None, hop_length=512,
                              window='hann', freq_scale='linear', center=True,
                              pad_mode='reflect', fmin=50, fmax=11025,
                              sr=sr)    # Initializing the model

spec = spec_layer(x)
print(spec)
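# With the older nnAudio API used throughout this collection, the STFT
# returns a (1, freq_bins, time, 2) tensor holding real and imaginary parts;
# a magnitude spectrogram can be recovered from it like so (freq_bins=None
# with n_fft=2048 gives 1025 bins):
mag = torch.sqrt(spec.pow(2).sum(-1))
print(mag.shape)   # -> (1, 1025, time_steps)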
def __init__(self, cfg, x, output_dim=1):
    super(SpecMod1, self).__init__()
    self.cfg = cfg
    ## add magnitude?
    if cfg.add_mag:
        x = add_magnitude(x)
    print(x.shape)
    n_fft = 32
    hop_length = n_fft // 4
    out_channels = 64
    kernels = [3, 3, 3, 3]
    strides = [2, 2, 2, 2]
    lins = [32]
    trainable = False
    spec_layer_1 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length,
                                    freq_scale='no', device='cpu', trainable=trainable)
    spec_layer_2 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length,
                                    freq_scale='no', device='cpu', trainable=trainable)
    self.spec_layers = nn.ModuleList([spec_layer_1, spec_layer_2])
    # STFT each half of the input channels, then concatenate along channels
    x1 = apply_stft(spec_layer_1, x[:, :x.shape[1] // 2, :])
    x2 = apply_stft(spec_layer_2, x[:, x.shape[1] // 2:, :])
    print(x1.shape)
    x = torch.cat((x1, x2), 1)
    print(x.shape)
    self.drop = torch.nn.Dropout(p=cfg.dropout)
    bs, in_channels, h, w = x.shape
    # Build the conv stack (padding 1 on all but the last conv), tracing
    # output shapes with a dummy forward pass
    conv_layers, lin_layers = [], []
    for i, (k, s) in enumerate(zip(kernels, strides)):
        pad = 1 if i < len(kernels) - 1 else 0
        conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                         kernel_size=k, stride=s, padding=pad)
        conv_layers.append(conv)
        x = conv(x)
        print(x.shape)
        in_channels, out_channels = out_channels, out_channels * 2
    self.conv_layers = nn.ModuleList(conv_layers)
    x = x.view(bs, -1)
    self.n = x.shape[1]
    print(x.shape)
    n1 = self.n
    for n2 in lins:
        lin_layer = nn.Linear(in_features=n1, out_features=n2)
        lin_layers.append(lin_layer)
        x = lin_layer(x)
        print(x.shape)
        n1 = n2
    ## final lin layer
    lin_layer = nn.Linear(in_features=n1, out_features=output_dim)
    lin_layers.append(lin_layer)
    x = lin_layer(x)
    print(x.shape)
    self.lin_layers = nn.ModuleList(lin_layers)
def __init__(self, cfg, x, output_dim=1):
    super(SpecMod2, self).__init__()
    self.cfg = cfg
    self.drop = torch.nn.Dropout(p=cfg.dropout)
    # cfg.cnn example: [(128,64), (32,8), (3,2), (3,2), (3,2), (3,2), 32]
    K = cfg.cnn
    win, out_channels = K[0]
    ## add magnitude?
    if cfg.add_mag:
        x = add_magnitude(x)
    print(x.shape)
    bs, in_channels, _ = x.shape
    n_fft = K[1][0]       ## e.g. 32
    hop_length = K[1][1]  ## e.g. n_fft//4
    # Trailing ints in K are linear-layer widths; pairs are (kernel, stride)
    m = sum(isinstance(v, int) for v in K)
    if m == 0:
        kernels, strides = zip(*K[2:])
        lins = []
    else:
        kernels, strides = zip(*K[2:-m])
        lins = K[-m:]
    pads = np.array(kernels) * 0 + 1   # padding 1 everywhere ...
    pads[-1] = 0                       # ... except the last conv
    trainable = False
    spec_layer_1 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length,
                                    freq_scale='no', device='cpu', trainable=trainable)
    spec_layer_2 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length,
                                    freq_scale='no', device='cpu', trainable=trainable)
    self.spec_layers = nn.ModuleList([spec_layer_1, spec_layer_2])
    x1 = apply_stft(spec_layer_1, x[:, :x.shape[1] // 2, :])
    x2 = apply_stft(spec_layer_2, x[:, x.shape[1] // 2:, :])
    print(x1.shape)
    x = torch.cat((x1, x2), 1)
    print(x.shape)
    conv_layers, lin_layers = [], []
    for k, s, p in zip(kernels, strides, pads):
        conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                         kernel_size=k, stride=s, padding=p)
        conv_layers.append(conv)
        x = conv(x)
        print(x.shape)
        in_channels, out_channels = out_channels, out_channels * 2
    x = x.view(bs, -1)
    self.n = x.shape[1]
    print(x.shape)
    n1 = self.n
    for n2 in lins:
        lin_layer = nn.Linear(in_features=n1, out_features=n2)
        lin_layers.append(lin_layer)
        x = lin_layer(x)
        print(x.shape)
        n1 = n2
    ## final lin layer
    lin_layer = nn.Linear(in_features=n1, out_features=output_dim)
    lin_layers.append(lin_layer)
    x = lin_layer(x)
    print(x.shape)
    self.conv_layers = nn.ModuleList(conv_layers)
    self.lin_layers = nn.ModuleList(lin_layers)
    def __getitem__(self, idx):
        audio_name = self.file_list[idx]
        sr, wav = wavfile.read(audio_name)
        return wav

if __name__ == '__main__':
    dataset = MusicNet()
    dataset = DataLoader(dataset, shuffle=False, num_workers=8)
    result = {}
    # STFT (linear frequency scale)
    n_fft_ls = [256, 512, 1024, 2048, 4096]
    for n_fft in n_fft_ls:
        layer = Spectrogram.STFT(n_fft=n_fft, hop_length=512,
                                 verbose=False, device=device)
        start = time.time()
        for i in tqdm.tqdm(dataset):
            i = i.to(device)
            layer(i)
        result[f'STFT_{n_fft}'] = time.time() - start
    # STFT (log frequency scale)
    n_fft_ls = [256, 512, 1024, 2048, 4096]
    for n_fft in n_fft_ls:
        layer = Spectrogram.STFT(n_fft=n_fft, hop_length=512,
                                 freq_scale='log', sr=44100,
                                 fmin=1, fmax=22050,
                                 verbose=False, device=device)
        # The snippet is cut off above; the remainder is reconstructed to
        # mirror the linear-scale timing loop, so the key name is assumed.
        start = time.time()
        for i in tqdm.tqdm(dataset):
            i = i.to(device)
            layer(i)
        result[f'STFT_log_{n_fft}'] = time.time() - start
device = "cpu" print("using CPU") elif args.device in ["GPU", "torchaudio", 'tensorflow']: device = f"cuda:0" print("using GPU") elif args.device == "librosa": print("using librosa") y_list = np.load(Path(__file__).parent / './y_list.npy') if args.device in ["CPU", "GPU"]: import torch import torch.nn as nn from nnAudio import Spectrogram y_torch = torch.tensor(y_list, device=device).float() spec_layer = Spectrogram.STFT(device=device) timing = [] for e in range(20): t_start = time.time() spec = spec_layer(y_torch[:1000]) spec = spec_layer(y_torch[1000:]) time_used = time.time() - t_start # print(time_used) timing.append(time_used) print("mean = ", np.mean(timing)) print("std = ", np.std(timing)) data = pd.DataFrame(timing, columns=['t_avg']) data['Type'] = f'torch_{args.device}' print('saving file') data.to_csv(
test_s_dl = DataLoader(test_s_ds, batch_size=args["batch_size"],
                       shuffle=False, num_workers=0)

# load model
model = NMSLatentDisentangledDynamic(input_dims=MELSPEC_DIM,
                                     hidden_dims=args["hidden_dims"],
                                     z_dims=args["z_dims"],
                                     n_component=NUM_EMOTIONS)
model.cuda()
optimizer = optim.Adam(model.parameters(), lr=args['lr'],
                       betas=(0.9, 0.98), eps=1e-9)
wav_to_melspec = Spectrogram.MelSpectrogram(sr=16000, n_mels=MELSPEC_DIM,
                                            hop_length=args['hop_size'])
normalizer = Normalizer(mode="imagewise")

# load writers
current_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
train_log_dir = 'logs/' + args['name'] + '_dynamic_v2/' + current_time + '/train'
eval_log_dir = 'logs/' + args['name'] + '_dynamic_v2/' + current_time + '/eval'
train_sup_writer = SummaryWriter(train_log_dir + "_sup")
eval_sup_writer = SummaryWriter(eval_log_dir + "_sup")

training()
print("using CPU") elif args.device in ["GPU", "torchaudio", 'tensorflow']: device = f"cuda:0" print("using GPU") elif args.device == "librosa": print("using librosa") y_list = np.load(Path(__file__).parent / './y_list.npy') if args.device in ["CPU", "GPU"]: import torch import torch.nn as nn from nnAudio import Spectrogram y_torch = torch.tensor(y_list, device=device).float() spec_layer = Spectrogram.MelSpectrogram(sr=44100, device=device) timing = [] for e in range(20): t_start = time.time() spec = spec_layer(y_torch[:1000]) spec = spec_layer(y_torch[1000:]) time_used = time.time() - t_start # print(time_used) timing.append(time_used) print("mean = ", np.mean(timing)) print("std = ", np.std(timing)) data = pd.DataFrame(timing, columns=['t_avg']) data['Type'] = f'torch_{args.device}' data.to_csv(Path(__file__).parent / f'./result/Mel_torch_{args.device}')
def __init__(self, cfg, x, output_dim=1):
    super(CNN_12b, self).__init__()
    self.cfg = cfg
    if cfg.gpu_preproc:
        x = preproc(x, cfg)
    if self.cfg.feats_fft:
        n_fft = 24            ## also tried 32 and 64
        hop_length = n_fft // 2 + 2
        trainable = False
        spec_layer_1 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length,
                                        freq_scale='no', device='cpu', trainable=trainable)
        spec_layer_2 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length,
                                        freq_scale='no', device='cpu', trainable=trainable)
        x1 = apply_spec(spec_layer_1, x[:, :x.shape[1] // 2, :])
        x2 = apply_spec(spec_layer_2, x[:, x.shape[1] // 2:, :])
        f = torch.cat((x1, x2), 1)
        self.spec_layers = nn.ModuleList([spec_layer_1, spec_layer_2])
        if cfg.feats_raw:
            x = torch.cat((x, f), 1)
        else:
            x = f
    self.c = c = x.shape[1] // 2
    x = x[:, :c, :]
    print(x.shape)
    K = cfg.cnn
    win, f = K[0]
    fin, fout = c, f
    # Two parallel conv stacks, one per half of the channels
    i, conv_layers_1, conv_layers_2 = 1, [], []
    while isinstance(K[i], (list, tuple)):
        k, i = K[i], i + 1
        conv1 = get_conv_layer(k, fin, fout)
        conv2 = get_conv_layer(k, fin, fout)
        conv_layers_1.append(conv1)
        conv_layers_2.append(conv2)
        if k[0] > 0:
            fin, fout = fout, fout * 2
        else:
            fin, fout = fout // 2, fout
        x = conv1(x)
        print(x.shape)
    f = fin
    self.f = f
    x = x.view(-1, f)
    print(x.shape)
    # Remaining entries of K are linear-layer widths; a negative width marks
    # the point where the two branches merge
    merged = False
    n1 = f
    lin_layers_1, lin_layers_2, lin_layers = [], [], []
    while i < len(K):
        n2, i = K[i], i + 1
        if n2 < 0 and not merged:
            x = torch.cat((x, x), 1)
            print(x.shape)
            n1 = n1 * 2
            n2 = abs(n2)
            merged = True
        if merged:
            lin = torch.nn.Linear(n1, n2)
            lin_layers.append(lin)
            x = lin(x)
        else:
            lin1 = torch.nn.Linear(n1, n2)
            lin_layers_1.append(lin1)
            lin2 = torch.nn.Linear(n1, n2)
            lin_layers_2.append(lin2)
            x = lin1(x)
        n1 = n2
        print(x.shape)
    if not merged:
        n1 = n1 * 2
        x = torch.cat((x, x), 1)
        print(x.shape)
    lin = torch.nn.Linear(n1, output_dim)
    lin_layers.append(lin)
    x = lin(x)
    print(x.shape)
    self.conv_layers_1 = nn.ModuleList(conv_layers_1)
    self.conv_layers_2 = nn.ModuleList(conv_layers_2)
    self.lin_layers_1 = nn.ModuleList(lin_layers_1)
    self.lin_layers_2 = nn.ModuleList(lin_layers_2)
    self.lin_layers = nn.ModuleList(lin_layers)
    drop_layers = []
    if not isinstance(cfg.dropout, (list, tuple)):
        drop_layers.append(nn.Dropout(p=cfg.dropout))
    else:
        for drop in cfg.dropout:
            drop_layers.append(nn.Dropout(p=drop))
    self.drop_layers = nn.ModuleList(drop_layers)
print("Using CPU") device = "cpu" aug = Augment(Chords()) config = yaml.load(open("./config/config.yaml")) sr = config['preprocess']['sample_rate'] hop_size = config['preprocess']['hop_size'] window_size = config['preprocess']['window_size'] song_hz = config['preprocess']['song_hz'] save_dir = config['preprocess']['save_dir'] cqt_layer = Spectrogram.CQT(device=device, sr=sr, hop_length=hop_size, fmin=220, fmax=None, n_bins=108, bins_per_octave=24, norm=1, window='hann', center=True, pad_mode='reflect') p = Preprocess(sr, hop_size, song_hz, window_size, save_dir, aug, cqt_layer) num_epochs = config['model'].get('num_epochs') def get_data(): datasets = { "isophonics-beetles": { "mp3": config['preprocess']['data_path'] + "/beetles_albums", "labels":
elif args.device == "GPU": device = "cuda:0" print("using GPU") elif args.device == "librosa": print("using librosa") print(Path(__file__).parent / './y_list.npy') y_list = np.load(Path(__file__).parent / './y_list.npy') if args.device in ["CPU", "GPU"]: y_torch = torch.tensor(y_list, device=device).float() spec_layer = Spectrogram.CQT1992v2(sr=44100, n_bins=84, bins_per_octave=24, fmin=55, device=device) timing = [] for e in range(20): t_start = time.time() spec = spec_layer(y_torch[:1000]) spec = spec_layer(y_torch[1000:]) time_used = time.time() - t_start # print(time_used) timing.append(time_used) print("mean = ", np.mean(timing)) print("std = ", np.std(timing)) data = pd.DataFrame(timing, columns=['t_avg'])