Example #1
    def __init__(self,
                 sr=SR,
                 n_fft=NFFT,
                 hop_length=HOP,
                 n_mels=NMEL,
                 n_bins=NBIN,
                 mode='mel'):
        self.sr = sr
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.n_bins = n_bins

        if mode == 'mel':
            self.spectrogram = Spectrogram.MelSpectrogram(
                sr=sr,
                n_fft=n_fft,
                n_mels=n_mels,
                hop_length=hop_length,
                fmin=20,
                fmax=11000)
        elif mode == 'cqt':
            self.spectrogram = Spectrogram.CQT(sr=sr,
                                               hop_length=hop_length,
                                               fmin=22.5,
                                               n_bins=n_bins,
                                               bins_per_octave=24,
                                               pad_mode='constant')
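
A minimal usage sketch for layers like the ones this wrapper builds (the constants SR, NFFT, HOP, NMEL and NBIN are project-specific and not shown above; the values below are placeholders):

import torch
from nnAudio import Spectrogram

mel_layer = Spectrogram.MelSpectrogram(sr=22050, n_fft=2048, n_mels=128,
                                       hop_length=512, fmin=20, fmax=11000)
wave = torch.randn(1, 22050)   # one second of dummy audio
mel = mel_layer(wave)          # -> (batch, n_mels, n_frames)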
Example #2
def wav_to_img(path, save_path, signal_length=None):
    audio_list = os.listdir(path)
    for idx, aud in tqdm(enumerate(audio_list), leave=False):
        song, sr = librosa.load(f'{path}{aud}')
        song = sklearn.preprocessing.minmax_scale(song, axis=0)  # scale to [0, 1]
        x = torch.tensor(song).to("cuda", dtype=torch.float)
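        # Note: the three spectrogram layers below are rebuilt for every
        # audio file; constructing them once, before the loop, would be cheaper.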
        spec0 = Spectrogram.MelSpectrogram(n_fft=1024,
                                           n_mels=128,
                                           trainable_mel=False,
                                           trainable_STFT=False,
                                           fmin=15,
                                           fmax=11000,
                                           sr=sr,
                                           pad_mode='reflect')
        spec1 = Spectrogram.STFT(n_fft=1024, hop_length=512, trainable=False)
        spec2 = Spectrogram.MFCC(sr=22050,
                                 n_mfcc=200,
                                 norm='ortho',
                                 device='cuda:0',
                                 verbose=True)

        x0 = spec0(x)
        x0 = x0.unsqueeze(1)

        x1 = spec1(x)
        x1 = x1[:, :, :, 0].unsqueeze(1)

        x2 = spec2(x)
        x2 = x2.unsqueeze(1)

        x0 = nnf.interpolate(x0,
                             size=(224, 224),
                             mode='bicubic',
                             align_corners=False)
        x1 = nnf.interpolate(x1,
                             size=(224, 224),
                             mode='bicubic',
                             align_corners=False)
        x2 = nnf.interpolate(x2,
                             size=(224, 224),
                             mode='bicubic',
                             align_corners=False)

        img = torch.cat((x0, x1, x2), dim=1)

        aud1 = aud.replace('.wav', '')
        save_image(img, f'{save_path}{aud1}.jpeg')
Example #3
 def __init__(
     self,
     n_fft=1024,
     hop_length=256,
     win_length=1024,
     n_bins=84,
     sampling_rate=22050,
 ):
     super().__init__()
     ##############################################
     # FFT Parameters
     ##############################################
     window = torch.hann_window(win_length).float()
     cqt_basis, lengths = librosa_cqt_fn(sampling_rate,
                                         n_bins=n_bins,
                                         filter_scale=0.5)
     cqt_basis = cqt_basis.astype(dtype=np.float32)
     cqt_basis = torch.from_numpy(cqt_basis).float()
     self.register_buffer("cqt_basis", cqt_basis)
     self.register_buffer("window", window)
     self.n_fft = n_fft
     self.n_bins = n_bins
     self.hop_length = hop_length
     self.win_length = win_length
     self.sampling_rate = sampling_rate
     self.spec_layer = Spectrogram.CQT1992v2(sr=sampling_rate,
                                             n_bins=84,
                                             hop_length=hop_length,
                                             output_format='Magnitude',
                                             pad_mode='constant',
                                             device='cuda:0',
                                             verbose=False,
                                             trainable=False)
Example #4
 def __init__(self):
     super(Model, self).__init__()
     f_kernal = 128 // network_factor
     self.STFT_layer = Spectrogram.MelSpectrogram(sr=44100,
                                                  n_fft=n_fft,
                                                  n_mels=n_mels,
                                                  hop_length=HOP_LENGTH,
                                                  pad_mode='constant',
                                                  trainable_mel=True,
                                                  center=True,
                                                  device=device)
     self.freq_cnn1 = torch.nn.Conv2d(1,
                                      4, (f_kernal, 3),
                                      stride=(8, 1),
                                      padding=1)
     self.freq_cnn2 = torch.nn.Conv2d(4,
                                      8, (f_kernal, 3),
                                      stride=(8, 1),
                                      padding=1)
     shape = self.shape_inference(f_kernal)
     self.bilstm = torch.nn.LSTM(shape * 8,
                                 shape * 8,
                                 batch_first=True,
                                 bidirectional=True)
     self.pitch_classifier = torch.nn.Linear(shape * 8 * 2, 88)
Example #5
    def __init__(self, avg=.9998):
        super(Model, self).__init__()
        # Computing a log-frequency STFT on the fly
        self.spec_layer = Spectrogram.STFT(sr=44100,
                                           n_fft=n_fft,
                                           freq_bins=freq_bins,
                                           fmin=50,
                                           fmax=6000,
                                           freq_scale='log',
                                           pad_mode='constant',
                                           center=True)
        self.n_bins = freq_bins
        # Creating Layers
        self.CNN_freq_kernel_size = (128, 1)
        self.CNN_freq_kernel_stride = (2, 1)
        k_out = 128
        k2_out = 256

        self.CNN_freq = nn.Conv2d(1,
                                  k_out,
                                  kernel_size=self.CNN_freq_kernel_size,
                                  stride=self.CNN_freq_kernel_stride)
        self.CNN_time = nn.Conv2d(k_out,
                                  k2_out,
                                  kernel_size=(1, regions),
                                  stride=(1, 1))

        self.region_v = 1 + (self.n_bins - self.CNN_freq_kernel_size[0]
                             ) // self.CNN_freq_kernel_stride[0]
        self.linear = torch.nn.Linear(k2_out * self.region_v, m, bias=False)

        self.avg = avg
Example #6
def main():
    args = parse_args()
    save_type = args.save_type
    spec_layer = Spectrogram.CQT1992v2(sr=22050,
                                       n_bins=84,
                                       hop_length=256,
                                       pad_mode='constant',
                                       device='cuda:0',
                                       verbose=False,
                                       trainable=False,
                                       output_format='Magnitude')
    transformedSet = AudioDataset('input_audio.txt',
                                  22050 * 4,
                                  sampling_rate=22050,
                                  augment=False)
    transformedLoader = DataLoader(transformedSet, batch_size=1)
    transformedVoc = []
    f = open('input_audio.txt', 'r')
    lines = f.readlines()
    lines = list(map(lambda s: s.strip(), lines)) #remove newline character
    lines = [track.replace('.wav', '') for track in lines] #remove .wav
    print(lines)
    if len(lines) != len(transformedLoader):
        print('Mismatch between the WAV files found and what is listed in input_audio.txt')
        return

    for i, x_t in enumerate(transformedLoader):
        x_t = x_t.cuda()
        s_t = spec_layer(x_t).detach()
        s_t = torch.log(torch.clamp(s_t, min=1e-5))
        transformedVoc.append(s_t.cuda())
    
    if save_type == 'torch':
        print('Saving WAVs as torch .pt files')
        for x in range(len(transformedVoc)):
            torch.save(transformedVoc[x], lines[x] + '.pt')
    elif save_type == 'png':
        print('Saving WAVs as images via matplotlib')
        for x in range(len(transformedVoc)):
            save_spec_images(transformedVoc[x], lines[x])
Example #7
 def __init__(self):
     super(Model, self).__init__()
     self.spec_layer = Spectrogram.MelSpectrogram(
         sr=44100,
         n_mels=156,
         hop_length=441,
         window='hann',
         center=True,
         pad_mode='constant',
         fmin=20,
         fmax=4200,
         norm=1,
         trainable_mel=False,
         trainable_STFT=False).to(device)
     k_out = 64
     self.CNN_freq_kernel_size = (64, 1)
     self.CNN_freq_kernel_stride = (2, 1)
     self.CNN_freq = nn.Conv2d(1,
                               k_out,
                               kernel_size=self.CNN_freq_kernel_size,
                               stride=self.CNN_freq_kernel_stride)
     self.lstm = torch.nn.LSTM(input_size=3008,
                               hidden_size=1024,
                               bias=False,
                               dropout=0.2,
                               batch_first=True)
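     # NB: with the default num_layers=1, PyTorch ignores this dropout
     # argument and warns about it at construction time.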
     self.linear = torch.nn.Linear(1024, 88, bias=False)
Example #8
 def __init__(self):
     super(Model, self).__init__()
     # Getting Mel Spectrogram on the fly
     self.spec_layer = Spectrogram.MelSpectrogram(
         sr=44100,
         n_mels=192,
         hop_length=441,
         window='hann',
         center=True,
         pad_mode='reflect',
         fmin=20,
         fmax=4200,
         norm=1,
         trainable_mel=False,
         trainable_STFT=False).to(device)
     k_out = 8
     k2_out = 16
     self.CNN_freq_kernel_size = (64, 1)
     self.CNN_freq_kernel_stride = (4, 1)
     self.CNN_freq = nn.Conv2d(1,
                               k_out,
                               kernel_size=self.CNN_freq_kernel_size,
                               stride=self.CNN_freq_kernel_stride,
                               padding=(2, 0))
     self.CNN_time = nn.Conv2d(k_out,
                               k2_out,
                               kernel_size=(1, 6),
                               stride=(1, 1),
                               padding=(0, 2))
     self.linear = torch.nn.Linear(2992, 748, bias=False)
     self.linear2 = torch.nn.Linear(748, 88, bias=False)
Example #9
 def __init__(self, n_outputs, kernel_size, sr):
     super().__init__()
     self.spec = Spectrogram.STFT(
         n_fft=2048,
         freq_bins=81,
         hop_length=441,
         window="hann",
         freq_scale="log",
         center=True,
         pad_mode="reflect",
         fmin=30,
         fmax=17000,
         sr=sr,
     )
     self.conv2d_1 = nn.Conv2d(
         in_channels=1,
         out_channels=n_outputs,
         kernel_size=kernel_size,
         stride=1,
         padding=0,
         groups=1,
         bias=True,
         padding_mode="zeros",
     )
     self.maxPool2d_1 = nn.MaxPool2d(kernel_size=(1, 3),
                                     padding=0,
                                     return_indices=False,
                                     ceil_mode=False)
     self.conv2d_2 = nn.Conv2d(
         in_channels=n_outputs,
         out_channels=n_outputs,
         kernel_size=kernel_size,
         stride=1,
         padding=0,
         groups=1,
         bias=True,
         padding_mode="zeros",
     )
     self.maxPool2d_2 = nn.MaxPool2d(kernel_size=(1, 3),
                                     padding=0,
                                     return_indices=False,
                                     ceil_mode=False)
     self.conv2d_3 = nn.Conv2d(
         in_channels=n_outputs,
         out_channels=n_outputs,
         kernel_size=(1, 8),
         stride=1,
         padding=0,
         groups=1,
         bias=True,
         padding_mode="zeros",
     )
     self.net = nn.Sequential(
         self.conv2d_1,
         self.maxPool2d_1,
         self.conv2d_2,
         self.maxPool2d_2,
         self.conv2d_3,
     )
Example #10
 def __init__(self):
     super(MelnnAudio, self).__init__()
     self.ta = MelSpectrogram(sample_rate=8000)
     self.nna = nnASpectrogram.MelSpectrogram(sr=8000,
                                              n_fft=400,
                                              device='cpu',
                                              norm=None)
     self.mask = torch.nn.Parameter(torch.ones([128, 81]))
Example #11
 def __init__(self,
              sr=32000,
              n_fft=1600,
              hop_length=400,
              n_mels=240,
              fmin=100,
              fmax=15000,
              device=None,
              clip_length=3000):
     self.sr = sr
     self.n_fft = n_fft
     self.hop_length = hop_length
     self.n_mels = n_mels
     self.fmin = fmin
     self.fmax = fmax
     self.clip_length = clip_length
     if device:
         self.device = device
     else:
         self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
     self.spec_layer = Spectrogram.STFT(sr=sr,
                                        n_fft=n_fft,
                                        hop_length=hop_length).to(
                                            self.device)
     self.spec_mel_layer = Spectrogram.MelSpectrogram(sr=sr,
                                                      n_fft=n_fft,
                                                      n_mels=n_mels,
                                                      hop_length=hop_length,
                                                      window='hann',
                                                      center=True,
                                                      pad_mode='reflect',
                                                      power=2.0,
                                                      htk=False,
                                                      fmin=fmin,
                                                      fmax=fmax,
                                                      norm=1,
                                                      verbose=True).to(
                                                          self.device)
     self.rainbow_img = torch.tensor([],
                                     dtype=torch.float32,
                                     device=self.device)
     self.model_path = None
     self.model = None
     self.names = None
     self.soundclasses = None
Example #12
 def __init__(self):
     super(Model, self).__init__()
     f_kernal = 128//network_factor
     self.STFT_layer = Spectrogram.CQT1992v2(sr=44100, fmin=27.5, n_bins=n_bins, bins_per_octave=bins_per_octave, pad_mode='constant', hop_length=HOP_LENGTH, center=True, device=device)
     self.freq_cnn1 = torch.nn.Conv2d(1,4, (f_kernal,3), stride=(8,1), padding=1)
     self.freq_cnn2 = torch.nn.Conv2d(4,8, (f_kernal,3), stride=(8,1), padding=1)
     shape = self.shape_inference(f_kernal)
     self.bilstm = torch.nn.LSTM(shape*8, shape*8, batch_first=True, bidirectional=True)
     self.pitch_classifier = torch.nn.Linear(shape*8*2, 88)
Example #13
 def __init__(self, sample_rate, n_mels=128, hop_length=512, eps=1e-6,
              normalize_spectro=True, device=torch.device("cuda:0"), **kwargs):
     store_attr('sample_rate,n_mels,hop_length,eps')
     super().__init__(**kwargs)
     self.spectro = Spectrogram.MelSpectrogram(
         sr=sample_rate, n_mels=n_mels, hop_length=hop_length,
         verbose=False, **kwargs).to(device)
     self.relu = nn.ReLU(inplace=True)
     self.normalize_spectro = normalize_spectro
     self.eps = eps
     self.device = device
Example #14
 def __init__(self):
     config = dict(
         sr=16000,
         n_fft=400,
         n_mels=64,
         hop_length=160,
         window='hann',
         center=False,
         pad_mode='reflect',
         htk=True,
         fmin=125,
         fmax=7500,
     )
     self.to_spec = Spectrogram.MelSpectrogram(**config)
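
For reference, a short sketch of applying a layer built this way (dummy waveform; the config dict is expanded straight into nnAudio's MelSpectrogram constructor):

import torch
from nnAudio import Spectrogram

config = dict(sr=16000, n_fft=400, n_mels=64, hop_length=160, window='hann',
              center=False, pad_mode='reflect', htk=True, fmin=125, fmax=7500)
to_spec = Spectrogram.MelSpectrogram(**config)
spec = to_spec(torch.randn(1, 16000))   # -> (batch, 64, n_frames)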
Example #15
def main():
    args = parse_args()
    file_name = args.input_file
    save_path = args.save_path
    spec_layer = Spectrogram.CQT1992v2(sr=22050,
                                       n_bins=84,
                                       hop_length=256,
                                       pad_mode='constant',
                                       device='cuda:0',
                                       verbose=False,
                                       trainable=False,
                                       output_format='Magnitude')
    transformedSet = AudioConversionDataset(file_name,
                                            22050 * 4,
                                            sampling_rate=22050,
                                            augment=False)
    transformedLoader = DataLoader(transformedSet, batch_size=1)
    normalisation_dict = {}
    f = open(file_name, 'r')
    lines = f.readlines()
    lines = list(map(lambda s: s.strip(), lines))  #remove newline character
    lines = [track.replace('.wav', '') for track in lines]  #remove .wav
    lines = [track.split("/")[-1] for track in lines]
    if len(lines) != len(transformedLoader):
        print('Mismatch between the WAV files found and what is listed in the input file')
        return

    for i, x in tqdm(enumerate(transformedLoader),
                     ascii=True,
                     desc='Making spectrogram representations'):
        x_t = x[0]
        fname = os.path.basename(x[1][0]).replace('.wav', '')
        x_t = x_t.cuda()
        s_t = spec_layer(x_t).detach()
        s_t = torch.log(torch.clamp(s_t, min=1e-5))
        if args.save_type == 'pt':
            torch.save(s_t.cuda(), save_path + fname + '.pt')
        else:
            save_image(s_t.cuda(), save_path + fname + '.png', normalize=True)
            min_value = torch.min(s_t.cuda()).item()
            max_value = torch.max(s_t.cuda()).item()
            normalisation_dict[fname] = {"min": min_value, "max": max_value}
    with open(save_path + 'normalisation_values.json', 'w') as outfile:
        json.dump(normalisation_dict, outfile, indent=4)
Example #16
 def __init__(
     self,
     hop_length=256,
     n_bins=84,
     sampling_rate=22050,
 ):
     super().__init__()
     ##############################################
     # FFT Parameters
     ##############################################
     self.n_bins = n_bins
     self.hop_length = hop_length
     self.sampling_rate = sampling_rate
     self.spec_layer = Spectrogram.CQT1992v2(sr=sampling_rate,
                                             n_bins=84,
                                             hop_length=hop_length,
                                             output_format='Magnitude',
                                             pad_mode='constant',
                                             device='cuda:0',
                                             verbose=False,
                                             trainable=False)
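
A minimal sketch of running audio through a layer configured like this (dummy input; the device argument is omitted because recent nnAudio versions place the layer with .to() instead):

import torch
from nnAudio import Spectrogram

layer = Spectrogram.CQT1992v2(sr=22050, n_bins=84, hop_length=256,
                              output_format='Magnitude', pad_mode='constant',
                              verbose=False, trainable=False)
cqt_mag = layer(torch.randn(1, 22050 * 4))   # -> (batch, 84, n_frames)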
Example #17
def melspectrogram(wav,
                   sample_rate=22050,
                   n_fft=2048,
                   n_mels=128,
                   hop_length=512,
                   window='hann',
                   center=True,
                   pad_mode='reflect',
                   power=2.0,
                   htk=False,
                   fmin=0.0,
                   fmax=None,
                   norm=1,
                   trainable_mel=False,
                   trainable_STFT=False,
                   verbose=False,
                   eps=1e-6,
                   cuda=False,
                   log=True,
                   **kwargs):
    s = Spectrogram.MelSpectrogram(sr=sample_rate,
                                   n_fft=n_fft,
                                   n_mels=n_mels,
                                   hop_length=hop_length,
                                   window=window,
                                   center=center,
                                   pad_mode=pad_mode,
                                   power=power,
                                   htk=htk,
                                   fmin=fmin,
                                   fmax=fmax,
                                   norm=norm,
                                   trainable_mel=trainable_mel,
                                   trainable_STFT=trainable_STFT,
                                   verbose=verbose,
                                   **kwargs)
    if cuda: s = s.cuda()
    return torch.log(s(wav) + eps) if log else s(wav)
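
A hedged usage example for the helper above (dummy input, CPU, default parameters; with log=True it returns the log-compressed mel spectrogram):

import torch

wav = torch.randn(1, 22050)                # one second of dummy audio
log_mel = melspectrogram(wav, cuda=False)  # -> (batch, n_mels, n_frames)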
Example #18
    def __init__(self, roll=False, return_cqt=False, output=None, drop='last'):
        super(PyTorch, self).__init__()

        self.cqt = Spectrogram.CQT2010v2(
            sr=44100,
            hop_length=64,
            n_bins=84*10,
            bins_per_octave=12*10,
            norm=1,
            window='hann',
            pad_mode='constant',
            trainable=False,
        )
        
        self.roll = roll
        self.drop = drop
        self.return_cqt = return_cqt
        self.output = output

        self.bn0 = nn.BatchNorm1d(840)
        self.conv1 = nn.Conv1d(840, 1024, 1)
        self.bn1 = nn.BatchNorm1d(1024)
        self.conv2 = nn.Conv1d(1024, 512, 1)
        self.bn2 = nn.BatchNorm1d(512)
        self.conv3 = nn.Conv1d(512, 256, 1)
        self.bn3 = nn.BatchNorm1d(256)
        self.conv4 = nn.Conv1d(256, 48, 1)
        self.bn4 = nn.BatchNorm1d(48)  
        
        self.LSTM = nn.LSTM(48, 512, batch_first=True, num_layers=2, dropout=0.25)        
        self.conv5 = nn.Conv1d(512, 48, 1)
        
        self.dropout_1 = nn.Dropout(p=0.25)
        self.dropout_2 = nn.Dropout(p=0.25)
        self.dropout_3 = nn.Dropout(p=0.25)
        self.dropout_4 = nn.Dropout(p=0.25)
Example #19
import nnAudio.Spectrogram as Spec
from plots import plot_cqt
from parameters import *

if USE_CQT:
    cqt_layer = Spec.CQT(sr=FS,
                         hop_length=HOP_LENGTH,
                         fmin=F_MIN,
                         n_bins=N_BINS,
                         bins_per_octave=BINS_PER_OCTAVE,
                         norm=NORM,
                         pad_mode='constant',
                         window=WINDOW)
    cqt_layer.to(DEVICE)


def cqt(signal, numpy=True, db=True):
    time_array = np.arange(np.ceil(
        signal.size / HOP_LENGTH).astype(int)) / (FS / HOP_LENGTH)

    signal_tensor = torch.tensor(signal, device=DEVICE, dtype=torch.float)
    cqt_tensor = cqt_layer(signal_tensor, normalization_type='wrap')

    if db:
        cqt_tensor = 20 * torch.log10(cqt_tensor + EPS)

    if numpy:
        cqt_array = cqt_tensor.cpu().numpy()[0, :, :]
        torch.cuda.empty_cache()
        return cqt_array, time_array
    else:
        return cqt_tensor, time_array
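
A hedged usage sketch for the function above (FS, HOP_LENGTH and the other constants come from the project's parameters module and are assumed, not defined here):

import numpy as np

signal = np.random.randn(FS)   # one second of dummy audio at FS Hz
cqt_array, time_array = cqt(signal, numpy=True, db=True)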
Example #20
    def __init__(self, input_repr, embedding_size, weight_file):
        super(PytorchOpenl3, self).__init__()
        self.__weights_dict = load_weights(weight_file)
        self.AUDIO_POOLING_SIZES = {
            "mel128": {
                512: (16, 24),
                6144: (4, 8)
            },
            "mel256": {
                512: (32, 24),
                6144: (8, 8)
            },
        }
        if input_repr == 'mel128':
            self.speclayer = Spectrogram.MelSpectrogram(sr=48000,
                                                        n_fft=2048,
                                                        n_mels=128,
                                                        hop_length=242,
                                                        power=1.0,
                                                        htk=True)
        else:
            self.speclayer = Spectrogram.MelSpectrogram(sr=48000,
                                                        n_fft=2048,
                                                        n_mels=256,
                                                        hop_length=242,
                                                        power=1.0,
                                                        htk=True)

        self.input_repr = input_repr
        self.embedding_size = embedding_size
        self.batch_normalization_1 = self.__batch_normalization(
            2,
            "batch_normalization_1",
            num_features=1,
            eps=0.001,
            momentum=0.99)
        self.conv2d_1 = self.__conv(
            2,
            name="conv2d_1",
            in_channels=1,
            out_channels=64,
            kernel_size=(3, 3),
            stride=(1, 1),
            groups=1,
            bias=True,
        )
        self.batch_normalization_2 = self.__batch_normalization(
            2,
            "batch_normalization_2",
            num_features=64,
            eps=0.001,
            momentum=0.99)
        self.conv2d_2 = self.__conv(
            2,
            name="conv2d_2",
            in_channels=64,
            out_channels=64,
            kernel_size=(3, 3),
            stride=(1, 1),
            groups=1,
            bias=True,
        )
        self.batch_normalization_3 = self.__batch_normalization(
            2,
            "batch_normalization_3",
            num_features=64,
            eps=0.001,
            momentum=0.99)
        self.conv2d_3 = self.__conv(
            2,
            name="conv2d_3",
            in_channels=64,
            out_channels=128,
            kernel_size=(3, 3),
            stride=(1, 1),
            groups=1,
            bias=True,
        )
        self.batch_normalization_4 = self.__batch_normalization(
            2,
            "batch_normalization_4",
            num_features=128,
            eps=0.001,
            momentum=0.99)
        self.conv2d_4 = self.__conv(
            2,
            name="conv2d_4",
            in_channels=128,
            out_channels=128,
            kernel_size=(3, 3),
            stride=(1, 1),
            groups=1,
            bias=True,
        )
        self.batch_normalization_5 = self.__batch_normalization(
            2,
            "batch_normalization_5",
            num_features=128,
            eps=0.001,
            momentum=0.99)
        self.conv2d_5 = self.__conv(
            2,
            name="conv2d_5",
            in_channels=128,
            out_channels=256,
            kernel_size=(3, 3),
            stride=(1, 1),
            groups=1,
            bias=True,
        )
        self.batch_normalization_6 = self.__batch_normalization(
            2,
            "batch_normalization_6",
            num_features=256,
            eps=0.001,
            momentum=0.99)
        self.conv2d_6 = self.__conv(
            2,
            name="conv2d_6",
            in_channels=256,
            out_channels=256,
            kernel_size=(3, 3),
            stride=(1, 1),
            groups=1,
            bias=True,
        )
        self.batch_normalization_7 = self.__batch_normalization(
            2,
            "batch_normalization_7",
            num_features=256,
            eps=0.001,
            momentum=0.99)
        self.conv2d_7 = self.__conv(
            2,
            name="conv2d_7",
            in_channels=256,
            out_channels=512,
            kernel_size=(3, 3),
            stride=(1, 1),
            groups=1,
            bias=True,
        )
        self.batch_normalization_8 = self.__batch_normalization(
            2,
            "batch_normalization_8",
            num_features=512,
            eps=0.001,
            momentum=0.99)
        self.audio_embedding_layer = self.__conv(
            2,
            name="audio_embedding_layer",
            in_channels=512,
            out_channels=512,
            kernel_size=(3, 3),
            stride=(1, 1),
            groups=1,
            bias=True,
        )
Example #21
0
from nnAudio import Spectrogram
from scipy.io import wavfile
import torch
sr, song = wavfile.read('D:/210.wav')  # Loading your audio
x = song.mean(1)  # Converting stereo to mono
x = torch.tensor(x).float()  # casting the array into a PyTorch Tensor

spec_layer = Spectrogram.STFT(n_fft=2048,
                              freq_bins=None,
                              hop_length=512,
                              window='hann',
                              freq_scale='linear',
                              center=True,
                              pad_mode='reflect',
                              fmin=50,
                              fmax=11025,
                              sr=sr)  # Initializing the model

spec = spec_layer(x)
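# The STFT layer defaults to output_format='Complex', so `spec` has shape
# (batch, freq_bins, n_frames, 2), with real and imaginary parts stacked last.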
print(spec)
Example #22
    def __init__(self, cfg, x, output_dim=1):
        super(SpecMod1, self).__init__()
        self.cfg = cfg
        
        ## add magnitude?
        if cfg.add_mag: x = add_magnitude(x)
        print(x.shape)
        
        n_fft = 32 ## 
        hop_length = n_fft//4
        out_channels = 64
        kernels = [3,3,3,3]
        strides = [2,2,2,2]
        lins = [32]
        
        trainable = False
        spec_layer_1 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length, freq_scale='no', device='cpu', trainable=trainable)
        spec_layer_2 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length, freq_scale='no', device='cpu', trainable=trainable)
        self.spec_layers = nn.ModuleList([spec_layer_1, spec_layer_2])
        x1 = apply_stft(spec_layer_1, x[:, :x.shape[1]//2, :])
        x2 = apply_stft(spec_layer_2, x[:, x.shape[1]//2:, :])
        print(x1.shape)
        x = torch.cat((x1,x2),1)
        print(x.shape)
        
        self.drop = torch.nn.Dropout(p=cfg.dropout)
#         self.drop = torch.nn.Dropout(p=0.2)
        
        bs, in_channels, h, w = x.shape
        i = 0
        conv_layers, lin_layers = [],[]
        
        conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernels[i], stride=strides[i], padding=1)
        conv_layers.append(conv)
        x = conv(x)
        print(x.shape)
        i, in_channels, out_channels = i+1, out_channels, out_channels*2
        
        conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernels[i], stride=strides[i], padding=1)
        conv_layers.append(conv)
        x = conv(x)
        print(x.shape)
        i, in_channels, out_channels = i+1, out_channels, out_channels*2
        
        conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernels[i], stride=strides[i], padding=1)
        conv_layers.append(conv)
        x = conv(x)
        print(x.shape)
        i, in_channels, out_channels = i+1, out_channels, out_channels*2
        
        conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernels[i], stride=strides[i], padding=0)
        conv_layers.append(conv)
        x = conv(x)
        print(x.shape)
        
        self.conv_layers = nn.ModuleList(conv_layers)
        
        x = x.view(bs, -1)
        self.n = x.shape[1]
        print(x.shape)
        
        n1 = self.n
        
        for n2 in lins:
            lin_layer = nn.Linear(in_features=n1, out_features=n2)
            lin_layers.append(lin_layer)
            x = lin_layer(x)
            print(x.shape)
            n1 = n2
        
        ## final lin layer
        lin_layer = nn.Linear(in_features=n1, out_features=output_dim)
        lin_layers.append(lin_layer)
        x = lin_layer(x)
        print(x.shape)
        
        self.lin_layers = nn.ModuleList(lin_layers)
Example #23
    def __init__(self, cfg, x, output_dim=1):
        super(SpecMod2, self).__init__()
        self.cfg = cfg
        self.drop = torch.nn.Dropout(p=cfg.dropout)
        
        # cnn = [(128,64), (32,8), (3,2), (3,2), (3,2), (3,2), 32]
        K = cfg.cnn
        win, out_channels = K[0]
        
        ## add magnitude?
        if cfg.add_mag: x = add_magnitude(x)
        print(x.shape)
        
        bs, in_channels, _ = x.shape
        
        n_fft = K[1][0]         ## 32
        hop_length = K[1][1]    ## n_fft//4
        m = sum([isinstance(x, int) for x in K])
        if m==0:
            kernels, strides = zip(*K[2:])
            lins = []
        else:
            kernels, strides = zip(*K[2:-m])
            lins = K[-m:]
        pads = np.array(kernels)*0 +1
#         pads = np.zeros(len(kernels)) +1
        pads[-1]=0
        
        trainable = False
        spec_layer_1 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length, freq_scale='no', device='cpu', trainable=trainable)
        spec_layer_2 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length, freq_scale='no', device='cpu', trainable=trainable)
        self.spec_layers = nn.ModuleList([spec_layer_1, spec_layer_2])
        x1 = apply_stft(spec_layer_1, x[:, :x.shape[1]//2, :])
        x2 = apply_stft(spec_layer_2, x[:, x.shape[1]//2:, :])
        print(x1.shape)
        x = torch.cat((x1,x2),1)
        print(x.shape)
        
        i = 0
        conv_layers, lin_layers = [],[]
        
        for k,s,p in zip(kernels, strides, pads):
#             p = (0,0)
            conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=k, stride=s, padding=p)
            conv_layers.append(conv)
            x = conv(x)
            print(x.shape)
            in_channels, out_channels = out_channels, out_channels*2
            
        x = x.view(bs, -1)
        self.n = x.shape[1]
        print(x.shape)
        
        n1 = self.n
        
        for n2 in lins:
            lin_layer = nn.Linear(in_features=n1, out_features=n2)
            lin_layers.append(lin_layer)
            x = lin_layer(x)
            print(x.shape)
            n1 = n2
        
        ## final lin layer
        lin_layer = nn.Linear(in_features=n1, out_features=output_dim)
        lin_layers.append(lin_layer)
        x = lin_layer(x)
        print(x.shape)
        
        self.conv_layers = nn.ModuleList(conv_layers)
        self.lin_layers = nn.ModuleList(lin_layers)
Example #24
        audio_name = self.file_list[idx]
        sr, wav = wavfile.read(audio_name)

        return wav


if __name__ == '__main__':
    dataset = MusicNet()
    dataset = DataLoader(dataset, shuffle=False, num_workers=8)
    result = {}
    # STFT

    n_fft_ls = [256, 512, 1024, 2048, 4096]
    for n_fft in n_fft_ls:
        layer = Spectrogram.STFT(n_fft=n_fft,
                                 hop_length=512,
                                 verbose=False,
                                 device=device)
        start = time.time()
        for i in tqdm.tqdm(dataset):
            i = i.to(device)
            layer(i)
        result[f'STFT_{n_fft}'] = time.time() - start
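        # NB: CUDA kernels launch asynchronously; calling
        # torch.cuda.synchronize() before reading time.time() gives more
        # reliable GPU timings.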

    n_fft_ls = [256, 512, 1024, 2048, 4096]
    for n_fft in n_fft_ls:
        layer = Spectrogram.STFT(n_fft=n_fft,
                                 hop_length=512,
                                 freq_scale='log',
                                 sr=44100,
                                 fmin=1,
                                 fmax=22050,
                                 verbose=False,
                                 device=device)
Example #25
        device = "cpu"
        print("using CPU")
    elif args.device in ["GPU", "torchaudio", 'tensorflow']:
        device = f"cuda:0"
        print("using GPU")
    elif args.device == "librosa":
        print("using librosa")

    y_list = np.load(Path(__file__).parent / './y_list.npy')

    if args.device in ["CPU", "GPU"]:
        import torch
        import torch.nn as nn
        from nnAudio import Spectrogram
        y_torch = torch.tensor(y_list, device=device).float()
        spec_layer = Spectrogram.STFT(device=device)
        timing = []
        for e in range(20):
            t_start = time.time()
            spec = spec_layer(y_torch[:1000])
            spec = spec_layer(y_torch[1000:])
            time_used = time.time() - t_start
            #     print(time_used)
            timing.append(time_used)
        print("mean = ", np.mean(timing))
        print("std = ", np.std(timing))

        data = pd.DataFrame(timing, columns=['t_avg'])
        data['Type'] = f'torch_{args.device}'
        print('saving file')
        data.to_csv(
            Path(__file__).parent / f'./result/STFT_torch_{args.device}')
Example #26
    test_s_dl = DataLoader(test_s_ds,
                           batch_size=args["batch_size"],
                           shuffle=False,
                           num_workers=0)

    # load model
    model = NMSLatentDisentangledDynamic(input_dims=MELSPEC_DIM,
                                         hidden_dims=args["hidden_dims"],
                                         z_dims=args["z_dims"],
                                         n_component=NUM_EMOTIONS)
    model.cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=args['lr'],
                           betas=(0.9, 0.98),
                           eps=1e-9)

    wav_to_melspec = Spectrogram.MelSpectrogram(sr=16000,
                                                n_mels=MELSPEC_DIM,
                                                hop_length=args['hop_size'])
    normalizer = Normalizer(mode="imagewise")

    # load writers
    current_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    train_log_dir = 'logs/' + args[
        'name'] + '_dynamic_v2/' + current_time + '/train'
    eval_log_dir = 'logs/' + args[
        'name'] + '_dynamic_v2/' + current_time + '/eval'
    train_sup_writer = SummaryWriter(train_log_dir + "_sup")
    eval_sup_writer = SummaryWriter(eval_log_dir + "_sup")

    training()
Example #27
    print("using CPU")
elif args.device in ["GPU", "torchaudio", 'tensorflow']:
    device = f"cuda:0"
    print("using GPU")
elif args.device == "librosa":
    print("using librosa")

y_list = np.load(Path(__file__).parent / './y_list.npy')

if args.device in ["CPU", "GPU"]:
    import torch
    import torch.nn as nn
    from nnAudio import Spectrogram
    y_torch = torch.tensor(y_list, device=device).float()

    spec_layer = Spectrogram.MelSpectrogram(sr=44100, device=device)
    timing = []
    for e in range(20):
        t_start = time.time()
        spec = spec_layer(y_torch[:1000])
        spec = spec_layer(y_torch[1000:])
        time_used = time.time() - t_start
        #     print(time_used)
        timing.append(time_used)

    print("mean = ", np.mean(timing))
    print("std = ", np.std(timing))

    data = pd.DataFrame(timing, columns=['t_avg'])
    data['Type'] = f'torch_{args.device}'
    data.to_csv(Path(__file__).parent / f'./result/Mel_torch_{args.device}')
Example #28
    def __init__(self, cfg, x, output_dim=1):
        super(CNN_12b, self).__init__()
        self.cfg = cfg
        
        if cfg.gpu_preproc:
            ####################
            x = preproc(x, cfg)
            ####################
            if self.cfg.feats_fft:
                n_fft = 24 ## 24  32  64
                hop_length = n_fft//2 + 2
                trainable = False
    
                spec_layer_1 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length, freq_scale='no', device='cpu', trainable=trainable)
                spec_layer_2 = Spectrogram.STFT(n_fft=n_fft, hop_length=hop_length, freq_scale='no', device='cpu', trainable=trainable)
                x1 = apply_spec(spec_layer_1, x[:, :x.shape[1]//2, :])
                x2 = apply_spec(spec_layer_2, x[:, x.shape[1]//2:, :])
                f = torch.cat((x1,x2),1)
                self.spec_layers = nn.ModuleList([spec_layer_1, spec_layer_2])
        
                if cfg.feats_raw:
                    x = torch.cat((x, f), 1)
                else:
                    x = f

        self.c = c = x.shape[1]//2
        x = x[:, :c, :]
            
        print(x.shape)
        
        K = cfg.cnn
        win,f = K[0]
        fin,fout = c,f
        
        i, conv_layers_1, conv_layers_2 = 1,[],[]
        while isinstance(K[i], (list, tuple)):
            k,i = K[i],i+1
            conv1 = get_conv_layer(k, fin, fout)
            conv2 = get_conv_layer(k, fin, fout)
            conv_layers_1.append(conv1)
            conv_layers_2.append(conv2)
            
            if k[0]>0:
                fin,fout = fout,fout*2
            else:
                fin,fout = fout//2,fout
            
            x = conv1(x)
            print(x.shape)
            #print('fin={},fout={}'.format(fin, fout))
        
        f = fin
        self.f = f
        x = x.view(-1, f)
        print(x.shape)
        
        merged = False
        n1 = f
        lin_layers_1, lin_layers_2, lin_layers = [],[],[]
        while i<len(K):
            n2,i = K[i],i+1
            
            if n2<0 and not merged:
                x = torch.cat((x,x),1)
                print(x.shape)
                n1 = n1*2
                n2 = abs(n2)
                merged = True
                
            if merged:
                lin = torch.nn.Linear(n1, n2)
                lin_layers.append(lin)
                x = lin(x)
            else:
                lin1 = torch.nn.Linear(n1, n2)
                lin_layers_1.append(lin1)
                lin2 = torch.nn.Linear(n1, n2)
                lin_layers_2.append(lin2)
                x = lin1(x)
            n1 = n2
            print(x.shape)
        
        if not merged:
            n1 = n1*2
            x = torch.cat((x,x),1)
            print(x.shape)
        
        lin = torch.nn.Linear(n1, output_dim)
        lin_layers.append(lin)
        x = lin(x)
        print(x.shape)
        
        self.conv_layers_1 = nn.ModuleList(conv_layers_1)
        self.conv_layers_2 = nn.ModuleList(conv_layers_2)
        self.lin_layers_1 = nn.ModuleList(lin_layers_1)
        self.lin_layers_2 = nn.ModuleList(lin_layers_2)
        self.lin_layers = nn.ModuleList(lin_layers)
        
        drop_layers = []
        if not isinstance(cfg.dropout, (list, tuple)):
            drop_layers.append(nn.Dropout(p=cfg.dropout))
        else:
            for drop in cfg.dropout:
                drop_layers.append(nn.Dropout(p=drop))
        self.drop_layers = nn.ModuleList(drop_layers)
Example #29
    print("Using CPU")
    device = "cpu"

aug = Augment(Chords())
config = yaml.safe_load(open("./config/config.yaml"))
sr = config['preprocess']['sample_rate']
hop_size = config['preprocess']['hop_size']
window_size = config['preprocess']['window_size']
song_hz = config['preprocess']['song_hz']
save_dir = config['preprocess']['save_dir']
cqt_layer = Spectrogram.CQT(device=device,
                            sr=sr,
                            hop_length=hop_size,
                            fmin=220,
                            fmax=None,
                            n_bins=108,
                            bins_per_octave=24,
                            norm=1,
                            window='hann',
                            center=True,
                            pad_mode='reflect')
p = Preprocess(sr, hop_size, song_hz, window_size, save_dir, aug, cqt_layer)

num_epochs = config['model'].get('num_epochs')


def get_data():
    datasets = {
        "isophonics-beetles": {
            "mp3": config['preprocess']['data_path'] + "/beetles_albums",
            "labels":
Example #30
elif args.device == "GPU":
    device = "cuda:0"
    print("using GPU")
elif args.device == "librosa":
    print("using librosa")

print(Path(__file__).parent / './y_list.npy')

y_list = np.load(Path(__file__).parent / './y_list.npy')

if args.device in ["CPU", "GPU"]:
    y_torch = torch.tensor(y_list, device=device).float()

    spec_layer = Spectrogram.CQT1992v2(sr=44100,
                                       n_bins=84,
                                       bins_per_octave=24,
                                       fmin=55,
                                       device=device)
    timing = []
    for e in range(20):
        t_start = time.time()
        spec = spec_layer(y_torch[:1000])
        spec = spec_layer(y_torch[1000:])
        time_used = time.time() - t_start
        #     print(time_used)
        timing.append(time_used)

    print("mean = ", np.mean(timing))
    print("std = ", np.std(timing))

    data = pd.DataFrame(timing, columns=['t_avg'])