Exemple #1
0
    def __init__(self,
                 data_dir='/home/syl20/data/en/librispeech',
                 train_set='train-clean-5',
                 val_set='dev-clean-2',
                 test_set='dev-clean-2',
                 batch_size=64,
                 num_workers=60,
                 sample_rate=16000,
                 n_mels=128,
                 freq_mask_param=15,
                 time_mask_param=35):
        """Store dataset/loader settings and build the mel feature pipelines.

        The validation/test pipeline is a plain ``MelSpectrogram``; the
        training pipeline additionally applies SpecAugment-style frequency
        and time masking.
        """
        super().__init__()

        # Dataset location and split names.
        self.data_dir = data_dir
        self.train_set = train_set
        self.val_set = val_set
        self.test_set = test_set

        # DataLoader configuration.
        self.batch_size = batch_size
        self.num_workers = num_workers

        # Feature-extraction configuration.
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.freq_mask_param = freq_mask_param
        self.time_mask_param = time_mask_param

        self.val_transform = MelSpectrogram(sample_rate=sample_rate,
                                            n_mels=n_mels)
        self.train_transform = nn.Sequential(
            MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels),
            FrequencyMasking(freq_mask_param=freq_mask_param),
            TimeMasking(time_mask_param=time_mask_param),
        )
Exemple #2
0
 def __init__(self, sample_rate=44100, n_fft=int(400 / 16000 * 44100)):
     """Build mel front-ends (plain and VTLP-augmented) plus a delta transform.

     The default ``n_fft`` rescales a 400-sample window at 16 kHz to the
     44.1 kHz default sample rate.
     """
     super().__init__()
     mel_kwargs = dict(n_mels=80, sample_rate=sample_rate, n_fft=n_fft)
     self.spec_transform = MelSpectrogram(**mel_kwargs)
     # Same spectrogram wrapped with vocal-tract-length perturbation.
     self.vtlp_transform = apply_vtlp(MelSpectrogram(**mel_kwargs))
     self.delta_transform = ComputeDeltas()
def collate_fn(data, device=device):
    """Stack a batch of waveforms into min-max-normalised mel spectrograms.

    Args:
        data: sequence of equal-length waveform tensors.
        device: kept for interface compatibility (unused in the body).

    Returns:
        Tensor of dB-scaled mel spectrograms rescaled to [0, 1].
    """
    # NOTE(review): ``sample_rate`` is a module-level global — confirm it is
    # defined wherever this collate function is used.
    data = torch.stack(data)
    x = MelSpectrogram(sample_rate=sample_rate)(data)
    x = AmplitudeToDB(stype='power', top_db=80)(x)
    maxval = x.max()
    minval = x.min()
    span = maxval - minval
    # Guard against a constant spectrogram: the unguarded min-max division
    # produced NaNs when max == min.
    if span == 0:
        return torch.zeros_like(x)
    return (x - minval) / span
Exemple #4
0
 def __init__(self, sample_rate, n_fft, top_db, max_perc):
     """Set up the STFT / time-stretch / mel pipeline for tempo augmentation.

     Args:
         sample_rate: audio sample rate in Hz.
         n_fft: FFT size of the spectrogram front-end.
         top_db: clipping threshold for the amplitude-to-dB stage.
         max_perc: maximum relative stretch; rates come from
             U(1 - max_perc, 1 + max_perc).
     """
     super().__init__()
     n_freq = n_fft // 2 + 1
     self.time_stretch = TimeStretch(hop_length=None, n_freq=n_freq)
     self.stft = Spectrogram(n_fft=n_fft, power=None)
     self.com_norm = ComplexNorm(power=2.)
     self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
     self.AtoDB = AmplitudeToDB(top_db=top_db)
     self.dist = Uniform(1. - max_perc, 1 + max_perc)
Exemple #5
0
 def __init__(self, sample_rate: int, mel_size: int, n_fft: int, win_length: int,
              hop_length: int, mel_min: float = 0., mel_max: float = None):
     """Wrap a Hann-windowed ``MelSpectrogram`` with *mel_size* filterbanks."""
     super().__init__()
     self.mel_size = mel_size
     self.melfunc = MelSpectrogram(
         sample_rate=sample_rate,
         n_fft=n_fft,
         win_length=win_length,
         hop_length=hop_length,
         f_min=mel_min,
         f_max=mel_max,
         n_mels=mel_size,
         window_fn=torch.hann_window,
     )
Exemple #6
0
    def test_melspectrogram(self, track_ids: Tensor):
        """Compare torchaudio and librosa mel-spectrogram shapes per track.

        Tracks whose audio file is missing are skipped.  Only the output
        sizes are asserted equal, not the values.
        """
        transform = MelSpectrogram(sample_rate=SAMPLE_RATE,
                                   n_fft=WINDOW_SIZE,
                                   hop_length=WINDOW_STRIDE,
                                   n_mels=N_MELS)

        def file_exists(track_id: int) -> bool:
            return os.path.isfile(get_audio_path_default(track_id))

        def get_melspectrogram_torchaudio(track_id: int) -> Tensor:
            path = get_audio_path_default(track_id)
            # Keep only channel 2 ('remix', '2') and resample to the
            # reference rate so both libraries see comparable input.
            effects = [
                ['remix', '2'],
                ['rate', str(SAMPLE_RATE)],
            ]
            waveform, _ = torchaudio.sox_effects.apply_effects_file(
                path, effects)
            return transform(waveform)[0]

        def get_melspectrogram_librosa(track_id: int) -> Tensor:
            new_input, sample_rate = librosa.load(
                get_audio_path_default(track_id))
            # librosa requires the signal as keyword ``y``; the positional
            # form is deprecated and removed in librosa >= 0.10.
            return torch.tensor(
                librosa.feature.melspectrogram(y=new_input, **MEL_KWARGS))

        for batch in track_ids:
            for track_id in batch:
                if not file_exists(track_id):
                    continue
                melspectrogram_torchaudio = get_melspectrogram_torchaudio(
                    track_id)
                melspectrogram_librosa = get_melspectrogram_librosa(track_id)
                self.assertEqual(melspectrogram_torchaudio.size(),
                                 melspectrogram_librosa.size())
Exemple #7
0
 def __init__(self,
              sample_rate: int = 16000,
              input_stack_rate: int = 1,
              model_stack_rate: int = 1,
              max_frames: int = 3000,
              target_tokenizer: Tokenizer = None,
              target_token_indexers: Dict[str, TokenIndexer] = None,
              target_add_start_end_token: bool = False,
              lazy: bool = False) -> None:
     """Dataset reader producing 80-bin mel features with frame stacking.

     Uses 25 ms analysis windows with a 10 ms hop; ``n_fft`` equals the
     window length.
     """
     super().__init__(lazy)

     # Target-side text processing.
     self._target_tokenizer = target_tokenizer or WordTokenizer()
     self._target_token_indexers = target_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._target_add_start_end_token = target_add_start_end_token

     # Frame stacking / padding configuration.
     self.input_stack_rate = input_stack_rate
     self.model_stack_rate = model_stack_rate
     self.stack_rate = input_stack_rate * model_stack_rate
     self._pad_mode = "wrap" if input_stack_rate == 1 else "constant"
     self._max_frames = max_frames
     self._epoch_num = 0

     # 25 ms window, 10 ms hop at the given sample rate.
     self._sample_rate = sample_rate
     win_length = int(sample_rate * 0.025)
     hop_length = int(sample_rate * 0.01)
     n_fft = win_length
     self._mel_spectrogram = MelSpectrogram(sample_rate,
                                            n_fft,
                                            win_length=win_length,
                                            hop_length=hop_length,
                                            n_mels=80)
Exemple #8
0
 def __init__(self, sample_rate: int = 16000, n_mels: int = 40, masking=True):
     """Log-mel front-end with optional SpecAugment-style masking."""
     super(LogMelSpectrogram, self).__init__()
     self.transform = MelSpectrogram(sample_rate=sample_rate,
                                     n_mels=n_mels,
                                     n_fft=1024,
                                     hop_length=256,
                                     f_min=0,
                                     f_max=8000)
     self.masking = masking
     # Masking transforms are only created when enabled.
     if masking:
         self.freq_masking = FrequencyMasking(freq_mask_param=10)
         self.time_masking = TimeMasking(time_mask_param=30)
Exemple #9
0
def main(args):
    """Synthesise the first LJSpeech utterance with a pretrained WaveRNN
    vocoder and write the result to ``args.output_wav_path``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Reference waveform: first item of the LJSpeech dataset.
    waveform, sample_rate, _, _ = LJSPEECH("./", download=True)[0]

    # Mel front-end matching the vocoder's training configuration.
    transforms = torch.nn.Sequential(
        MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=2048,
            f_min=40.,
            n_mels=80,
            win_length=1100,
            hop_length=275,
            mel_scale='slaney',
            norm='slaney',
            power=1,
        ),
        NormalizeDB(min_level_db=-100, normalization=True),
    )
    mel_specgram = transforms(waveform)

    wavernn_model = wavernn(args.checkpoint_name).eval().to(device)
    wavernn_inference_model = WaveRNNInferenceWrapper(wavernn_model)
    if args.jit:
        wavernn_inference_model = torch.jit.script(wavernn_inference_model)

    with torch.no_grad():
        output = wavernn_inference_model(mel_specgram.to(device),
                                         mulaw=(not args.no_mulaw),
                                         batched=(not args.no_batch_inference),
                                         timesteps=args.batch_timesteps,
                                         overlap=args.batch_overlap)

    torchaudio.save(args.output_wav_path, output, sample_rate=sample_rate)
    def __init__(self, sample_rate=16000, n_fft=401, hop_length=256, n_mels=23, context_size=7, subsample=16):
        """Mel front-end with replication padding for framewise context windows."""
        super(LogMel, self).__init__()
        # NOTE(review): despite the attribute name, this is a mel
        # spectrogram, not a raw STFT.
        self.stft = MelSpectrogram(sample_rate=sample_rate,
                                   n_fft=n_fft,
                                   hop_length=hop_length,
                                   n_mels=n_mels)
        self.pad = nn.ReplicationPad1d(padding=context_size)
        self.context_size = context_size
        self.subsample = subsample
Exemple #11
0
class RondomStretchMelSpectrogram(nn.Module):
    """Mel-spectrogram front-end with random time stretching (train only).

    The output is cropped or zero-padded along the time axis to exactly
    157 frames.
    """

    def __init__(self, sample_rate, n_fft, top_db, max_perc):
        super().__init__()
        self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft // 2 + 1)
        self.stft = Spectrogram(n_fft=n_fft, power=None)
        self.com_norm = ComplexNorm(power=2.)
        self.mel_specgram = MelSpectrogram(sample_rate,
                                           n_fft=n_fft,
                                           f_max=8000)
        self.AtoDB = AmplitudeToDB(top_db=top_db)
        self.dist = Uniform(1. - max_perc, 1 + max_perc)

    def forward(self, x, train):
        x = self.stft(x)
        if train:
            # Random tempo perturbation drawn from U(1-max_perc, 1+max_perc).
            x = self.time_stretch(x, self.dist.sample().item())
        x = self.com_norm(x)
        x = self.mel_specgram.mel_scale(x)
        x = self.AtoDB(x)

        # Fix the time dimension at 157 frames: crop long inputs, zero-pad
        # short ones.  Padding uses ``new_zeros`` so it follows x's device
        # and dtype — the original ``torch.cuda.FloatTensor`` crashed on
        # CPU tensors.
        n_frames = x.size(3)
        if n_frames > 157:
            x = x[:, :, :, 0:157]
        else:
            pad = x.new_zeros(x.size(0), x.size(1), x.size(2), 157 - n_frames)
            x = torch.cat([x, pad], dim=3)

        return x
Exemple #12
0
    def __init__(self,
                 sample_rate=20000,
                 use_spectrogram=False,
                 window_size=512,
                 hop_length=256,
                 n_fft=None,
                 pad=0,
                 n_mels=40,
                 root='data/chopped',
                 n_files=None):
        """Index the audio files under *root* and build the mel transform.

        NOTE(review): ``MelSpectrogram`` is called with legacy torchaudio
        keyword names (``sr``/``ws``/``hop``); current releases use
        ``sample_rate``/``win_length``/``hop_length`` — confirm the pinned
        torchaudio version.  ``n_files`` is accepted but not used here.
        """
        self.root = root
        self.files = os.listdir(root)

        # Feature-extraction configuration.
        self.use_spectrogram = use_spectrogram
        self.sample_rate = sample_rate
        self.window_size = window_size
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.pad = pad
        self.n_mels = n_mels

        self.mel_spec = MelSpectrogram(sr=sample_rate,
                                       ws=window_size,
                                       hop=hop_length,
                                       n_fft=n_fft,
                                       pad=pad,
                                       n_mels=n_mels)
Exemple #13
0
    def __init__(self,
                 output_class=264,
                 d_size=256,
                 sample_rate=32000,
                 n_fft=2**11,
                 top_db=80):
        """CNN + LSTM classifier over a mel-spectrogram front-end.

        Args:
            output_class: number of target classes.
            d_size: accepted but not used in this constructor — TODO confirm.
            sample_rate: audio sample rate for the mel transform.
            n_fft: FFT size for the mel transform.
            top_db: clipping threshold for the amplitude-to-dB stage.
        """

        super().__init__()
        # Feature front-end: mel spectrogram followed by dB conversion.
        self.mel = MelSpectrogram(sample_rate, n_fft=n_fft)
        self.norm_db = AmplitudeToDB(top_db=top_db)

        # NOTE(review): nn.ReLU's only argument is the boolean ``inplace``
        # flag, so ReLU(0.1) merely sets inplace=True — these were likely
        # meant to be nn.LeakyReLU(0.1); confirm before changing.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
        self.bn1 = nn.BatchNorm2d(32)
        self.relu = nn.ReLU(0.1)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=3)
        self.dropout = nn.Dropout(0.1)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(0.1)
        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=3)
        self.dropout2 = nn.Dropout(0.1)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
        self.bn3 = nn.BatchNorm2d(128)
        self.relu3 = nn.ReLU(0.1)
        self.maxpool3 = nn.MaxPool2d(kernel_size=3, stride=3)
        self.dropout3 = nn.Dropout(0.1)

        # Recurrent head over the pooled feature maps.
        self.lstm = nn.LSTM(12, 128, 2, batch_first=True)
        self.dropout_lstm = nn.Dropout(0.3)
        self.bn_lstm = nn.BatchNorm1d(128)

        self.output = nn.Linear(128, output_class)
def spectrogram(trace: Trace):
    """Convert a seismic trace into a min-shifted mel-spectrogram array.

    The trace is resampled, demeaned, tapered and trimmed/padded to a
    fixed window before the mel transform.  Module globals
    ``sampling_rate``, ``image_height``, ``hop_length`` and
    ``sequence_length_second`` configure the output geometry.
    """
    trace.resample(sampling_rate)

    mel_spec = MelSpectrogram(sample_rate=sampling_rate,
                              n_mels=image_height,
                              hop_length=hop_length,
                              power=1,
                              pad_mode='reflect',
                              normalized=True)
    amplitude_to_db = AmplitudeToDB()

    # Remove the mean, taper the edges, and force a fixed-length window
    # (zero-filled when the trace is too short).
    trace.data = trace.data - np.mean(trace.data)
    trace = trace.taper(max_length=0.01, max_percentage=0.05)
    trace = trace.trim(starttime=trace.stats.starttime,
                       endtime=trace.stats.starttime + sequence_length_second,
                       pad=True,
                       fill_value=0)

    samples = torch.tensor(trace.data).type(torch.float32)
    spec = mel_spec(samples)
    # Small additive offset avoids log(0) in the dB conversion; the result
    # is shifted so its minimum is zero.
    spec_db = amplitude_to_db(spec.abs() + 1e-3)
    return (spec_db - spec_db.min()).numpy()
Exemple #15
0
    def __init__(self,
                 x_shape,
                 sr=44100,
                 n_fft=1024,
                 n_mels=256,
                 win_len=256,
                 hop_len=128):
        """Mel-spectrogram preprocessing with a precomputed output shape.

        Args:
            x_shape: per-example input shape (without the batch dimension);
                must have 1-3 dimensions.
            sr, n_fft, n_mels, win_len, hop_len: STFT / filterbank settings.
        """
        super(ProcessMelSpectrogram, self).__init__()
        self.mel_s = MelSpectrogram(sample_rate=sr,
                                    n_fft=n_fft,
                                    n_mels=n_mels,
                                    win_length=win_len,
                                    hop_length=hop_len)
        self.a_to_db = AmplitudeToDB(top_db=80)

        self.x_shape = [-1] + list(x_shape)
        assert len(self.x_shape) in [2, 3, 4]

        # Number of STFT frames is ceil(num_samples / hop_len).
        num_samples = np.prod(self.x_shape[1:])
        spec_width = -(-num_samples // hop_len)
        self.output_shape = [self.x_shape[0], 1, n_mels, spec_width]
Exemple #16
0
    def __init__(self, 
                 output_class=264,
                 d_size=256,
                 sample_rate=32000, 
                 n_fft=2**11, 
                 top_db=80):
        """CNN + LSTM classifier over a mel-spectrogram front-end.

        Args:
            output_class: number of target classes.
            d_size: accepted but not used in this constructor — TODO confirm.
            sample_rate: audio sample rate for the mel transform.
            n_fft: FFT size for the mel transform.
            top_db: clipping threshold for the amplitude-to-dB stage.
        """
        
        super().__init__()
        # Feature front-end: mel spectrogram followed by dB conversion.
        self.mel = MelSpectrogram(sample_rate, n_fft=n_fft)
        self.norm_db = AmplitudeToDB(top_db=top_db)

        # NOTE(review): nn.ReLU's only argument is the boolean ``inplace``
        # flag, so ReLU(0.1) merely sets inplace=True — these were likely
        # meant to be nn.LeakyReLU(0.1); confirm before changing.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0])
        self.bn1 = nn.BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.relu = nn.ReLU(0.1)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
        self.dropout = nn.Dropout(0.1)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0])
        self.bn2 = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.relu2 = nn.ReLU(0.1)
        self.maxpool2 = nn.MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
        self.dropout2 = nn.Dropout(0.1)
        
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0])
        self.bn3 = nn.BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.relu3 = nn.ReLU(0.1)
        self.maxpool3 = nn.MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
        self.dropout3 = nn.Dropout(0.1)
        
        # Recurrent head over the pooled feature maps.
        self.lstm = nn.LSTM(4, 128, 2, batch_first=True)
        self.dropout_lstm = nn.Dropout(0.3)
        self.bn_lstm = nn.BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        
        self.output = nn.Linear(128, output_class, bias=True)
 def __init__(self):
     """Pair torchaudio and nnAudio mel front-ends (8 kHz) with a learnable
     [128, 81] mask parameter."""
     super(MelnnAudio, self).__init__()
     # torchaudio implementation with default settings.
     self.ta = MelSpectrogram(sample_rate=8000)
     # nnAudio implementation, pinned to CPU, no filterbank normalisation.
     self.nna = nnASpectrogram.MelSpectrogram(sr=8000,
                                              n_fft=400,
                                              device='cpu',
                                              norm=None)
     self.mask = torch.nn.Parameter(torch.ones([128, 81]))
 def __init__(self, arch, num_classes=10):
     """Mel-spectrogram front-end followed by a backbone looked up by name
     in ``Models``."""
     super(ModelCalled, self).__init__()
     # Together with FolderDataset.duration these settings yield mel images
     # of shape (128, 128); remember to keep f_max=8000.
     self.melspectrogram = MelSpectrogram(sample_rate=16384,
                                          n_fft=2048,
                                          hop_length=512,
                                          f_max=8000,
                                          n_mels=128)
     self.power2db = AmplitudeToDB(stype='power')
     self.model = Models.__dict__[arch](num_classes=num_classes)
Exemple #19
0
    def __init__(self, settings: AudioTransformSettings = AudioTransformSettings()):
        """Build the spectrogram pipelines described by *settings*.

        NOTE(review): the default ``AudioTransformSettings()`` instance is
        shared across calls (mutable default) — harmless only if it is
        treated as read-only.
        """
        super().__init__()
        mel_kwargs = dict(n_mels=settings.num_mels,
                          sample_rate=settings.sample_rate,
                          n_fft=settings.num_fft,
                          hop_length=settings.hop_length)
        # Either a Meyda-compatible or a torchaudio mel front-end.
        if settings.use_meyda_spectrogram:
            self.spec_transform = MeydaMelSpectrogram(**mel_kwargs)
        else:
            self.spec_transform = MelSpectrogram(**mel_kwargs)
        # VTLP-augmented variant of the same transform.
        self.vtlp_transform = apply_vtlp(MelSpectrogram(**mel_kwargs))
        self.delta_transform = ComputeDeltas()
Exemple #20
0
class RondomStretchMelSpectrogram(nn.Module):
    """Mel front-end with random resampling, time stretching and masking.

    Training mode applies a random tempo stretch plus frequency/time
    masking; both modes crop or zero-pad the output to 157 frames.
    """

    def __init__(self, sample_rate, n_fft, top_db, max_perc):
        super().__init__()
        self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft // 2 + 1)
        self.stft = Spectrogram(n_fft=n_fft, power=None)
        self.com_norm = ComplexNorm(power=2.)
        self.fm = FrequencyMasking(100)
        self.tm = TimeMasking(100)
        self.mel_specgram = MelSpectrogram(sample_rate,
                                           n_fft=n_fft,
                                           f_max=8000)
        self.AtoDB = AmplitudeToDB(top_db=top_db)
        self.max_perc = max_perc
        self.sample_rate = sample_rate
        # Fixed bank of resamplers covering 0.6x .. 1.4x playback speed.
        self.resamples = [
            Resample(sample_rate, sample_rate * ratio)
            for ratio in (0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4)
        ]

    def forward(self, x, train):
        # Random speed perturbation via one of the fixed resamplers.
        x = random.choice(self.resamples)(x)
        x = self.stft(x)

        if train:
            # Fresh stretch distribution each call, as in the original code.
            dist = Uniform(1. - self.max_perc, 1 + self.max_perc)
            x = self.time_stretch(x, dist.sample().item())
            x = self.com_norm(x)
            x = self.fm(x, 0)
            x = self.tm(x, 0)
        else:
            x = self.com_norm(x)

        x = self.mel_specgram.mel_scale(x)
        x = self.AtoDB(x)

        # Crop/zero-pad the time axis to exactly 157 frames.  ``new_zeros``
        # matches x's device and dtype — the original
        # ``torch.cuda.FloatTensor`` crashed on CPU tensors.
        n_frames = x.size(3)
        if n_frames > 157:
            x = x[:, :, :, 0:157]
        else:
            pad = x.new_zeros(x.size(0), x.size(1), x.size(2), 157 - n_frames)
            x = torch.cat([x, pad], dim=3)

        return x
Exemple #21
0
class MelspectrogramStretch(object):
    """Pipeline producing a per-example standardised mel spectrogram.

    Builds a complex STFT, a power-2 complex norm and a mel filterbank at
    44.1 kHz; ``forward`` pushes raw audio through them and normalises the
    result independently per batch element.
    """

    def __init__(self):

        # Fixed front-end configuration: 44.1 kHz, 128 mel bins,
        # 2048-point FFT with 50% overlap for the mel transform.
        sample_rate = 44100
        num_mels = 128
        fft_length = 2048
        hop_length = fft_length // 2

        # Complex spectrogram (power=None keeps the raw complex output).
        self.stft = Spectrogram(n_fft=fft_length,
                                win_length=fft_length,
                                hop_length=None,
                                pad=0,
                                power=None,
                                normalized=False)

        # Only the mel_scale sub-module of this transform is used below.
        self.mst = MelSpectrogram(sample_rate=sample_rate,
                                  n_fft=fft_length,
                                  hop_length=hop_length,
                                  n_mels=num_mels)

        # Normalization (pot spec processing)
        self.complex_norm = ComplexNorm(power=2.)

    def forward(self, data):
        """Return ``(mel, lengths)`` for raw audio *data*.

        The audio is first processed by the project's ``AudioTransforms``;
        the mel spectrogram is then standardised (zero mean, unit std) per
        batch element and returned as a numpy array.
        """
        tsf = AudioTransforms()
        sig_t, sr, _ = tsf.apply(data, None)

        length = torch.tensor(sig_t.size(0))
        sr = torch.tensor(sr)
        data = [d.unsqueeze(0).to("cpu") for d in [sig_t, length, sr]]

        # x-> (batch, time, channel)
        x, lengths, _ = data  # unpacking seqs, lengths and srs
        # x-> (batch, channel, time)
        xt = x.float().transpose(1, 2)
        # xt -> (batch, channel, freq, time)
        x = self.stft(xt)
        # x -> (fft_length//2+1,bins,channel)

        #print(x.shape)  #torch.Size([1, 1, 1025, 173, 2])
        x = self.complex_norm(x)
        #print(x.shape)  #torch.Size([1, 1, 1025, 173])
        x = self.mst.mel_scale(x)
        #print(x.shape)  #torch.Size([1, 1, 128, 173])

        # Normalize melspectrogram
        # Independent mean, std per batch
        non_batch_inds = [1, 2, 3]
        mean = x.mean(non_batch_inds, keepdim=True)
        std = x.std(non_batch_inds, keepdim=True)
        x = (x - mean) / std

        x = x.to('cpu').detach().numpy().copy()

        lengths = [x.shape[3]]
        return x, lengths
Exemple #22
0
 def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels):
     """Mel-spectrogram factory mirroring TacotronSTFT frequency bounds."""
     mel_config = {
         'sample_rate': sample_rate,
         'n_fft': n_fft,
         'win_length': win_length,
         'hop_length': hop_length,
         'n_mels': n_mels,
         # TacotronSTFT.mel_fmin — matches the torchaudio default.
         'f_min': 0.0,
         # TacotronSTFT.mel_fmax — torchaudio's default would be None.
         'f_max': 8000.0,
     }
     self._factory = MelSpectrogram(**mel_config)
Exemple #23
0
    def __init__(self, sample_rate, n_fft, hop_length, n_mels, top_db=None):
        """Mel spectrogram followed by amplitude-to-dB conversion."""
        super().__init__()
        self.mel_spectrogram = MelSpectrogram(sample_rate=sample_rate,
                                              n_fft=n_fft,
                                              hop_length=hop_length,
                                              n_mels=n_mels)
        # top_db=None leaves the dB output unclipped.
        self.amplitude_to_db = AmplitudeToDB(top_db=top_db)
    def __init__(self):
        """Stack of seven SampleBlocks on top of an 8 kHz mel front-end."""
        super(SampleMel, self).__init__()
        # NOTE(review): named ``mfcc`` but is a MelSpectrogram, not MFCC.
        self.mfcc = MelSpectrogram(sample_rate=8000)

        # First block adapts the single input channel with stride 3; the
        # remaining six use the defaults.
        self.sb1 = SampleBlock(in_unit=1, stride=3)
        for idx in range(2, 8):
            setattr(self, 'sb%d' % idx, SampleBlock())
def extract_audio_features_v2_chunks(filename):
    """Load *filename*, compute a log-mel spectrogram and split it into
    fixed-length chunks.

    Module globals SAMPLE_RATE, WINDOW_SIZE, WINDOW_STRIDE, N_MELS and
    MINUTE_LENGTH configure the transform and chunking.
    """
    import torch
    # Keep only channel 2 ('remix', '2') and resample to SAMPLE_RATE.
    effects = [['remix', '2'], ['rate', str(SAMPLE_RATE)]]
    waveform, _ = torchaudio.sox_effects.apply_effects_file(filename, effects)
    ms = MelSpectrogram(sample_rate=SAMPLE_RATE,
                        n_fft=WINDOW_SIZE,
                        hop_length=WINDOW_STRIDE,
                        n_mels=N_MELS)(waveform)[0]
    # Exact zeros are replaced with 1e6 before the log — presumably to
    # keep the log finite / flag silent bins; TODO confirm the intent.
    ms[ms == 0] = 1e6
    # NOTE(review): the trailing comma makes this return a 1-tuple
    # containing the tensor, not the tensor itself — confirm callers
    # expect that before removing it.
    return torch.tensor(
        split_equal_chunks(torch.log(ms).tolist(), MINUTE_LENGTH)),
 def __init__(self):
     """Minimal module: a default MelSpectrogram plus a 12->12 linear layer."""
     super().__init__()
     self.ms = MelSpectrogram()
     from torch.nn import Linear
     self.l = Linear(12, 12)
 def __init__(self, alpha=0.5):
     """Deep-supervised sample network with a WaveNet vocoder head.

     Args:
         alpha: weighting factor stored for later use by the training loop.
     """
     super(DeepSupervisedMel, self).__init__()
     self.mel = MelSpectrogram(sample_rate=8000)
     self.alpha = alpha
     # First block adapts the single input channel; the rest use defaults.
     self.sb1 = SampleBlock(in_unit=1, stride=3)
     for idx in range(2, 8):
         setattr(self, 'sb%d' % idx, SampleBlock())
     self.wn = WavenetVocoder()
Exemple #28
0
    def __init__(self):
        """STFT + mel filterbank + complex-norm pipeline at 44.1 kHz."""
        sample_rate = 44100
        num_mels = 128
        fft_length = 2048

        # Complex spectrogram (power=None keeps the raw complex output).
        self.stft = Spectrogram(n_fft=fft_length,
                                win_length=fft_length,
                                hop_length=None,
                                pad=0,
                                power=None,
                                normalized=False)

        # Mel transform with half-overlap hop.
        self.mst = MelSpectrogram(sample_rate=sample_rate,
                                  n_fft=fft_length,
                                  hop_length=fft_length // 2,
                                  n_mels=num_mels)

        # Power-2 norm for the complex spectrogram (post-spec processing).
        self.complex_norm = ComplexNorm(power=2.)
Exemple #29
0
def create_mel_tensors(path):
    """Convert every .wav under *path* into a saved mel-spectrogram tensor.

    Each output is written to ``./data/<first-char-of-filename>/`` with the
    extension swapped from ``.wav`` to ``.pt``.

    Fixes over the original: directories are created race-free with
    ``os.makedirs(..., exist_ok=True)`` instead of an exists-check plus
    ``os.mkdir``, and the save path is built with ``os.path.join``.
    """
    audio_dir = [f for f in os.listdir(path) if '.wav' in f]
    direct = './data/'
    for aud in audio_dir:
        pat = aud[0]
        path2 = direct + pat.replace('_', '') + '/'
        waveform, sample_rate = torchaudio.load(os.path.join(path, aud))

        os.makedirs(path2, exist_ok=True)
        # The sample rate comes from each file, so the transform must be
        # rebuilt per file rather than hoisted out of the loop.
        mel_spec = MelSpectrogram(sample_rate)(waveform)
        torch.save(mel_spec, os.path.join(path2, aud.replace('.wav', '.pt')))
Exemple #30
0
    def __init__(self, sample_rate: int, mel_size: int, n_fft: int, win_length: int,
                 hop_length: int, min_db: float, max_db: float,
                 mel_min: float = 0., mel_max: float = None):
        """Mel front-end with dB bounds pre-converted to the natural-log domain."""
        super().__init__()
        self.mel_size = mel_size
        # Convert dB thresholds to natural-log power: ln(10 ** (db / 10)).
        self.min_db = np.log(np.power(10, min_db / 10))
        self.max_db = np.log(np.power(10, max_db / 10))

        self.melfunc = MelSpectrogram(sample_rate=sample_rate,
                                      n_fft=n_fft,
                                      win_length=win_length,
                                      hop_length=hop_length,
                                      f_min=mel_min,
                                      f_max=mel_max,
                                      n_mels=mel_size,
                                      window_fn=torch.hann_window)