def __init__(self, data_dir='/home/syl20/data/en/librispeech', train_set='train-clean-5',
             val_set='dev-clean-2', test_set='dev-clean-2', batch_size=64, num_workers=60,
             sample_rate=16000, n_mels=128, freq_mask_param=15, time_mask_param=35):
    """Store dataset/loader settings and build the mel-spectrogram transforms.

    The validation transform is a plain MelSpectrogram; the training
    transform additionally applies SpecAugment-style frequency and time
    masking.
    """
    super().__init__()
    # Dataset location and split names.
    self.data_dir = data_dir
    self.train_set = train_set
    self.val_set = val_set
    self.test_set = test_set
    # DataLoader settings.
    self.batch_size = batch_size
    self.num_workers = num_workers
    # Feature-extraction settings.
    self.sample_rate = sample_rate
    self.n_mels = n_mels
    self.freq_mask_param = freq_mask_param
    self.time_mask_param = time_mask_param
    self.val_transform = MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels)
    self.train_transform = nn.Sequential(
        MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels),
        FrequencyMasking(freq_mask_param=freq_mask_param),
        TimeMasking(time_mask_param=time_mask_param),
    )
def __init__(self, sample_rate=44100, n_fft=int(400 / 16000 * 44100)):
    """Build plain, VTLP-augmented and delta mel transforms.

    The default n_fft scales the canonical 400-sample window at 16 kHz up
    to 44.1 kHz. NOTE: the default n_fft does not track a caller-supplied
    sample_rate — pass n_fft explicitly for other rates.
    """
    super().__init__()
    mel_kwargs = dict(n_mels=80, sample_rate=sample_rate, n_fft=n_fft)
    self.spec_transform = MelSpectrogram(**mel_kwargs)
    self.vtlp_transform = apply_vtlp(MelSpectrogram(**mel_kwargs))
    self.delta_transform = ComputeDeltas()
def collate_fn(data, device=device):
    """Stack waveforms and return a min-max-normalised log-mel batch.

    Relies on the module-level `sample_rate` (and `device` for the default
    argument). Output values lie in [0, 1].
    """
    data = torch.stack(data)
    x = MelSpectrogram(sample_rate=sample_rate)(data)
    x = AmplitudeToDB(stype='power', top_db=80)(x)
    maxval = x.max()
    minval = x.min()
    # Guard against a constant batch (max == min): the original divided by
    # zero here. A constant batch normalises to all zeros.
    span = maxval - minval
    if span == 0:
        return torch.zeros_like(x)
    return (x - minval) / span
def __init__(self, sample_rate, n_fft, top_db, max_perc):
    """Set up STFT, time-stretch, mel-scale and dB transforms."""
    super().__init__()
    n_freq = n_fft // 2 + 1
    self.time_stretch = TimeStretch(hop_length=None, n_freq=n_freq)
    self.stft = Spectrogram(n_fft=n_fft, power=None)
    self.com_norm = ComplexNorm(power=2.)
    self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
    self.AtoDB = AmplitudeToDB(top_db=top_db)
    # Stretch factors are drawn uniformly from [1 - max_perc, 1 + max_perc].
    self.dist = Uniform(1. - max_perc, 1 + max_perc)
def __init__(self, sample_rate: int, mel_size: int, n_fft: int, win_length: int,
             hop_length: int, mel_min: float = 0., mel_max: float = None):
    """Wrap torchaudio's MelSpectrogram with a Hann window and mel bounds."""
    super().__init__()
    self.mel_size = mel_size
    mel_config = dict(sample_rate=sample_rate,
                      n_fft=n_fft,
                      win_length=win_length,
                      hop_length=hop_length,
                      f_min=mel_min,
                      f_max=mel_max,
                      n_mels=mel_size,
                      window_fn=torch.hann_window)
    self.melfunc = MelSpectrogram(**mel_config)
def test_melspectrogram(self, track_ids: Tensor):
    """Check that torchaudio and librosa mel spectrograms agree in shape.

    For every track id in every batch (skipping ids whose audio file is
    missing), computes a mel spectrogram via torchaudio sox effects and
    via librosa, then asserts the two tensors have the same size. Relies
    on module-level SAMPLE_RATE, WINDOW_SIZE, WINDOW_STRIDE, N_MELS and
    MEL_KWARGS.
    """
    def file_exists(track_id: int) -> bool:
        return os.path.isfile(get_audio_path_default(track_id))

    def get_melspectrogram_torchaudio(track_id: int) -> Tensor:
        path = get_audio_path_default(track_id)
        # Select channel 2 and resample to the reference rate via sox.
        effects = [
            ['remix', '2'],
            ['rate', str(SAMPLE_RATE)],
        ]
        waveform, _ = torchaudio.sox_effects.apply_effects_file(
            path, effects)
        return transform(waveform)[0]

    def get_melspectrogram_librosa(track_id: int) -> Tensor:
        # NOTE(review): librosa.load resamples to its default 22050 Hz,
        # not SAMPLE_RATE — presumably MEL_KWARGS compensates; verify.
        new_input, sample_rate = librosa.load(
            get_audio_path_default(track_id))
        return torch.tensor(
            librosa.feature.melspectrogram(new_input, **MEL_KWARGS))

    transform = MelSpectrogram(sample_rate=SAMPLE_RATE, n_fft=WINDOW_SIZE,
                               hop_length=WINDOW_STRIDE, n_mels=N_MELS)
    for batch in track_ids:
        for track_id in batch:
            if not file_exists(track_id):
                continue
            melspectrogram_torchaudio = get_melspectrogram_torchaudio(
                track_id)
            melspectrogram_librosa = get_melspectrogram_librosa(track_id)
            # Only shapes are compared, not values.
            self.assertEqual(melspectrogram_torchaudio.size(),
                             melspectrogram_librosa.size())
def __init__(self,
             sample_rate: int = 16000,
             input_stack_rate: int = 1,
             model_stack_rate: int = 1,
             max_frames: int = 3000,
             target_tokenizer: Tokenizer = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             target_add_start_end_token: bool = False,
             lazy: bool = False) -> None:
    """Dataset reader that converts audio into stacked log-mel features."""
    super().__init__(lazy)
    self._target_tokenizer = target_tokenizer or WordTokenizer()
    self._target_token_indexers = target_token_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
    self.input_stack_rate = input_stack_rate
    self.model_stack_rate = model_stack_rate
    self.stack_rate = input_stack_rate * model_stack_rate
    self._target_add_start_end_token = target_add_start_end_token
    # "wrap" padding is only valid when input frames are not stacked.
    self._pad_mode = "wrap" if input_stack_rate == 1 else "constant"
    self._max_frames = max_frames
    self._epoch_num = 0
    self._sample_rate = sample_rate
    # Standard ASR framing: 25 ms window, 10 ms hop, n_fft == window.
    frame_len = int(sample_rate * 0.025)
    frame_shift = int(sample_rate * 0.01)
    self._mel_spectrogram = MelSpectrogram(sample_rate,
                                           frame_len,
                                           win_length=frame_len,
                                           hop_length=frame_shift,
                                           n_mels=80)
def __init__(self, sample_rate: int = 16000, n_mels: int = 40, masking=True):
    """Log-mel front end with optional SpecAugment-style masking."""
    super(LogMelSpectrogram, self).__init__()
    mel_kwargs = {'sample_rate': sample_rate, 'n_mels': n_mels,
                  'n_fft': 1024, 'hop_length': 256,
                  'f_min': 0, 'f_max': 8000}
    self.transform = MelSpectrogram(**mel_kwargs)
    self.masking = masking
    if masking:
        self.freq_masking = FrequencyMasking(freq_mask_param=10)
        self.time_masking = TimeMasking(time_mask_param=30)
def main(args):
    """Synthesise audio for the first LJSPEECH utterance with WaveRNN.

    Downloads LJSPEECH, computes a normalised mel spectrogram with
    Tacotron-style parameters, runs the (optionally TorchScripted)
    WaveRNN vocoder, and writes the waveform to args.output_wav_path.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Only the first sample of the dataset is synthesised.
    waveform, sample_rate, _, _ = LJSPEECH("./", download=True)[0]
    mel_kwargs = {
        'sample_rate': sample_rate,
        'n_fft': 2048,
        'f_min': 40.,
        'n_mels': 80,
        'win_length': 1100,
        'hop_length': 275,
        'mel_scale': 'slaney',
        'norm': 'slaney',
        'power': 1,  # magnitude (not power) spectrogram
    }
    transforms = torch.nn.Sequential(
        MelSpectrogram(**mel_kwargs),
        NormalizeDB(min_level_db=-100, normalization=True),
    )
    mel_specgram = transforms(waveform)
    wavernn_model = wavernn(args.checkpoint_name).eval().to(device)
    wavernn_inference_model = WaveRNNInferenceWrapper(wavernn_model)
    if args.jit:
        wavernn_inference_model = torch.jit.script(wavernn_inference_model)
    with torch.no_grad():
        output = wavernn_inference_model(mel_specgram.to(device),
                                         mulaw=(not args.no_mulaw),
                                         batched=(not args.no_batch_inference),
                                         timesteps=args.batch_timesteps,
                                         overlap=args.batch_overlap,)
    torchaudio.save(args.output_wav_path, output, sample_rate=sample_rate)
def __init__(self, sample_rate=16000, n_fft=401, hop_length=256, n_mels=23,
             context_size=7, subsample=16):
    """Mel front end plus replication padding for context windows."""
    super(LogMel, self).__init__()
    self.stft = MelSpectrogram(sample_rate=sample_rate, n_fft=n_fft,
                               hop_length=hop_length, n_mels=n_mels)
    # Replicate edge frames so boundary frames get full context.
    self.pad = nn.ReplicationPad1d(padding=context_size)
    self.context_size = context_size
    self.subsample = subsample
class RondomStretchMelSpectrogram(nn.Module):
    """Mel spectrogram with random time-stretch augmentation at train time.

    Output is cropped or zero-padded along the frame axis to exactly 157
    frames. NOTE(review): padding uses torch.cuda.FloatTensor, so CUDA is
    required; "Rondom" is a typo kept for import compatibility.
    """

    def __init__(self, sample_rate, n_fft, top_db, max_perc):
        super().__init__()
        self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft // 2 + 1)
        self.stft = Spectrogram(n_fft=n_fft, power=None)
        self.com_norm = ComplexNorm(power=2.)
        self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
        self.AtoDB = AmplitudeToDB(top_db=top_db)
        # Stretch rates sampled uniformly from [1 - max_perc, 1 + max_perc].
        self.dist = Uniform(1. - max_perc, 1 + max_perc)

    def forward(self, x, train):
        # Complex STFT first so time-stretch can operate on phase too.
        x = self.stft(x)
        if train:
            x = self.time_stretch(x, self.dist.sample().item())
        x = self.com_norm(x)
        # Reuse only the mel filter bank of the MelSpectrogram module.
        x = self.mel_specgram.mel_scale(x)
        x = self.AtoDB(x)
        size = torch.tensor(x.size())
        # Force a fixed width of 157 frames: crop or zero-pad on the right.
        if size[3] > 157:
            x = x[:, :, :, 0:157]
        else:
            x = torch.cat([
                x,
                torch.cuda.FloatTensor(size[0], size[1], size[2],
                                       157 - size[3]).fill_(0)
            ], dim=3)
        return x
def __init__(self, sample_rate=20000, use_spectrogram=False, window_size=512,
             hop_length=256, n_fft=None, pad=0, n_mels=40, root='data/chopped',
             n_files=None):
    """Audio dataset over pre-chopped files with a configured mel transform.

    NOTE(review): `sr`, `ws` and `hop` are keyword names from a legacy
    torchaudio release (the current API uses sample_rate / win_length /
    hop_length) — confirm the pinned torchaudio version supports them.
    `n_files` is accepted but not used in this constructor.
    """
    self.root = root
    self.files = os.listdir(root)
    # Mel
    self.use_spectrogram = use_spectrogram
    self.sample_rate = sample_rate
    self.window_size = window_size
    self.hop_length = hop_length
    self.n_fft = n_fft
    self.pad = pad
    self.n_mels = n_mels
    self.mel_spec = MelSpectrogram(sr=self.sample_rate, ws=self.window_size,
                                   hop=self.hop_length, n_fft=self.n_fft,
                                   pad=self.pad, n_mels=self.n_mels)
def __init__(self, output_class=264, d_size=256, sample_rate=32000, n_fft=2**11, top_db=80):
    """Three conv blocks + LSTM classifier over dB-scaled mel spectrograms.

    Args:
        output_class: number of output classes.
        d_size: unused here; kept for interface compatibility.
        sample_rate, n_fft: MelSpectrogram settings.
        top_db: clipping threshold for AmplitudeToDB.
    """
    super().__init__()
    self.mel = MelSpectrogram(sample_rate, n_fft=n_fft)
    self.norm_db = AmplitudeToDB(top_db=top_db)
    # FIX: the original wrote nn.ReLU(0.1), which passes 0.1 positionally
    # as the boolean `inplace` flag (truthy == True), not a slope. The
    # flag is now explicit; if a leaky activation was actually intended,
    # switch these to nn.LeakyReLU(0.1).
    self.conv1 = nn.Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
    self.bn1 = nn.BatchNorm2d(32)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=3)
    self.dropout = nn.Dropout(0.1)
    self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    self.bn2 = nn.BatchNorm2d(64)
    self.relu2 = nn.ReLU(inplace=True)
    self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=3)
    self.dropout2 = nn.Dropout(0.1)
    self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
    self.bn3 = nn.BatchNorm2d(128)
    self.relu3 = nn.ReLU(inplace=True)
    self.maxpool3 = nn.MaxPool2d(kernel_size=3, stride=3)
    self.dropout3 = nn.Dropout(0.1)
    self.lstm = nn.LSTM(12, 128, 2, batch_first=True)
    self.dropout_lstm = nn.Dropout(0.3)
    self.bn_lstm = nn.BatchNorm1d(128)
    self.output = nn.Linear(128, output_class)
def spectrogram(trace: Trace):
    """Convert an ObsPy trace to a min-shifted mel spectrogram array.

    Relies on module-level sampling_rate, image_height, hop_length and
    sequence_length_second. The trace is resampled in place, demeaned,
    tapered, and trimmed/padded to a fixed duration before the transform.
    Returns a numpy array shifted so its minimum is 0 (not scaled to 1 —
    the scaling line is commented out).
    """
    trace.resample(sampling_rate)
    mel_spec = MelSpectrogram(sample_rate=sampling_rate, n_mels=image_height,
                              hop_length=hop_length, power=1,
                              pad_mode='reflect', normalized=True)
    amplitude_to_db = AmplitudeToDB()
    # trace = trace.detrend('linear')
    # trace = trace.detrend('demean')
    trace.data = trace.data - np.mean(trace.data)
    trace = trace.taper(max_length=0.01, max_percentage=0.05)
    # Pad with zeros so every trace covers exactly the target duration.
    trace = trace.trim(starttime=trace.stats.starttime,
                       endtime=trace.stats.starttime + sequence_length_second,
                       pad=True, fill_value=0)
    data = trace.data
    torch_data = torch.tensor(data).type(torch.float32)
    spec = (mel_spec(torch_data))
    # Small epsilon keeps the dB conversion away from log(0).
    spec_db = amplitude_to_db(spec.abs() + 1e-3)
    spec_db = (spec_db - spec_db.min()).numpy()
    # spec_db = (spec_db / spec_db.max()).type(torch.float32)
    return spec_db
def __init__(self, x_shape, sr=44100, n_fft=1024, n_mels=256, win_len=256, hop_len=128):
    """Mel + dB front end that also precomputes the output tensor shape.

    Reference points: original process used sr 11025, n_fft 1024,
    n_mels 256, win_len 256, hop_len 8 (output 256 x 92); librosa defaults
    are sr 22050, n_fft 2048, hop_len 512. Frame spacing in seconds is
    hop_len / sr (~93 ms for music, ~23 ms for speech settings).
    """
    super(ProcessMelSpectrogram, self).__init__()
    self.mel_s = MelSpectrogram(sample_rate=sr, n_fft=n_fft, n_mels=n_mels,
                                win_length=win_len, hop_length=hop_len)
    self.a_to_db = AmplitudeToDB(top_db=80)
    self.x_shape = [-1] + list(x_shape)
    assert len(self.x_shape) in [2, 3, 4]
    num_samples = np.prod(self.x_shape[1:])
    # Ceiling division: one extra frame when samples don't divide evenly.
    spec_width = -(-num_samples // hop_len)
    self.output_shape = [self.x_shape[0], 1, n_mels, spec_width]
def __init__(self, output_class=264, d_size=256, sample_rate=32000, n_fft=2**11, top_db=80):
    """Three conv blocks + LSTM classifier over dB-scaled mel spectrograms.

    Args:
        output_class: number of output classes.
        d_size: unused here; kept for interface compatibility.
        sample_rate, n_fft: MelSpectrogram settings.
        top_db: clipping threshold for AmplitudeToDB.
    """
    super().__init__()
    self.mel = MelSpectrogram(sample_rate, n_fft=n_fft)
    self.norm_db = AmplitudeToDB(top_db=top_db)
    # FIX: nn.ReLU(0.1) passed 0.1 positionally as the boolean `inplace`
    # flag (truthy == True), not a slope — now explicit. If a leaky
    # activation was intended, use nn.LeakyReLU(0.1).
    # Redundant default kwargs (eps, momentum, affine, padding=0, etc.)
    # were dropped; the constructed layers are identical.
    self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
    self.bn1 = nn.BatchNorm2d(32)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=3)
    self.dropout = nn.Dropout(0.1)
    self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    self.bn2 = nn.BatchNorm2d(64)
    self.relu2 = nn.ReLU(inplace=True)
    self.maxpool2 = nn.MaxPool2d(kernel_size=4, stride=4)
    self.dropout2 = nn.Dropout(0.1)
    self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
    self.bn3 = nn.BatchNorm2d(128)
    self.relu3 = nn.ReLU(inplace=True)
    self.maxpool3 = nn.MaxPool2d(kernel_size=4, stride=4)
    self.dropout3 = nn.Dropout(0.1)
    self.lstm = nn.LSTM(4, 128, 2, batch_first=True)
    self.dropout_lstm = nn.Dropout(0.3)
    self.bn_lstm = nn.BatchNorm1d(128)
    self.output = nn.Linear(128, output_class)
def __init__(self):
    """Pair torchaudio and nnAudio mel transforms with a learnable mask."""
    super(MelnnAudio, self).__init__()
    self.ta = MelSpectrogram(sample_rate=8000)
    self.nna = nnASpectrogram.MelSpectrogram(sr=8000, n_fft=400,
                                             device='cpu', norm=None)
    # Learnable multiplicative mask over a fixed (128, 81) mel output.
    mask_shape = [128, 81]
    self.mask = torch.nn.Parameter(torch.ones(mask_shape))
def __init__(self, arch, num_classes=10):
    """Mel-spectrogram front end followed by a backbone selected by name."""
    super(ModelCalled, self).__init__()
    # Together with FolderDataset.duration these settings yield a mel
    # image of shape (128, 128); remember to set f_max=8000.
    self.melspectrogram = MelSpectrogram(sample_rate=16384,
                                         n_fft=2048, hop_length=512,
                                         f_max=8000, n_mels=128)
    self.power2db = AmplitudeToDB(stype='power')
    # Look the architecture class up by name in the Models module namespace.
    self.model = Models.__dict__[arch](num_classes=num_classes)
def __init__(self, settings: AudioTransformSettings = AudioTransformSettings()):
    """Build spectrogram, VTLP-augmented and delta transforms from settings.

    NOTE: the default AudioTransformSettings() instance is evaluated once
    at definition time and shared across calls.
    """
    super().__init__()
    mel_kwargs = dict(n_mels=settings.num_mels,
                      sample_rate=settings.sample_rate,
                      n_fft=settings.num_fft,
                      hop_length=settings.hop_length)
    if settings.use_meyda_spectrogram:
        self.spec_transform = MeydaMelSpectrogram(**mel_kwargs)
    else:
        self.spec_transform = MelSpectrogram(**mel_kwargs)
    self.vtlp_transform = apply_vtlp(MelSpectrogram(**mel_kwargs))
    self.delta_transform = ComputeDeltas()
class RondomStretchMelSpectrogram(nn.Module):
    """Mel spectrogram with random resampling, time-stretch and masking.

    Training mode applies a random speed perturbation, a random time
    stretch and frequency/time masking; eval mode only resamples.
    Output is forced to 157 frames. NOTE(review): zero-padding uses
    torch.cuda.FloatTensor, so CUDA is required; "Rondom" is a typo kept
    for import compatibility.
    """

    def __init__(self, sample_rate, n_fft, top_db, max_perc):
        super().__init__()
        self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft // 2 + 1)
        self.stft = Spectrogram(n_fft=n_fft, power=None)
        self.com_norm = ComplexNorm(power=2.)
        # Masks are applied with fill value 0 in forward().
        self.fm = FrequencyMasking(100)
        self.tm = TimeMasking(100)
        self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
        self.AtoDB = AmplitudeToDB(top_db=top_db)
        self.max_perc = max_perc
        self.sample_rate = sample_rate
        # Pre-built resamplers covering 0.6x .. 1.4x of the input rate.
        self.resamples = [
            Resample(sample_rate, sample_rate * 0.6),
            Resample(sample_rate, sample_rate * 0.7),
            Resample(sample_rate, sample_rate * 0.8),
            Resample(sample_rate, sample_rate * 0.9),
            Resample(sample_rate, sample_rate * 1),
            Resample(sample_rate, sample_rate * 1.1),
            Resample(sample_rate, sample_rate * 1.2),
            Resample(sample_rate, sample_rate * 1.3),
            Resample(sample_rate, sample_rate * 1.4)
        ]

    def forward(self, x, train):
        # Random speed perturbation is applied in both train and eval mode.
        x = random.choice(self.resamples)(x)
        x = self.stft(x)
        if train:
            # Fresh Uniform per call; rate in [1 - max_perc, 1 + max_perc].
            dist = Uniform(1. - self.max_perc, 1 + self.max_perc)
            x = self.time_stretch(x, dist.sample().item())
            x = self.com_norm(x)
            x = self.fm(x, 0)
            x = self.tm(x, 0)
        else:
            x = self.com_norm(x)
        # Reuse only the mel filter bank of the MelSpectrogram module.
        x = self.mel_specgram.mel_scale(x)
        x = self.AtoDB(x)
        size = torch.tensor(x.size())
        # Force a fixed width of 157 frames: crop or zero-pad on the right.
        if size[3] > 157:
            x = x[:, :, :, 0:157]
        else:
            x = torch.cat([
                x,
                torch.cuda.FloatTensor(size[0], size[1], size[2],
                                       157 - size[3]).fill_(0)
            ], dim=3)
        return x
class MelspectrogramStretch(object):
    """Compute a per-sample-normalised mel spectrogram via STFT + mel scale.

    Uses fixed 44.1 kHz settings. forward() returns a numpy array and a
    one-element list with the number of frames.
    """

    def __init__(self):
        sample_rate = 44100
        num_mels = 128
        fft_length = 2048
        hop_length = fft_length // 2
        self.stft = Spectrogram(n_fft=fft_length, win_length=fft_length,
                                hop_length=None, pad=0, power=None,
                                normalized=False)
        # NOTE(review): the stft above uses the default hop while the mel
        # module below is configured with fft_length // 2 — only its
        # mel_scale is reused in forward(), so the hop here is inert.
        self.mst = MelSpectrogram(sample_rate=sample_rate, n_fft=fft_length,
                                  hop_length=hop_length, n_mels=num_mels)
        # Normalization (post spec processing)
        self.complex_norm = ComplexNorm(power=2.)

    def forward(self, data):
        # Decode/augment raw input via the project's AudioTransforms helper.
        tsf = AudioTransforms()
        sig_t, sr, _ = tsf.apply(data, None)
        length = torch.tensor(sig_t.size(0))
        sr = torch.tensor(sr)
        data = [d.unsqueeze(0).to("cpu") for d in [sig_t, length, sr]]
        # x-> (batch, time, channel)
        x, lengths, _ = data  # unpacking seqs, lengths and srs
        # x-> (batch, channel, time)
        xt = x.float().transpose(1, 2)
        # xt -> (batch, channel, freq, time)
        x = self.stft(xt)
        # x -> (fft_length//2+1,bins,channel)
        #print(x.shape) #torch.Size([1, 1, 1025, 173, 2])
        x = self.complex_norm(x)
        #print(x.shape) #torch.Size([1, 1, 1025, 173])
        x = self.mst.mel_scale(x)
        #print(x.shape) #torch.Size([1, 1, 128, 173])
        # Normalize melspectrogram
        # Independent mean, std per batch
        non_batch_inds = [1, 2, 3]
        mean = x.mean(non_batch_inds, keepdim=True)
        std = x.std(non_batch_inds, keepdim=True)
        x = (x - mean) / std
        x = x.to('cpu').detach().numpy().copy()
        lengths = [x.shape[3]]
        return x, lengths
def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels):
    """Hold a MelSpectrogram configured to match TacotronSTFT mel bounds."""
    mel_config = dict(
        sample_rate=sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0.0,    # TacotronSTFT.mel_fmin == PyTorch default
        f_max=8000.0,  # TacotronSTFT.mel_fmax; PyTorch default is None
    )
    self._factory = MelSpectrogram(**mel_config)
def __init__(self, sample_rate, n_fft, hop_length, n_mels, top_db=None):
    """Mel spectrogram followed by amplitude-to-dB conversion."""
    super().__init__()
    mel_config = dict(sample_rate=sample_rate, n_fft=n_fft,
                      hop_length=hop_length, n_mels=n_mels)
    self.mel_spectrogram = MelSpectrogram(**mel_config)
    # top_db=None means no clipping of the dB range.
    self.amplitude_to_db = AmplitudeToDB(top_db=top_db)
def __init__(self):
    """Mel front end followed by a stack of seven SampleBlocks."""
    super(SampleMel, self).__init__()
    # Attribute is named `mfcc` but actually holds a mel spectrogram.
    self.mfcc = MelSpectrogram(sample_rate=8000)
    self.sb1 = SampleBlock(in_unit=1, stride=3)
    # sb2..sb7 are identical default blocks; setattr registers them as
    # submodules exactly like direct assignment would.
    for idx in range(2, 8):
        setattr(self, 'sb%d' % idx, SampleBlock())
def extract_audio_features_v2_chunks(filename):
    """Return log-mel chunks for `filename`, wrapped in a one-element tuple.

    Relies on module-level SAMPLE_RATE, WINDOW_SIZE, WINDOW_STRIDE,
    N_MELS and MINUTE_LENGTH.
    NOTE(review): the trailing comma on the return makes this return a
    1-tuple containing the tensor — confirm callers expect a tuple rather
    than the bare tensor before changing it.
    """
    import torch
    # Select channel 2 and resample via sox effects.
    effects = [['remix', '2'], ['rate', str(SAMPLE_RATE)]]
    waveform, _ = torchaudio.sox_effects.apply_effects_file(filename, effects)
    ms = MelSpectrogram(sample_rate=SAMPLE_RATE, n_fft=WINDOW_SIZE,
                        hop_length=WINDOW_STRIDE,
                        n_mels=N_MELS)(waveform)[0]
    # Replace exact zeros before the log below — presumably to avoid
    # log(0) == -inf, though 1e6 maps them to a large positive value.
    ms[ms == 0] = 1e6
    return torch.tensor(
        split_equal_chunks(torch.log(ms).tolist(), MINUTE_LENGTH)),
def __init__(
    self,
):
    """Minimal module: a default MelSpectrogram plus a 12x12 linear layer."""
    super().__init__()
    self.ms = MelSpectrogram()
    # Local import kept from the original code.
    from torch.nn import Linear
    self.l = Linear(12, 12)
def __init__(self, alpha=0.5):
    """SampleBlock stack plus a WaveNet vocoder head.

    alpha is stored for later use (a weighting factor; its exact role is
    defined where it is consumed).
    """
    super(DeepSupervisedMel, self).__init__()
    self.mel = MelSpectrogram(sample_rate=8000)
    self.alpha = alpha
    self.sb1 = SampleBlock(in_unit=1, stride=3)
    # sb2..sb7 are identical default blocks, registered in order.
    for idx in range(2, 8):
        setattr(self, 'sb%d' % idx, SampleBlock())
    self.wn = WavenetVocoder()
def __init__(self):
    """STFT, mel and complex-norm transforms with fixed 44.1 kHz settings.

    NOTE(review): no super().__init__() call — fine for a plain object,
    not if this class subclasses nn.Module.
    """
    sample_rate = 44100
    num_mels = 128
    fft_length = 2048
    self.stft = Spectrogram(n_fft=fft_length, win_length=fft_length,
                            hop_length=None, pad=0, power=None,
                            normalized=False)
    self.mst = MelSpectrogram(sample_rate=sample_rate, n_fft=fft_length,
                              hop_length=fft_length // 2, n_mels=num_mels)
    # Normalization (post spec processing)
    self.complex_norm = ComplexNorm(power=2.)
def create_mel_tensors(path):
    """Compute and save a mel-spectrogram tensor for every .wav under `path`.

    Each file's tensor is saved to ./data/<first-char-of-name>/<name>.pt,
    using the file's own sample rate for the transform.
    """
    wav_files = [f for f in os.listdir(path) if '.wav' in f]
    direct = './data/'
    for aud in wav_files:
        pat = aud[0]
        path2 = direct + pat.replace('_', '') + '/'
        waveform, sample_rate = torchaudio.load(os.path.join(path, aud))
        # FIX: makedirs(exist_ok=True) replaces the check-then-mkdir pair,
        # avoiding the TOCTOU race and also creating ./data/ itself when
        # missing (os.mkdir would raise FileNotFoundError).
        os.makedirs(path2, exist_ok=True)
        mel_spec = MelSpectrogram(sample_rate)(waveform)
        torch.save(mel_spec, path2 + aud.replace('.wav', '.pt'))
def __init__(self, sample_rate: int, mel_size: int, n_fft: int, win_length: int,
             hop_length: int, min_db: float, max_db: float,
             mel_min: float = 0., mel_max: float = None):
    """Mel front end that also stores dB clipping bounds in natural-log scale."""
    super().__init__()
    self.mel_size = mel_size
    # Convert dB thresholds to natural log: ln(10 ** (db / 10)).
    self.min_db = np.log(np.power(10, min_db / 10))
    self.max_db = np.log(np.power(10, max_db / 10))
    mel_config = dict(sample_rate=sample_rate, n_fft=n_fft,
                      win_length=win_length, hop_length=hop_length,
                      f_min=mel_min, f_max=mel_max, n_mels=mel_size,
                      window_fn=torch.hann_window)
    self.melfunc = MelSpectrogram(**mel_config)