def __call__(self, x, sr):
    """Randomly apply reverb, additive noise and pitch shift to a waveform.

    Three independent Bernoulli draws (each with probability ``self.p``)
    decide which effects are enabled. Enabled effects run in the fixed
    order reverb -> noise -> pitch.

    :param x: waveform handed to ``augment.EffectChain.apply``
        (presumably a torch tensor, since ``torch.zeros_like`` is called on it).
    :param sr: sampling rate of ``x``; passed as the source rate of every
        chain and as the target of the ``rate`` stage after the pitch shift.
    :return: tuple ``(x, sr)`` where ``x`` is the possibly augmented waveform.
    """
    # One coin flip per effect.
    use_reverb, use_noise, use_pitch = np.random.binomial(1, p=self.p, size=3)

    if use_reverb:
        # A single random value is reused for all three reverb parameters.
        strength = np.random.randint(0, self.reverb)
        reverb_chain = augment.EffectChain().reverb(strength, strength, strength)
        x = reverb_chain.channels(2).apply(x, src_info={'rate': sr})

    if use_noise:
        def uniform_noise():
            # Fresh uniform noise with the same shape as the current signal.
            return torch.zeros_like(x).uniform_()

        noise_chain = augment.EffectChain().additive_noise(
            uniform_noise, snr=self.snr)
        x = noise_chain.apply(x, src_info={'rate': sr})

    if use_pitch:
        # NOTE: the original author observed that the pitch effect can
        # change the shape of the output; the cause is not understood.
        shift = np.random.randint(-self.pitch, self.pitch)
        pitch_chain = augment.EffectChain().pitch(shift).rate(sr)
        x = pitch_chain.apply(x, src_info={'rate': sr})

    return x, sr
def test_additive_noise():
    """Using the signal itself as the "noise" must leave the signal unchanged.

    The noise generator returns the very tensor that is being augmented, and
    the test asserts that the chain output is numerically close to the input.
    """
    # The sampling rate returned by the loader is not needed here.
    x, _ = torchaudio.load(test_wav)

    src_info = {
        'channels': 1,
        'length': x.size(1),
        'precision': 32,
        'rate': 16000.0,
        'bits_per_sample': 32,
    }
    target_info = {
        'channels': 1,
        'length': 0,
        'precision': 32,
        'rate': 16000.0,
        'bits_per_sample': 32,
    }

    chain = augment.EffectChain().additive_noise(
        noise_generator=lambda: x, snr=10.0)
    y = chain.apply(x, src_info=src_info, target_info=target_info)

    assert torch.allclose(x, y)
def test_non_empty_chain():
    """A band-reject chain must keep the tensor size but alter the samples."""
    x, _ = torchaudio.load(test_wav)

    src_info = dict(
        channels=1,
        length=x.size(1),
        precision=32,
        rate=16000.0,
        bits_per_sample=32,
    )
    target_info = dict(
        channels=1,
        length=0,
        precision=32,
        rate=16000.0,
        bits_per_sample=32,
    )

    chain = augment.EffectChain().bandreject(1, 20000)
    y = chain.apply(x, src_info=src_info, target_info=target_info)

    # Same size as the input ...
    assert x.size() == y.size(), f'{y.size()}'
    # ... but not the same content.
    assert not x.allclose(y)
def augmentation_factory(description, sampling_rate, args):
    """Assemble an ``augment.EffectChain`` from a comma-separated effect list.

    :param description: comma-separated effect names. Recognised names are
        ``bandreject``, ``pitch``, ``reverb``, ``time_drop``, ``clip`` and
        ``none``; effects are appended in the order given.
    :param sampling_rate: sampling rate passed to the band-reject parameter
        generator and to the ``rate`` stage that follows ``pitch``.
    :param args: object exposing the per-effect settings read below
        (``band_scaler``, ``pitch_shift_max``, ``pitch_quick``,
        ``reverberance_min``/``max``, ``damping_min``/``max``,
        ``room_scale_min``/``max``, ``t_ms``, ``clip_min``/``max``).
    :return: the assembled effect chain.
    :raises RuntimeError: if an effect name is not recognised.
    """
    chain = augment.EffectChain()

    for effect in description.split(','):
        if effect == 'none':
            # Explicit placeholder: contributes nothing to the chain.
            continue

        if effect == 'bandreject':
            band = SpecAugmentBand(sampling_rate, args.band_scaler)
            chain = chain.sinc('-a', '120', band)
        elif effect == 'pitch':
            shift = RandomPitchShift(args.pitch_shift_max)
            # When requested, the same '-q' flag goes to both pitch and rate.
            quick = ('-q',) if args.pitch_quick else ()
            chain = chain.pitch(*quick, shift).rate(*quick, sampling_rate)
        elif effect == 'reverb':
            reverb_params = RandomReverb(
                args.reverberance_min, args.reverberance_max,
                args.damping_min, args.damping_max,
                args.room_scale_min, args.room_scale_max)
            chain = chain.reverb(reverb_params).channels()
        elif effect == 'time_drop':
            # `t_ms` is divided by 1000 to obtain the `max_seconds` argument.
            chain = chain.time_dropout(max_seconds=args.t_ms / 1000.0)
        elif effect == 'clip':
            clip_factor = RandomClipFactor(args.clip_min, args.clip_max)
            chain = chain.clip(clip_factor)
        else:
            raise RuntimeError(f'Unknown augmentation type {effect}')

    return chain
def test_stochastic_pitch():
    """A pitch effect driven by a random callable must change the signal."""
    x, sr = torchaudio.load(test_wav)
    assert sr == 16000

    src_info = dict(
        channels=x.size(0),
        length=x.size(1),
        precision=32,
        rate=16000.0,
        bits_per_sample=32,
    )
    target_info = dict(
        channels=1,
        length=0,
        precision=32,
        rate=16000.0,
        bits_per_sample=32,
    )

    def draw_shift():
        # Called by the chain to obtain the pitch parameter.
        return np.random.randint(100, 500)

    chain = augment.EffectChain().pitch(draw_shift).rate(16000)
    y = chain.apply(x, src_info=src_info, target_info=target_info)

    assert not torch.allclose(x, y, rtol=1e-3, atol=1e-3)
def reverb(*args, **kwargs):
    """
    Build a reverberation effect chain for wav augmentation.

    Positional and keyword arguments are accepted but not used.

    :return: an ``augment.EffectChain`` with reverb followed by ``channels``.
    """
    import augment

    chain = augment.EffectChain()
    # Reverb produces a two-channel signal; calling `channels` without
    # parameters combines the two channels back into one.
    reverb_stage = chain.reverb(50, 50, _random_room_size)
    reverb_stage.channels()
    return chain
def test_bandreject():
    """The ``sinc`` chain must reproduce the output of the sox command line."""
    sinc_args = ("-a", "120", "2000-1000")

    expected, _ = run_sox_command(test_wav, ["sinc", *sinc_args])
    effects = augment.EffectChain().sinc(*sinc_args)
    actual = apply_chain(test_wav, effects)

    assert expected.size() == actual.size()
    # NB: the tolerance is loosened because of the discretization that
    # happens when the audio is saved and loaded again.
    assert torch.allclose(expected, actual, rtol=1e-4, atol=1e-4)
def test_reverb():
    """The reverb chain (followed by ``channels``) must match the sox output."""
    expected, _ = run_sox_command(test_wav, ["reverb", "50", "50", "100"])

    effects = augment.EffectChain().reverb(50, 50, 100)
    effects = effects.channels()
    actual = apply_chain(test_wav, effects)

    assert expected.size() == actual.size()
    # NB: the tolerance is loosened because of the discretization that
    # happens when the audio is saved and loaded again.
    assert torch.allclose(expected, actual, rtol=1e-4, atol=1e-4)
def test_pitch():
    """The pitch chain (followed by ``rate``) must match the sox output."""
    expected, _ = run_sox_command(test_wav, ["pitch", "-100"])

    effects = augment.EffectChain().pitch(-100)
    effects = effects.rate(16000)
    actual = apply_chain(test_wav, effects)

    assert expected.size() == actual.size()
    # NB: the tolerance is loosened because of the discretization that
    # happens when the audio is saved and loaded again.
    assert torch.allclose(expected, actual, rtol=1e-4, atol=1e-4)
def pitch(sampling_rate: int):
    """
    Build a pitch-modification effect chain for wav augmentation.

    :param sampling_rate: the sampling rate the chain resamples to after the
        pitch stage (resampling is needed for pitch).
    :return: an ``augment.EffectChain`` with ``pitch`` followed by ``rate``.
    """
    import augment

    chain = augment.EffectChain()
    # Pitch alters the sampling ratio, so a `rate` stage follows to
    # compensate. Both stages get the 'quick' ("-q") option to speed
    # things up.
    pitch_stage = chain.pitch("-q", _random_pitch_shift)
    pitch_stage.rate("-q", sampling_rate)
    return chain
def test_empty_chain():
    """An effect chain without any effects must return the input unchanged."""
    x = torch.arange(0, 8000).float()

    src_info = dict(
        channels=1,
        length=x.size(0),
        precision=32,
        rate=16000.0,
        bits_per_sample=32,
    )
    target_info = dict(
        channels=1,
        length=0,
        precision=32,
        rate=16000.0,
        bits_per_sample=32,
    )

    empty_chain = augment.EffectChain()
    y = empty_chain.apply(x, src_info=src_info, target_info=target_info)

    # Compare flattened views so only the sample values matter.
    flat_in = x.view(-1)
    flat_out = y.view(-1)
    assert flat_in.allclose(flat_out)
def forward(self, audio):
    """Apply reverb with freshly sampled parameters to ``audio``.

    Reverberance, damping factor and room size are each drawn with
    ``torch.randint`` from the corresponding ``*_min`` / ``*_max``
    attributes, in that order. The chain ends with ``channels(1)``.

    :param audio: waveform passed to ``augment.EffectChain.apply``.
    :return: the output of the effect chain.
    """
    def draw(low, high):
        # One integer sample, unwrapped to a plain Python number.
        return torch.randint(low, high, size=(1, )).item()

    reverberance = draw(self.reverberance_min, self.reverberance_max)
    damping = draw(self.dumping_factor_min, self.dumping_factor_max)
    room_size = draw(self.room_size_min, self.room_size_max)

    chain = augment.EffectChain().reverb(reverberance, damping, room_size)
    chain = chain.channels(1)

    return chain.apply(
        audio, src_info=self.src_info, target_info=self.target_info)
def pitch_reverb_tdrop(sampling_rate: int):
    """
    Build an effect chain of pitch modification, reverberation and time
    dropout, as proposed in:

    * https://github.com/facebookresearch/WavAugment/blob/master/examples/python/librispeech_selfsupervised.py#L152
    * https://arxiv.org/abs/2007.00991

    :param sampling_rate: the sampling rate the chain resamples to after the
        pitch stage (resampling is needed for pitch).
    :return: the assembled ``augment.EffectChain``.
    """
    import augment

    chain = augment.EffectChain()

    # Stage 1: pitch. It alters the sampling ratio, so a `rate` stage follows
    # to compensate; both get the 'quick' ("-q") option to speed things up.
    pitch_stage = chain.pitch("-q", _random_pitch_shift)
    pitch_stage.rate("-q", sampling_rate)

    # Stage 2: reverb. It produces a two-channel signal, which `channels`
    # without parameters combines back into one.
    reverb_stage = chain.reverb(50, 50, _random_room_size)
    reverb_stage.channels()

    # Stage 3: randomly drop one subsequence of up to 50 ms.
    chain.time_dropout(max_seconds=50 / 1000)

    return chain
def convert_pitch_augment(test_wav):
    """Load a wav file and run it through a fixed ``pitch(100)`` chain.

    :param test_wav: path handed to ``torchaudio.load``; the file must be
        sampled at 16 kHz (checked with an ``assert``).
    :return: tuple ``(y, sr)`` with the augmented signal and the loaded
        sampling rate.
    """
    x, sr = torchaudio.load(test_wav)
    assert sr == 16000

    src_info = dict(
        channels=x.size(0),
        length=x.size(1),
        precision=32,
        rate=16000.0,
        bits_per_sample=32,
    )
    target_info = dict(
        channels=1,
        length=0,
        precision=32,
        rate=16000.0,
        bits_per_sample=32,
    )

    # `rate` follows `pitch` to bring the signal back to 16 kHz.
    chain = augment.EffectChain().pitch(100).rate(16000)
    y = chain.apply(x, src_info=src_info, target_info=target_info)
    return y, sr
def __call__(self, audio):
    """Pitch-shift ``audio`` by a random amount, keeping its original length.

    The shift is drawn with ``random.randint`` from
    ``[self.pitch_cents_min, self.pitch_cents_max]`` and applied with a
    ``pitch`` + ``rate`` effect chain. The result is then cropped or
    zero-padded along dimension 1 so it has as many samples as ``audio``.

    :param audio: 2-D waveform tensor; dimension 1 is treated as time
        (presumably shaped ``(channels, samples)`` -- confirm against caller).
    :return: the augmented tensor with ``audio.shape[1]`` samples, or a clone
        of ``audio`` if the effect chain produced NaN/Inf values.
    """
    pitch_cents = random.randint(self.pitch_cents_min, self.pitch_cents_max)
    effect_chain = augment.EffectChain().pitch(pitch_cents).rate(self.sample_rate)
    y = effect_chain.apply(
        audio, src_info=self.src_info, target_info=self.target_info
    )

    # sox might misbehave sometimes by giving nan/inf if sequences are too
    # short (or silent) and the effect chain includes e.g. `pitch`; fall
    # back to the unmodified input in that case.
    if torch.isnan(y).any() or torch.isinf(y).any():
        return audio.clone()

    target_len = audio.shape[1]
    produced_len = y.shape[1]

    if produced_len > target_len:
        # Crop the surplus samples. The slice (not a plain index) is what
        # keeps the result 2-D; indexing with `target_len` would instead
        # select a single column.
        y = y[:, :target_len]
    elif produced_len < target_len:
        # Zero-pad at the end, matching y's channel count, dtype and device.
        padded = y.new_zeros((y.shape[0], target_len))
        padded[:, :produced_len] = y
        y = padded

    return y
def __init__(
    self,
    manifest_path,
    sample_rate,
    max_sample_size=None,
    min_sample_size=None,
    shuffle=True,
    min_length=0,
    pad=False,
    normalize=False,
):
    """Set up the dataset and its two augmentation stages.

    All arguments are forwarded unchanged to the parent constructor. Two
    attributes are then created:

    * ``self.pre_transform`` -- a ``Compose`` of a frequency mask and a
      time mask;
    * ``self.post_transform`` -- an ``augment.EffectChain`` of reverb,
      ``channels(1)`` and clip.
    """
    super(AugmentedFileAudioDataset, self).__init__(
        manifest_path=manifest_path,
        sample_rate=sample_rate,
        max_sample_size=max_sample_size,
        min_sample_size=min_sample_size,
        shuffle=shuffle,
        min_length=min_length,
        pad=pad,
        normalize=normalize,
    )

    # Only the two masks are active. AddGaussianNoise, PitchShift and
    # ClippingDistortion were configured here before and are disabled.
    frequency_mask = FrequencyMask(
        min_frequency_band=0.0, max_frequency_band=0.05, p=0.5)
    time_mask = TimeMask(min_band_part=0.0, max_band_part=0.05, p=0.5)
    self.pre_transform = Compose([frequency_mask, time_mask])

    reverb_params = RandomReverb()
    clip_params = RandomClip()
    # Still instantiated although the time-dropout stage of the chain is
    # currently disabled.
    time_dropout_params = RandomTimeDropout()

    chain = augment.EffectChain().reverb(reverb_params)
    chain = chain.channels(1)
    self.post_transform = chain.clip(clip_params)
"train-clean-360", "train-other-500"], default='dev-clean', help='Librispeech subset to use') parser.add_argument('--sequence_length_seconds', type=int, default=1, help='Sample sequence length') parser.add_argument('--batch_size', type=int, default=32, help="Batch size") parser.add_argument('--n_workers', type=int, default=8, help="Number of parallel workers to read/preprocess data") parser.add_argument('--n_epochs', type=int, default=3, help="Number of epochs to run") parser.add_argument('--dump', action="store_true", help="Dump examples of (non)augmented sequences." "They would be saved in 'original.wav' and 'augmented.wav'") args = parser.parse_args() return args if __name__ == '__main__': args = get_args() effect_chain_past = augment.EffectChain() # The pitch effect changes the sampling ratio; we have to compensate for that. # Here, we specify 'quick' options on both pitch and rate effects, to speed up things effect_chain_past.pitch("-q", random_pitch_shift).rate("-q", 16_000) # Next effect we add is `reverb`; it adds makes the signal to have two channels, # which we combine into 1 by running `channels` w/o parameters effect_chain_past.reverb(50, 50, random_room_size).channels() # Futher, we add an effect that randomly drops one 50ms subsequence effect_chain_past.time_dropout(max_seconds=50 / 1000) effect_chain_past_runner = ChainRunner(effect_chain_past) # the second, `future` copy would be non-augmented effect_chain_future = None effect_chain_future_runner = None