# NOTE(review): whitespace-mangled fragment of an augmentation script — it begins
# and ends mid-statement (the opening of the first wavfile.write call and the
# closing arguments of the last one are outside this chunk), so the code below is
# left byte-identical. Recover the original formatting before editing this logic.
# What is visible: three Compose pipelines (AddGaussianNoise, TimeStretch,
# PitchShift), each applied 5 times to `samples` and written out as numbered WAVs.
rate=SAMPLE_RATE, data=augmented_samples) # AddGaussianNoise augmenter = Compose( [AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=1.0)]) for i in range(5): output_file_path = os.path.join( output_dir, "AddGaussianNoise_{:03d}.wav".format(i)) augmented_samples = augmenter(samples=samples, sample_rate=SAMPLE_RATE) wavfile.write(output_file_path, rate=SAMPLE_RATE, data=augmented_samples) # TimeStretch augmenter = Compose([TimeStretch(min_rate=0.8, max_rate=1.25, p=1.0)]) for i in range(5): output_file_path = os.path.join(output_dir, "TimeStretch_{:03d}.wav".format(i)) augmented_samples = augmenter(samples=samples, sample_rate=SAMPLE_RATE) wavfile.write(output_file_path, rate=SAMPLE_RATE, data=augmented_samples) # PitchShift augmenter = Compose([PitchShift(min_semitones=-4, max_semitones=4, p=1.0)]) for i in range(5): output_file_path = os.path.join(output_dir, "PitchShift_{:03d}.wav".format(i)) augmented_samples = augmenter(samples=samples, sample_rate=SAMPLE_RATE) wavfile.write(output_file_path,
# NOTE(review): whitespace-mangled fragment — it begins inside a list of
# transform configs (each entry pairing an audiomentations instance with a
# "num_runs" count and optional "name") and ends inside the per-file loop, so
# the code below is left byte-identical. Recover the original formatting before
# editing. The visible loop loads each sound file unresampled (sample_rate=None,
# mono=False) and transposes 2-D sample arrays so channels come first —
# presumably to match audiomentations' (channels, samples) layout; confirm
# against load_sound_file's return convention.
"num_runs": 5 }, { "instance": Shift(min_fraction=-0.5, max_fraction=0.5, rollover=False, p=1.0), "num_runs": 5, "name": "ShiftWithoutRollover", }, { "instance": TimeMask(p=1.0), "num_runs": 5 }, { "instance": TimeStretch(min_rate=0.8, max_rate=1.25, p=1.0), "num_runs": 5 }, { "instance": Trim(p=1.0), "num_runs": 1 }, ] for sound_file_path in sound_file_paths: samples, sample_rate = load_sound_file(sound_file_path, sample_rate=None, mono=False) if len(samples.shape) == 2 and samples.shape[0] > samples.shape[1]: samples = samples.transpose()
from python_speech_features import mfcc
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import os
import random
from tqdm import tqdm
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

# Module-level augmentation pipeline: each transform fires independently with
# probability 0.5 on every call.
augmenter = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
])


def load_noise(path='/home/CAIL/Speaker_R/data/voice/background_noise/'):
    """Load every WAV file found directly under *path* (non-recursive).

    Args:
        path: Directory to scan for ``.wav`` files.

    Returns:
        A list of sample arrays, one per WAV file; the per-file sample
        rates returned by ``wav.read`` are discarded.
    """
    noise = []
    for name in os.listdir(path):
        # Fix: match the .wav extension explicitly. The original substring
        # test ("'wav' not in filename") also accepted any file whose name
        # merely contained "wav" (e.g. "wavelet.txt").
        if not name.endswith('.wav'):
            continue
        _rate, sig = wav.read(os.path.join(path, name))
        noise.append(sig)
    return noise
def applyTransformations(fileName, output_dir, auxiliarSoundsDir):
    """Apply each supported audiomentations transform to one WAV file and
    write the augmented variants into *output_dir*.

    Args:
        fileName: Path of the input WAV file.
        output_dir: Directory the augmented files are written into, named
            ``<basename>_<Transform>_<index>.wav``.
        auxiliarSoundsDir: Directory containing the ``helperSounds`` assets
            (impulse responses, background noises, short noises).
    """
    # Base name of the input file without directory or extension.
    name = fileName.split(".")[0].split("/")[-1]
    samples = load_wav_file(fileName)

    def _run(augmenter, tag, num_runs):
        # Apply *augmenter* num_runs times, writing one output file per run.
        for i in range(num_runs):
            output_file_path = os.path.join(
                output_dir, "{}_{}_{:03d}.wav".format(name, tag, i))
            augmented_samples = augmenter(samples=samples,
                                          sample_rate=SAMPLE_RATE)
            wavfile.write(output_file_path,
                          rate=SAMPLE_RATE,
                          data=augmented_samples)

    # Transforms with a single deterministic outcome run once; stochastic
    # ones run 5 times to sample their parameter ranges.
    _run(
        Compose([
            AddImpulseResponse(p=1.0,
                               ir_path=os.path.join(auxiliarSoundsDir,
                                                    "helperSounds/ir"))
        ]), "AddImpulseResponse", 1)
    _run(Compose([FrequencyMask(p=1.0)]), "FrequencyMask", 5)
    _run(Compose([TimeMask(p=1.0)]), "TimeMask", 5)
    _run(Compose([AddGaussianSNR(p=1.0)]), "AddGaussianSNR", 5)
    _run(
        Compose([
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=1.0)
        ]), "AddGaussianNoise", 5)
    _run(Compose([TimeStretch(min_rate=0.8, max_rate=1.25, p=1.0)]),
         "TimeStretch", 5)
    # Fix: the original wrote these files as "<name>_itchShift_NNN.wav"
    # (format-string typo dropped the leading "P").
    _run(Compose([PitchShift(min_semitones=-4, max_semitones=4, p=1.0)]),
         "PitchShift", 5)
    _run(Compose([Shift(min_fraction=-0.5, max_fraction=0.5, p=1.0)]),
         "Shift", 5)
    _run(
        Compose([
            Shift(min_fraction=-0.5, max_fraction=0.5, rollover=False, p=1.0)
        ]), "ShiftWithoutRollover", 5)
    _run(Compose([Normalize(p=1.0)]), "Normalize", 1)
    _run(Compose([ClippingDistortion(p=1.0)]), "ClippingDistortion", 5)
    _run(
        Compose([
            AddBackgroundNoise(sounds_path=os.path.join(
                auxiliarSoundsDir, "helperSounds/background_noises"),
                               p=1.0)
        ]), "AddBackgroundNoise", 5)
    _run(
        Compose([
            AddShortNoises(
                sounds_path=os.path.join(auxiliarSoundsDir,
                                         "helperSounds/short_noises"),
                min_snr_in_db=0,
                max_snr_in_db=8,
                min_time_between_sounds=2.0,
                max_time_between_sounds=4.0,
                burst_probability=0.4,
                min_pause_factor_during_burst=0.01,
                max_pause_factor_during_burst=0.95,
                min_fade_in_time=0.005,
                max_fade_in_time=0.08,
                min_fade_out_time=0.01,
                max_fade_out_time=0.1,
                p=1.0,
            )
        ]), "AddShortNoises", 5)
def __getitem__(self, idx):
    """Return the log-spectrograms of the (anchor, positive, negative)
    triplet stored at row *idx* of ``self.data``.

    Each of the three signals is independently augmented with probability
    1/2 (a time-stretch and a non-rollover shift, each firing with p=0.5)
    before its spectrogram is computed. The loaded signals, sample rates,
    spectrograms and log-spectrograms are also stored on ``self``.
    """
    augment = Compose([
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5, rollover=False)
    ])

    # Draw the three augmentation coin flips up front, in the same order
    # the signals are processed below.
    flip_anchor = random.randint(0, 1)
    flip_positive = random.randint(0, 1)
    flip_negative = random.randint(0, 1)

    self.anchor = str(self.data.iloc[idx, 0])
    self.positive = self.data.iloc[idx, 1]
    self.negative = self.data.iloc[idx, 2]

    self.signalAnchor, self.srAnchor = torchaudio.load(self.anchor)
    self.signalPositive, self.srPositive = torchaudio.load(self.positive)
    self.signalNegative, self.srNegative = torchaudio.load(self.negative)

    def _augmented(signal, sample_rate):
        # Round-trip through numpy: audiomentations operates on arrays.
        return torch.from_numpy(
            augment(samples=signal.numpy(), sample_rate=sample_rate))

    if flip_anchor == 1:
        self.signalAnchor = _augmented(self.signalAnchor, self.srAnchor)
    if flip_positive == 1:
        self.signalPositive = _augmented(self.signalPositive,
                                         self.srPositive)
    if flip_negative == 1:
        self.signalNegative = _augmented(self.signalNegative,
                                         self.srNegative)

    def _spectrograms(signal):
        # 320-sample window with 50% overlap (hop 160) — presumably 20 ms
        # frames at a 16 kHz sample rate; confirm against the dataset.
        spec = torchaudio.transforms.Spectrogram(
            n_fft=320, hop_length=160, win_length=320)(signal)
        return spec, torchaudio.transforms.AmplitudeToDB()(spec)

    self.spectogramAnchor, self.logSpectogramAnchor = _spectrograms(
        self.signalAnchor)
    self.spectogramPositive, self.logSpectogramPositive = _spectrograms(
        self.signalPositive)
    self.spectogramNegative, self.logSpectogramNegative = _spectrograms(
        self.signalNegative)

    return (self.logSpectogramAnchor, self.logSpectogramPositive,
            self.logSpectogramNegative)