def test_normalize():
    from speechbrain.processing.signal_processing import compute_amplitude
    from speechbrain.processing.signal_processing import rescale
    import random
    import numpy as np
    import torch  # needed for the test tensors below

    for scale in ["dB", "linear"]:
        for amp_type in ["peak", "avg"]:
            for test_vec in [
                torch.zeros((100)),
                torch.rand((10, 100)),
                torch.rand((10, 100, 5)),
            ]:
                lengths = (
                    test_vec.size(1)
                    if len(test_vec.shape) > 1
                    else test_vec.size(0)
                )
                amp = compute_amplitude(test_vec, lengths, amp_type, scale)
                scaled_back = rescale(
                    random.random() * test_vec,
                    lengths,
                    amp,
                    amp_type,
                    scale,
                )
                np.testing.assert_array_almost_equal(
                    scaled_back.numpy(), test_vec.numpy()
                )
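# A minimal round-trip sketch of the property the test above exercises:
# compute_amplitude() measures a signal's level, and rescale() brings an
# arbitrarily scaled copy back to that level. Both functions exist in
# speechbrain.processing.signal_processing; the tensor shape below (a batch
# of one, 16000 samples) is an illustrative assumption.
import torch
from speechbrain.processing.signal_processing import compute_amplitude, rescale

signal = torch.rand(1, 16000)
amp = compute_amplitude(signal, signal.size(1), amp_type="peak", scale="dB")
restored = rescale(
    0.5 * signal, signal.size(1), amp, amp_type="peak", scale="dB"
)
# restored should match signal up to numerical precision.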
def audio_pipeline(
    mix_wav,
):  # mix_wav is a dummy --> one epoch stays the same length as without dynamic mixing
    """This audio pipeline defines the compute graph for dynamic mixing."""

    speakers = np.random.choice(
        spk_list, hparams["num_spks"], replace=False, p=spk_weights
    )

    if "wham" in Path(hparams["data_folder"]).stem:
        noise_file = np.random.choice(noise_files, 1, replace=False)

        noise, fs_read = torchaudio.load(noise_file[0])
        noise = noise.squeeze()
        # gain = np.clip(random.normalvariate(1, 10), -4, 15)
        # noise = rescale(noise, torch.tensor(len(noise)), gain, scale="dB").squeeze()

    # select two speakers randomly
    sources = []
    first_lvl = None

    spk_files = [
        np.random.choice(spk_hashtable[spk], 1, False)[0] for spk in speakers
    ]

    minlen = min(
        *[torchaudio.info(x).num_frames for x in spk_files],
        hparams["training_signal_len"],
    )

    for i, spk_file in enumerate(spk_files):
        # select a random offset
        length = torchaudio.info(spk_file).num_frames
        start = 0
        stop = length
        if length > minlen:  # take a random window
            start = np.random.randint(0, length - minlen)
            stop = start + minlen

        tmp, fs_read = torchaudio.load(
            spk_file, frame_offset=start, num_frames=stop - start,
        )

        # peak = float(Path(spk_file).stem.split("_peak_")[-1])
        tmp = tmp[0]  # * peak  # remove channel dim and normalize

        if i == 0:
            gain = np.clip(random.normalvariate(-27.43, 2.57), -45, 0)
            tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
            # assert not torch.all(torch.isnan(tmp))
            first_lvl = gain
        else:
            gain = np.clip(
                first_lvl + random.normalvariate(-2.51, 2.66), -45, 0
            )
            tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
            # assert not torch.all(torch.isnan(tmp))
        sources.append(tmp)

    # we mix the sources together.
    # here we could also apply augmentations -> they run on cpu, and the
    # mixture parameters then differ per example rather than per batch.
    # no difference however for bsz=1 :)

    # padding left
    # sources, _ = batch_pad_right(sources)

    sources = torch.stack(sources)
    mixture = torch.sum(sources, 0)
    if "wham" in Path(hparams["data_folder"]).stem:
        len_noise = len(noise)
        len_mix = len(mixture)
        min_len = min(len_noise, len_mix)
        mixture = mixture[:min_len] + noise[:min_len]

    max_amp = max(
        torch.abs(mixture).max().item(),
        *[x.item() for x in torch.abs(sources).max(dim=-1)[0]],
    )
    mix_scaling = 1 / max_amp * 0.9
    sources = mix_scaling * sources
    mixture = mix_scaling * mixture

    yield mixture
    for i in range(hparams["num_spks"]):
        yield sources[i]

    # If the number of speakers is 2, yield None for the 3rd speaker
    if hparams["num_spks"] == 2:
        yield None

    if "wham" in Path(hparams["data_folder"]).stem:
        mean_source_lvl = sources.abs().mean()
        mean_noise_lvl = noise.abs().mean()
        noise = (mean_source_lvl / mean_noise_lvl) * noise
        yield noise
    else:
        yield None
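# How a generator pipeline like the one above is typically attached to a
# SpeechBrain dataset. This is a sketch: takes/provides, add_dynamic_item and
# set_output_keys are real SpeechBrain APIs, but the output key names and
# `train_data` are assumptions for illustration (the five keys match the five
# yields above when num_spks == 2).
import speechbrain as sb

pipeline = sb.utils.data_pipeline.takes("mix_wav")(
    sb.utils.data_pipeline.provides(
        "mix_sig", "s1_sig", "s2_sig", "s3_sig", "noise_sig"
    )(audio_pipeline)
)
sb.dataio.dataset.add_dynamic_item([train_data], pipeline)
sb.dataio.dataset.set_output_keys(
    [train_data],
    ["id", "mix_sig", "s1_sig", "s2_sig", "s3_sig", "noise_sig"],
)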
def create_mixture(session_n, output_dir, params, metadata):
    os.makedirs(os.path.join(output_dir, session_n), exist_ok=True)
    session_meta = {}
    speakers = [
        x for x in metadata.keys() if x not in ["noises", "background"]
    ]

    tot_length = int(
        np.ceil(metadata["background"]["stop"] * params["samplerate"])
    )
    mixture = torch.zeros(tot_length)  # total mixture file
    assert len(mixture) > 0, "Mixture has length 0, please raise max_length."

    # step 1: add the reverberated utterances of each speaker
    for spk in speakers:
        session_meta[spk] = []
        # we create the mixture for each speaker and optionally save it.
        if params["save_dry_sources"]:
            dry = torch.zeros(tot_length)
        if params["save_wet_sources"]:
            wet = torch.zeros(tot_length)

        for utt in metadata[spk]:
            c_audio, fs = torchaudio.load(
                os.path.join(params["librispeech_root"], utt["file"])
            )
            assert fs == params["samplerate"]
            if len(c_audio.shape) > 1:  # multichannel
                c_audio = c_audio[utt["channel"], :]
            c_audio = c_audio - torch.mean(c_audio)
            c_audio = rescale(
                c_audio,
                c_audio.size(0),
                utt["lvl"],
                scale="dB",
                amp_type="peak",
            )

            # we save it in dry
            dry_start = int(utt["start"] * params["samplerate"])
            dry_stop = dry_start + c_audio.shape[-1]
            if params["save_dry_sources"]:
                dry[dry_start:dry_stop] += c_audio

            # we now add reverb and put it in wet
            c_rir, fs = torchaudio.load(
                os.path.join(params["rirs_noises_root"], utt["rir"])
            )
            assert fs == params["samplerate"]
            c_rir = c_rir[utt["rir_channel"], :]

            c_audio = reverberate(c_audio, c_rir, "peak")
            # tof is not accounted for because reverberate shifts by it
            wet_start = dry_start
            wet_stop = dry_stop  # + early_rev_samples
            if params["save_wet_sources"]:
                wet[wet_start : wet_start + len(c_audio)] += c_audio

            session_meta[spk].append(
                {
                    "start": np.round(wet_start / params["samplerate"], 3),
                    "stop": np.round(wet_stop / params["samplerate"], 3),
                    "lvl": utt["lvl"],
                    "words": utt["words"],
                    "file": utt["file"],
                    "channel": utt["channel"],
                    "rir": utt["rir"],
                    "rir_channels": utt["rir_channel"],
                }
            )
            # we add to the mixture
            mixture[wet_start : wet_start + len(c_audio)] += c_audio
            # we allow for clipping as it occurs also in real recordings.
        # save per-speaker clean sources
        if params["save_dry_sources"]:
            torchaudio.save(
                os.path.join(
                    output_dir,
                    session_n,
                    "session_{}_spk_{}_dry.wav".format(session_n, spk),
                ),
                # torchaudio.save expects a (channels, time) tensor
                torch.clamp(dry, min=-1, max=1).unsqueeze(0),
                params["samplerate"],
            )

        if params["save_wet_sources"]:
            torchaudio.save(
                os.path.join(
                    output_dir,
                    session_n,
                    "session_{}_spk_{}_wet.wav".format(session_n, spk),
                ),
                torch.clamp(wet, min=-1, max=1).unsqueeze(0),
                params["samplerate"],
            )

    with open(
        os.path.join(output_dir, session_n, "{}.json".format(session_n)), "w"
    ) as f:
        json.dump(session_meta, f, indent=4)

    # add impulsive noises
    for noise_event in metadata["noises"]:
        c_audio, fs = torchaudio.load(
            os.path.join(params["rirs_noises_root"], noise_event["file"])
        )
        assert fs == params["samplerate"]
        if len(c_audio.shape) > 1:  # multichannel
            c_audio = c_audio[noise_event["channel"], :]
        c_audio = c_audio - torch.mean(c_audio)
        c_audio = rescale(
            c_audio,
            c_audio.size(0),
            noise_event["lvl"],
            scale="dB",
            amp_type="peak",
        )

        # we save it in dry
        dry_start = int(noise_event["start"] * params["samplerate"])
        # dry_stop = dry_start + c_audio.shape[-1]

        # we now add reverb and put it in wet
        c_rir, fs = torchaudio.load(
            os.path.join(params["rirs_noises_root"], noise_event["rir"])
        )
        assert fs == params["samplerate"]
        c_rir = c_rir[noise_event["rir_channel"], :]

        c_audio = reverberate(c_audio, c_rir, "peak")
        # tof is not accounted for because reverberate shifts by it
        wet_start = dry_start
        mixture[wet_start : wet_start + len(c_audio)] += c_audio

    # add background
    if metadata["background"]["file"]:
        c_audio, fs = torchaudio.load(
            os.path.join(
                params["backgrounds_root"], metadata["background"]["file"]
            ),
            frame_offset=metadata["background"]["orig_start"],
            num_frames=mixture.shape[-1],
        )
        assert fs == params["samplerate"]
        if len(c_audio.shape) > 1:  # multichannel
            c_audio = c_audio[metadata["background"]["channel"], :]
        c_audio = c_audio - torch.mean(c_audio)
        c_audio = rescale(
            c_audio,
            c_audio.size(0),
            metadata["background"]["lvl"],
            scale="dB",
            amp_type="avg",
        )
        mixture += c_audio
    else:
        # add gaussian noise
        mixture += rescale(
            torch.normal(0, 1, mixture.shape),
            mixture.size(0),
            metadata["background"]["lvl"],
            scale="dB",
            amp_type="peak",
        )

    # save the total mixture
    mixture = torch.clamp(mixture, min=-1, max=1)
    torchaudio.save(
        os.path.join(output_dir, session_n, "{}_mixture.wav".format(session_n)),
        mixture.unsqueeze(0),
        params["samplerate"],
    )
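# A hypothetical driver for create_mixture. This is a sketch: the params keys
# mirror the ones read inside the function, but every path and the metadata
# file name below are made-up stand-ins for illustration.
import json

params = {
    "samplerate": 16000,
    "librispeech_root": "/data/LibriSpeech",
    "rirs_noises_root": "/data/RIRS_NOISES",
    "backgrounds_root": "/data/backgrounds",
    "save_dry_sources": False,
    "save_wet_sources": False,
}
with open("metadata/session_0.json") as f:
    metadata = json.load(f)
create_mixture("session_0", "output", params, metadata)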
def audio_pipeline(
    mix_wav,
):  # mix_wav is a dummy --> one epoch stays the same length as without dynamic mixing
    speakers = np.random.choice(
        spk_list, hparams["num_spks"], replace=False, p=spk_weights
    )

    # select two speakers randomly
    sources = []
    first_lvl = None

    spk_files = [
        np.random.choice(spk_hashtable[spk], 1, False)[0] for spk in speakers
    ]

    minlen = min(
        *[torchaudio.info(x).num_frames for x in spk_files],
        hparams["training_signal_len"],
    )

    for i, spk_file in enumerate(spk_files):
        # select a random offset
        length = torchaudio.info(spk_file).num_frames
        start = 0
        stop = length
        if length > minlen:  # take a random window
            start = np.random.randint(0, length - minlen)
            stop = start + minlen

        tmp, fs_read = torchaudio.load(
            spk_file, frame_offset=start, num_frames=stop - start,
        )

        # peak = float(Path(spk_file).stem.split("_peak_")[-1])
        tmp = tmp[0]  # * peak  # remove channel dim and normalize

        if i == 0:
            gain = np.clip(random.normalvariate(-27.43, 2.57), -45, 0)
            tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
            # assert not torch.all(torch.isnan(tmp))
            first_lvl = gain
        else:
            gain = np.clip(
                first_lvl + random.normalvariate(-2.51, 2.66), -45, 0
            )
            tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
            # assert not torch.all(torch.isnan(tmp))
        sources.append(tmp)

    # we mix the sources together.
    # here we could also apply augmentations -> they run on cpu, and the
    # mixture parameters then differ per example rather than per batch.
    # no difference however for bsz=1 :)

    # padding left
    # sources, _ = batch_pad_right(sources)

    sources = torch.stack(sources)
    mixture = torch.sum(sources, 0)

    max_amp = max(
        torch.abs(mixture).max().item(),
        *[x.item() for x in torch.abs(sources).max(dim=-1)[0]],
    )
    mix_scaling = 1 / max_amp * 0.9
    sources = sources * mix_scaling
    mixture = mix_scaling * mixture

    yield mixture
    for i in range(hparams["num_spks"]):
        yield sources[i]
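# The relative-level sampling used above, in isolation. The Gaussian
# parameters are the ones hard-coded in the pipeline; the variable names here
# are only for readability.
import random
import numpy as np

first_gain = np.clip(random.normalvariate(-27.43, 2.57), -45, 0)  # dB, speaker 1
offset = random.normalvariate(-2.51, 2.66)  # dB offset of speaker 2 vs speaker 1
second_gain = np.clip(first_gain + offset, -45, 0)  # dB, speaker 2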
def audio_pipeline(
    mix_wav,
):  # mix_wav is a dummy --> one epoch stays the same length as without dynamic mixing
    """This audio pipeline defines the compute graph for dynamic mixing."""

    speakers = np.random.choice(
        spk_list, num_spks, replace=False, p=spk_weights
    )

    if "wham" in Path(data_root_folder).stem:
        noise_file = np.random.choice(noise_files, 1, replace=False)

        noise, fs_read = torchaudio.load(noise_file[0])
        noise = noise.squeeze()

    # select two speakers randomly
    sources = []
    first_lvl = None

    spk_files = [
        np.random.choice(spk_hashtable[spk], 1, False)[0] for spk in speakers
    ]

    minlen = min(
        *[torchaudio.info(x).num_frames for x in spk_files],
        max_training_signal_len,
    )

    for i, spk_file in enumerate(spk_files):
        # select a random offset
        length = torchaudio.info(spk_file).num_frames
        start = 0
        stop = length
        if length > minlen:  # take a random window
            start = np.random.randint(0, length - minlen)
            stop = start + minlen

        tmp, fs_read = torchaudio.load(
            spk_file, frame_offset=start, num_frames=stop - start,
        )
        tmp = tmp[0]  # * peak  # remove channel dim and normalize

        if i == 0:
            gain = np.clip(random.normalvariate(-27.43, 2.57), -45, 0)
            tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
            first_lvl = gain
        else:
            gain = np.clip(
                first_lvl + random.normalvariate(-2.51, 2.66), -45, 0
            )
            tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
        sources.append(tmp)

    # we mix the sources together
    sources = torch.stack(sources)
    mixture = torch.sum(sources, 0)
    if "wham" in Path(data_root_folder).stem:
        len_noise = len(noise)
        len_mix = len(mixture)
        min_len = min(len_noise, len_mix)
        mixture = mixture[:min_len] + noise[:min_len]

    max_amp = max(
        torch.abs(mixture).max().item(),
        *[x.item() for x in torch.abs(sources).max(dim=-1)[0]],
    )
    mix_scaling = 1 / max_amp * 0.9
    sources = mix_scaling * sources
    mixture = mix_scaling * mixture

    yield mixture
    for i in range(num_spks):
        yield sources[i]

    # If the number of speakers is 2, yield None for the 3rd speaker
    if num_spks == 2:
        yield None

    if "wham" in Path(data_root_folder).stem:
        mean_source_lvl = sources.abs().mean()
        mean_noise_lvl = noise.abs().mean()
        noise = (mean_source_lvl / mean_noise_lvl) * noise
        yield noise
    else:
        yield None
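# A quick standalone check of the anti-clipping normalization used in the
# pipelines above (`sources` and `mixture` here are random stand-ins for the
# tensors built inside the pipeline; 0.9 is the headroom factor from the code):
import torch

sources = torch.randn(2, 16000)
mixture = sources.sum(0)
max_amp = max(
    mixture.abs().max().item(),
    *[x.item() for x in sources.abs().max(dim=-1)[0]],
)
mix_scaling = 1 / max_amp * 0.9
# after scaling, neither the mixture nor any source exceeds 0.9 in magnitude
assert (mix_scaling * mixture).abs().max().item() <= 0.9 + 1e-6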