def main():
    """Run the drop-frequency augmentation sample experiment.

    Loads hyperparameters (with overrides), creates the experiment
    directory, runs the sample dataloader through the ``drop_freq``
    augmentation, and writes each augmented utterance to
    ``<output_folder>/save/<id>.flac``.

    NOTE(review): relies on module-level ``output_folder``,
    ``experiment_dir`` and ``hyperparams_file`` — confirm they are defined
    earlier in the surrounding file.
    """
    overrides = {
        "output_folder": output_folder,
        "data_folder": os.path.join(experiment_dir, "..", "..", "..", "samples"),
    }
    with open(hyperparams_file) as fin:
        hyperparams = load_hyperpyyaml(fin, overrides)

    sb.create_experiment_directory(
        experiment_directory=output_folder,
        hyperparams_to_save=hyperparams_file,
        overrides=overrides,
    )

    dataloader = sb.dataio.dataloader.make_dataloader(
        dataset=hyperparams["sample_data"], batch_size=hyperparams["batch_size"]
    )

    # `ids` (not `id`) avoids shadowing the builtin.
    for ids, (wav, wav_len) in iter(dataloader):
        wav_drop = hyperparams["drop_freq"](wav)

        # Save results on file, one flac per utterance id.
        for i, snt_id in enumerate(ids):
            # os.path.join instead of manual "/" concatenation, consistent
            # with how data_folder is built above.
            filepath = os.path.join(
                hyperparams["output_folder"], "save", snt_id + ".flac"
            )
            write_audio(filepath, wav_drop[i], 16000)
def test_read_audio(tmpdir):
    from speechbrain.dataio.dataio import read_audio, write_audio

    # Persist a random one-second waveform to disk.
    reference = torch.rand(16000)
    wavfile = os.path.join(tmpdir, "wave.wav")
    write_audio(wavfile, reference, 16000)

    # Read back a few randomly-placed segments via dummy annotations and
    # compare each against the corresponding slice of the source signal.
    for _ in range(3):
        start = torch.randint(0, 8000, (1,)).item()
        stop = start + torch.randint(500, 1000, (1,)).item()
        annotation = {"wav": {"file": wavfile, "start": start, "stop": stop}}
        segment = read_audio(annotation["wav"])
        assert segment.allclose(reference[start:stop], atol=1e-4)
def test_add_noise(tmpdir, device):
    from speechbrain.processing.speech_augmentation import AddNoise

    # --- Concatenation of batches with different lengths -----------------
    sine_batch = torch.sin(torch.arange(8000.0, device=device)).unsqueeze(0)
    sine_len = torch.ones(1, device=device)
    cosine_batch = (
        torch.cos(torch.arange(10000.0, device=device)).unsqueeze(0).repeat(2, 1)
    )
    cosine_len = torch.ones(2, device=device)

    concat, lens = AddNoise._concat_batch(
        sine_batch, sine_len, cosine_batch, cosine_len
    )
    assert concat.shape == (3, 10000)
    assert lens.allclose(torch.Tensor([0.8, 1, 1]).to(device))

    # Same check with the operands swapped.
    concat, lens = AddNoise._concat_batch(
        cosine_batch, cosine_len, sine_batch, sine_len
    )
    assert concat.shape == (3, 10000)
    assert lens.allclose(torch.Tensor([1, 1, 0.8]).to(device))

    # --- Noise mixing ------------------------------------------------------
    clean = torch.sin(torch.arange(16000.0, device=device)).unsqueeze(0)
    noise = torch.cos(torch.arange(16000.0, device=device)).unsqueeze(0)
    lengths = torch.ones(1, device=device)

    # Write the noise waveform to a temporary file plus a csv manifest.
    noisefile = os.path.join(tmpdir, "noise.wav")
    write_audio(noisefile, noise.transpose(0, 1).cpu(), 16000)

    csv = os.path.join(tmpdir, "noise.csv")
    with open(csv, "w") as w:
        w.write("ID, duration, wav, wav_format, wav_opts\n")
        w.write(f"1, 1.0, {noisefile}, wav,\n")

    # Edge cases: mixing disabled, or SNR so extreme that one side dominates.
    no_noise = AddNoise(mix_prob=0.0).to(device)
    assert no_noise(clean, lengths).allclose(clean)
    no_noise = AddNoise(snr_low=1000, snr_high=1000)
    assert no_noise(clean, lengths).allclose(clean)
    all_noise = AddNoise(csv_file=csv, snr_low=-1000, snr_high=-1000)
    assert all_noise(clean, lengths).allclose(noise, atol=1e-4)

    # Basic 0 dB case: equal-power mix of clean signal and noise.
    add_noise = AddNoise(csv_file=csv).to(device)
    expected = (clean + noise) / 2
    assert add_noise(clean, lengths).allclose(expected, atol=1e-4)
def test_read_audio_multichannel(tmpdir):
    from speechbrain.dataio.dataio import read_audio_multichannel, write_audio

    # Save a stereo waveform to disk.
    reference = torch.rand(16000, 2)
    wavfile = os.path.join(tmpdir, "wave.wav")
    # sf.write(wavfile, test_waveform, 16000, subtype="float")
    write_audio(wavfile, reference, 16000)

    # Dummy annotation: read random segments of one multichannel file back.
    for _ in range(2):
        start = torch.randint(0, 8000, (1,)).item()
        stop = start + torch.randint(500, 1000, (1,)).item()
        annotation = {"wav": {"files": [wavfile], "start": start, "stop": stop}}
        loaded = read_audio_multichannel(annotation["wav"])
        assert loaded.allclose(reference[start:stop, :], atol=1e-4)
        # set to equal when switching to the sox_io backend
        # assert torch.all(torch.eq(loaded, test_waveform[:,start:stop]))

    # Now exercise loading multiple files at once: the segments of both
    # files are concatenated along the channel dimension.
    reference_2 = torch.rand(16000, 2)
    wavfile_2 = os.path.join(tmpdir, "wave_2.wav")
    write_audio(wavfile_2, reference_2, 16000)
    # sf.write(wavfile_2, test_waveform_2, 16000, subtype="float")

    for _ in range(2):
        start = torch.randint(0, 8000, (1,)).item()
        stop = start + torch.randint(500, 1000, (1,)).item()
        annotation = {
            "wav": {"files": [wavfile, wavfile_2], "start": start, "stop": stop}
        }
        loaded = read_audio_multichannel(annotation["wav"])
        expected = torch.cat(
            (reference[start:stop, :], reference_2[start:stop, :]), 1
        )
        assert loaded.allclose(expected, atol=1e-4)
def test_add_reverb(tmpdir, device):
    from speechbrain.processing.speech_augmentation import AddReverb

    signal = torch.sin(torch.arange(16000.0, device=device)).unsqueeze(0)
    ir = torch.zeros(1, 8000, device=device)
    lengths = torch.ones(1, device=device)

    ir1 = os.path.join(tmpdir, "ir1.wav")
    ir2 = os.path.join(tmpdir, "ir2.wav")
    ir3 = os.path.join(tmpdir, "ir3.wav")

    # ir1: unit impulse at t=0 -> convolution is the identity.
    ir[0, 0] = 1.0
    write_audio(ir1, ir.cpu().transpose(0, 1), 16000)

    # ir2: delayed, scaled impulse.
    ir[0, 0] = 0.0
    ir[0, 10] = 0.5
    write_audio(ir2, ir.cpu().transpose(0, 1), 16000)

    # ir3: a very simple non-impulse-response case with two taps, whose
    # output we can compute directly.
    ir[0, 10] = 0.6
    ir[0, 11] = 0.4
    # sf.write(ir3, impulse_response.squeeze(0).numpy(), 16000)
    write_audio(ir3, ir.cpu().transpose(0, 1), 16000)
    ir3_result = signal * 0.6 + signal.roll(1, -1) * 0.4

    # Manifest listing the three impulse responses.
    csv = os.path.join(tmpdir, "ir.csv")
    with open(csv, "w") as w:
        w.write("ID, duration, wav, wav_format, wav_opts\n")
        w.write(f"1, 0.5, {ir1}, wav,\n")
        w.write(f"2, 0.5, {ir2}, wav,\n")
        w.write(f"3, 0.5, {ir3}, wav,\n")

    # Edge case: reverb disabled leaves the signal untouched.
    no_reverb = AddReverb(csv, reverb_prob=0.0).to(device)
    assert no_reverb(signal, lengths).allclose(signal)

    # Normal cases: IRs are consumed in csv order ("original" sorting).
    add_reverb = AddReverb(csv, sorting="original")
    reverbed = add_reverb(signal, lengths)[:, 0:1000]
    assert reverbed.allclose(signal[:, 0:1000], atol=1e-1)
    reverbed = add_reverb(signal, lengths)[:, 0:1000]
    assert reverbed.allclose(signal[:, 0:1000], atol=1e-1)
    reverbed = add_reverb(signal, lengths)[:, 0:1000]
    assert reverbed.allclose(ir3_result[:, 0:1000], atol=2e-1)
def save_mixture(
    s1,
    s2,
    min_max,
    weight_1,
    weight_2,
    num_files,
    lev1,
    lev2,
    save_fs,
    output_dir,
    data_type,
    mix_name,
    i,
):
    """
    This function creates the mixtures, and saves them

    Arguments:
        s1, s2 (numpy array): source1 and source2 wav files in numpy array.
        min_max (str): in ['min', 'max']; 'max' zero-pads the shorter source
            to the longer one, anything else truncates both to the shorter.
        weight_1, weight_2 (float): weights for source1 and source2 respectively.
        num_files (int): number of files
        lev1, lev2 (float): levels for each source obtained with octave.activlev() function
        save_fs (str): in ['wav8k', 'wav16k']
        output_dir (str): the save directory
        data_type (str): in ['tr', 'cv', 'tt']
        mix_name (str): name given to the mixture. (see the main function get_wsj_files())
        i (int): number of the mixture. (see the main function get_wsj_files())

    Returns:
        scaling (numpy array, shape (num_files, 2)): per-source scaling factors.
        scaling16bit (numpy array, shape (num_files, 1)): mixture scaling factor.

    NOTE(review): both returned arrays are freshly allocated on every call and
    only row ``i`` is filled — presumably the caller accumulates the rows;
    confirm against get_wsj_files().
    """
    scaling = np.zeros((num_files, 2))
    scaling16bit = np.zeros((num_files, 1))

    if min_max == "max":
        # Zero-pad both sources to the length of the longer one.
        mix_len = max(s1.shape[0], s2.shape[0])
        s1 = np.pad(
            s1,
            (0, mix_len - s1.shape[0]),
            "constant",
            constant_values=(0, 0),
        )
        s2 = np.pad(
            s2,
            (0, mix_len - s2.shape[0]),
            "constant",
            constant_values=(0, 0),
        )
    else:
        # Truncate both sources to the length of the shorter one.
        mix_len = min(s1.shape[0], s2.shape[0])
        s1 = s1[:mix_len]
        s2 = s2[:mix_len]

    mix = s1 + s2

    # Rescale so the loudest of {mix, s1, s2} peaks at 0.9 full scale,
    # avoiding clipping when the files are written out.
    max_amp = max(np.abs(mix).max(), np.abs(s1).max(), np.abs(s2).max())
    mix_scaling = 1 / max_amp * 0.9
    s1 = mix_scaling * s1
    s2 = mix_scaling * s2
    mix = mix_scaling * mix

    scaling[i, 0] = weight_1 * mix_scaling / np.sqrt(lev1)
    scaling[i, 1] = weight_2 * mix_scaling / np.sqrt(lev2)
    scaling16bit[i] = mix_scaling

    sampling_rate = 8000 if save_fs == "wav8k" else 16000

    # Hoist the common directory prefix shared by the three output paths.
    base = output_dir + "/" + save_fs + "/" + min_max + "/" + data_type

    write_audio(
        s1, base + "/s1/" + mix_name + ".wav", sampling_rate=sampling_rate,
    )
    write_audio(
        s2, base + "/s2/" + mix_name + ".wav", sampling_rate=sampling_rate,
    )
    write_audio(
        mix, base + "/mix/" + mix_name + ".wav", sampling_rate=sampling_rate,
    )

    return scaling, scaling16bit
# NOTE(review): `i` is not defined at this point in the visible chunk — this
# makedirs call presumably belongs to an enclosing loop that lies outside the
# visible region (and `i` would need to be a string here); confirm upstream.
os.makedirs("TIMIT_combined/test/" + i)

# Combine every group of 4 consecutive single-channel TIMIT recordings
# (one per microphone) into a single 4-channel wav file — train split.
for i in range(len(train_df)):
    if i % 4 == 0:
        # Derive the output name from the first mic's path: strip the split
        # prefix and the extension, and keep only the part before the first
        # underscore (presumably the per-mic suffix — confirm the naming).
        fname = (
            train_df["location"][i][len("TIMIT_4_channels/train/") :]
            .split(".")[0]
            .split("_")[0]
            + ".wav"
        )
        # Rows i..i+3 are assumed to be the 4 mics of the same utterance.
        mic1 = read_audio(train_df["location"][i])
        mic2 = read_audio(train_df["location"][i + 1])
        mic3 = read_audio(train_df["location"][i + 2])
        mic4 = read_audio(train_df["location"][i + 3])
        # Stack to (channels, time), then transpose to (time, channels).
        sa = torch.stack((mic1, mic2, mic3, mic4)).transpose(0, 1)
        write_audio("TIMIT_combined/train/" + fname, sa, samplerate=fs)

# Same combination for the test split.
for i in range(len(test_df)):
    if i % 4 == 0:
        fname = (
            test_df["location"][i][len("TIMIT_4_channels/test/") :]
            .split(".")[0]
            .split("_")[0]
            + ".wav"
        )
        mic1 = read_audio(test_df["location"][i])
        mic2 = read_audio(test_df["location"][i + 1])
        mic3 = read_audio(test_df["location"][i + 2])
        mic4 = read_audio(test_df["location"][i + 3])
        sa = torch.stack((mic1, mic2, mic3, mic4)).transpose(0, 1)
        write_audio("TIMIT_combined/test/" + fname, sa, samplerate=fs)
# Combine every group of 4 consecutive single-channel TIMIT recordings
# (one per microphone) into a single 4-channel wav, for both splits.
for split, frame in (("train", train_df), ("test", test_df)):
    prefix = f"TIMIT_4_channels/{split}/"
    for idx in range(0, len(frame), 4):
        # Output name: first mic's path minus the split prefix, extension,
        # and everything after the first underscore.
        fname = (
            frame["location"][idx][len(prefix) :].split(".")[0].split("_")[0]
            + ".wav"
        )
        # Rows idx..idx+3 hold the 4 microphones of the same utterance.
        channels = [read_audio(frame["location"][idx + k]) for k in range(4)]
        # Stack to (channels, time), then transpose to (time, channels).
        combined = torch.stack(tuple(channels)).transpose(0, 1)
        write_audio(f"TIMIT_combined/{split}/" + fname, combined, samplerate=fs)
def _fit_nmf(hparams, tag):
    """Fit one NMF model on the training data and return the trained Brain.

    Arguments:
        hparams (dict): loaded hyperparameters; must provide "train_data",
            "loader_kwargs" and "N_epochs".
        tag (str): label printed in the progress message (e.g. "model 1").
    """
    brain = NMF_Brain(hparams=hparams)
    loader = sb.dataio.dataloader.make_dataloader(
        hparams["train_data"], **hparams["loader_kwargs"]
    )
    brain.init_matrices(loader)
    print("fitting " + tag)
    brain.fit(
        train_set=loader,
        valid_set=None,
        epoch_counter=range(hparams["N_epochs"]),
        progressbar=False,
    )
    return brain


def main():
    """Train two NMF source models, separate a test mixture, save results.

    Loads hyperparameters, fits two independent NMF models on the training
    data (the original inline code was duplicated verbatim — now factored
    into ``_fit_nmf``), separates a test mixture with the two learned
    dictionaries, and optionally writes the reconstructed sources and copies
    the original files into ``results/save/``.
    """
    experiment_dir = os.path.dirname(os.path.realpath(__file__))
    hparams_file = os.path.join(experiment_dir, "hyperparams.yaml")
    data_folder = "../../../../samples/audio_samples/sourcesep_samples"
    data_folder = os.path.realpath(os.path.join(experiment_dir, data_folder))

    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, {"data_folder": data_folder})

    sb.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
    )

    # Fixed seed so both models (and reruns) are reproducible.
    torch.manual_seed(0)

    nmf1 = _fit_nmf(hparams, "model 1")
    nmf2 = _fit_nmf(hparams, "model 2")
    # training_out[1] holds the learned template (W) matrix after fitting.
    W1hat = nmf1.training_out[1]
    W2hat = nmf2.training_out[1]

    # Separate the test mixture with the two learned dictionaries.
    mixture_loader = sb.dataio.dataloader.make_dataloader(
        hparams["test_data"], **hparams["loader_kwargs"]
    )
    mix_batch = next(iter(mixture_loader))

    Xmix = nmf1.hparams.compute_features(mix_batch.wav.data)
    Xmix_mag = spectral_magnitude(Xmix, power=2)

    X1hat, X2hat = sb_nmf.NMF_separate_spectra([W1hat, W2hat], Xmix_mag)

    x1hats, x2hats = sb_nmf.reconstruct_results(
        X1hat,
        X2hat,
        Xmix.permute(0, 2, 1, 3),
        hparams["sample_rate"],
        hparams["win_length"],
        hparams["hop_length"],
    )

    # Defined once for both branches below: previously `savepath` was only
    # bound inside the save_reconstructed branch, causing a NameError when
    # copy_original_files was set without save_reconstructed.
    savepath = "results/save/"

    if hparams["save_reconstructed"]:
        # makedirs(exist_ok=True) replaces the race-prone exists/mkdir pair
        # and creates "results" and "results/save/" in one call.
        os.makedirs(savepath, exist_ok=True)

        for i, (x1hat, x2hat) in enumerate(zip(x1hats, x2hats)):
            write_audio(
                os.path.join(savepath, "separated_source1_{}.wav".format(i)),
                x1hat.squeeze(0),
                16000,
            )
            write_audio(
                os.path.join(savepath, "separated_source2_{}.wav".format(i)),
                x2hat.squeeze(0),
                16000,
            )

    if hparams["copy_original_files"]:
        # Ensure the destination exists even if nothing was reconstructed.
        os.makedirs(savepath, exist_ok=True)

        datapath = "samples/audio_samples/sourcesep_samples"
        filedir = os.path.dirname(os.path.realpath(__file__))
        speechbrain_path = os.path.abspath(os.path.join(filedir, "../../../.."))
        copypath = os.path.realpath(os.path.join(speechbrain_path, datapath))

        for wav_file in os.listdir(copypath):
            if ".wav" in wav_file:
                shutil.copy(copypath + "/" + wav_file, savepath)