def create_stft(M: int, options: dict):
    L = options['stft_size']
    hop = L // 2
    # window = pra.hann(L, flag='asymmetric', length='full')
    window = pra.hamming(L, flag='asymmetric', length='full')  # looks like hamming window is better
    return pra.transform.STFT(L, hop=hop, analysis_window=window, channels=M)
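# Hedged usage sketch (not part of the original code): the one-shot functional
# STFT interface used elsewhere in this repo, configured with the same
# half-overlap asymmetric Hamming window as `create_stft`. The function name
# and the (n_samples, M) input shape are illustrative assumptions only.
def _stft_oneshot_sketch(mix, stft_size=2048):
    hop = stft_size // 2
    win_a = pra.hamming(stft_size, flag="asymmetric", length="full")
    win_s = pra.transform.compute_synthesis_window(win_a, hop)
    X = pra.transform.analysis(mix, stft_size, hop, win=win_a)  # (n_frames, n_freq, M)
    return pra.transform.synthesis(X, stft_size, hop, win=win_s)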
def process_audio(save_dir, wav_file):
    print(wav_file)

    fsResample = 48000  # resampling frequency [Hz]
    fftSize = 4096      # window length in STFT [points]
    shiftSize = 2048    # shift length in STFT [points]
    ns = 7              # number of sources
    it = 70             # number of iterations
    nb = 10             # number of bases

    # analysis window
    win_a = pra.hamming(fftSize)

    # optimal synthesis window
    win_s = pra.transform.compute_synthesis_window(win_a, shiftSize)

    mix = sf.read(str(wav_file))[0]
    # pdb.set_trace()

    # STFT
    X = pra.transform.analysis(mix, fftSize, shiftSize, win=win_a)

    # Apply FastMNMF
    Y = pra.bss.fastmnmf(X, n_src=ns, n_iter=it, n_components=nb)

    # ISTFT
    y = pra.transform.synthesis(Y, fftSize, shiftSize, win=win_s)

    outputDir = './output'
    os.makedirs(outputDir, exist_ok=True)

    # observed signal
    sf.write('{}/observedMixture.wav'.format(outputDir), mix, fsResample)

    wav_idx = wav_file.stem.split('.')[-1]

    for i in range(ns):
        # save the i-th estimated signal
        sf.write(f'{outputDir}/estimatedSignal{i}.wav', y[:, i], fsResample)
        if i == 1:
            # run speech recognition on the second estimated source
            AUDIO_FILE = 'output/estimatedSignal1.wav'
            r = sr.Recognizer()
            with sr.AudioFile(AUDIO_FILE) as source:
                audio = r.record(source)
            text = r.recognize_google(audio, language="ja-JP")

    wav_filename = f'{save_dir}/{wav_idx}.wav'
    os.system(f'mv {AUDIO_FILE} {wav_filename}')

    return wav_filename, text
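if __name__ == "__main__":
    # Hedged usage sketch (not part of the original script): run the separation
    # and recognition pipeline over a folder of mixtures. The "./input" and
    # "./transcripts" directory names are illustrative assumptions only.
    from pathlib import Path

    save_dir = "./transcripts"
    os.makedirs(save_dir, exist_ok=True)
    for wav_file in sorted(Path("./input").glob("*.wav")):
        out_wav, text = process_audio(save_dir, wav_file)
        print(out_wav, text)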
# normalize all sources at the reference mic
premix /= np.std(premix[:, ref_mic, None, :], axis=2, keepdims=True)

for n_sources in config["n_sources_list"]:

    # Do the mix of the first n_sources sources and add noise
    mix = np.sum(premix[:n_sources, :, :], axis=0)
    noise_std = 10 ** (-config["snr"] / 20) * np.std(mix[ref_mic, :])
    mix += noise_std * np.random.randn(*mix.shape)

    ref = premix[:n_sources, ref_mic, :]

    # STFT
    n_fft = config["stft_params"]["n_fft"]
    hop = config["stft_params"]["hop"]
    if config["stft_params"]["win"] == "hamming":
        win_a = pra.hamming(n_fft)
    else:
        raise ValueError("Undefined window function")
    win_s = pra.transform.compute_synthesis_window(win_a, hop)

    X = pra.transform.analysis(mix.T, n_fft, hop, win=win_a)

    n_iter = config["separation_params"]["n_iter_multiplier"] * n_sources

    # Separation
    for algo, details in config["algorithms"].items():

        t1 = time.perf_counter()

        if details["name"] == "auxiva":
            Y = auxiva(
mix += np.random.randn(*mix.shape) * sigma_n

print("SNR:", 10 * np.log10(sigma_src**2 / sigma_n**2))

# the reference
if args.algo in dereverb_algos:
    # for dereverberation algorithms we use the anechoic reference signal
    fn_ref = DATA_DIR / rooms[args.room]["anechoic_filenames"][REF_MIC]
else:
    fn_ref = DATA_DIR / rooms[args.room]["src_filenames"][REF_MIC]
fs, ref = wavfile.read(fn_ref)
ref = ref.astype(np.float64) / 2**15

# STFT parameters
hop = args.block // 2
win_a = pra.hamming(args.block)
win_s = pra.transform.stft.compute_synthesis_window(win_a, hop)

# STFT
X = stft.analysis(mix, args.block, hop, win=win_a)

t1 = time.perf_counter()

# Separation
if args.algo == "fastmnmf":
    Y = algorithms[args.algo](X, n_iter=30)
elif args.algo in dereverb_algos:
    if args.p is None:
        Y = algorithms[args.algo](
            X,
            n_iter=15 * args.mics,
use_real_R = False

# fix the randomness for repeatability
np.random.seed(args.seed)

# set the source powers, the first one is half
source_std = np.ones(n_sources_target)
source_std[0] /= np.sqrt(2.0)

SINR = args.sinr  # signal-to-interference-and-noise ratio
SINR_diffuse_ratio = 0.9999  # ratio of uncorrelated to diffuse noise

# STFT parameters
framesize = 4096
hop = framesize // 2
win_a = pra.hamming(framesize)
win_s = pra.transform.compute_synthesis_window(win_a, hop)

# algorithm parameters
n_iter = args.n_iter

# param ogive
ogive_mu = 0.1
ogive_update = "switching"
ogive_iter = 4000

# Geometry of the room and location of sources and microphones
room_dim = np.array([10, 7.5, 3])
mic_locs = semi_circle_layout(
    [4.1, 3.76, 1.2], np.pi, 0.20, n_mics, rot=np.pi / 2.0 * 0.99
)
def one_loop(args):
    global parameters

    import time

    import numpy

    np = numpy

    import pyroomacoustics

    pra = pyroomacoustics

    import os
    import sys

    sys.path.append(parameters["base_dir"])

    from auxiva_pca import auxiva_pca, pca_separation
    from five import five
    from ive import ogive
    from overiva import overiva
    from pyroomacoustics.bss.common import projection_back
    from room_builder import callback_noise_mixer, random_room_builder

    # import samples helper routine
    from get_data import samples_dir

    sys.path.append(os.path.join(parameters["base_dir"], samples_dir))
    from generate_samples import wav_read_center

    n_targets, n_interferers, n_mics, sinr, wav_files, room_seed, seed = args

    # this is the underdetermined case. We don't do that.
    if n_mics < n_targets:
        return []

    # set MKL to only use one thread if present
    try:
        import mkl

        mkl.set_num_threads(1)
    except ImportError:
        pass

    # set the RNG seed
    rng_state = np.random.get_state()
    np.random.seed(seed)

    # STFT parameters
    framesize = parameters["stft_params"]["framesize"]
    hop = parameters["stft_params"]["hop"]
    if parameters["stft_params"]["window"] == "hamming":
        win_a = pra.hamming(framesize)
    else:  # default is Hann
        win_a = pra.hann(framesize)
    win_s = pra.transform.compute_synthesis_window(win_a, hop)

    # Simulation parameters
    sources_var = np.ones(n_targets)

    # total number of sources
    n_sources = n_targets + n_interferers

    # Read the signals
    wav_files = [os.path.join(parameters["base_dir"], fn) for fn in wav_files]
    signals = wav_read_center(wav_files[:n_sources], seed=123)

    # Get a random room
    room, rt60 = random_room_builder(
        signals, n_mics, seed=room_seed, **parameters["room_params"]
    )
    premix = room.simulate(return_premix=True)

    # mix the signal
    n_samples = premix.shape[2]
    mix = callback_noise_mixer(
        premix,
        sinr=sinr,
        diffuse_ratio=parameters["sinr_diffuse_ratio"],
        n_src=n_sources,
        n_tgt=n_targets,
        tgt_std=np.sqrt(sources_var),
        ref_mic=parameters["ref_mic"],
    )

    # sum up the background
    # shape (n_mics, n_samples)
    background = np.sum(premix[n_targets:n_sources, :, :], axis=0)

    # shape (n_targets + 1, n_samples, n_mics)
    ref = np.zeros(
        (n_targets + 1, premix.shape[2], premix.shape[1]), dtype=premix.dtype
    )
    ref[:n_targets, :, :] = premix[:n_targets, :, :].swapaxes(1, 2)
    ref[n_targets, :, :] = background.T

    synth = np.zeros_like(ref)

    # START BSS
    ###########
    # shape: (n_frames, n_freq, n_mics)
    X_all = pra.transform.analysis(mix.T, framesize, hop, win=win_a)
    X_mics = X_all[:, :, :n_mics]

    # convergence monitoring callback
    def convergence_callback(
        Y, X, n_targets, SDR, SIR, eval_time, ref, framesize, win_s, algo_name
    ):
        t_in = time.perf_counter()

        # projection back
        z = projection_back(Y, X[:, :, 0])
        Y = Y * np.conj(z[None, :, :])

        from mir_eval.separation import bss_eval_sources

        if Y.shape[2] == 1:
            y = pra.transform.synthesis(Y[:, :, 0], framesize, hop, win=win_s)[:, None]
        else:
            y = pra.transform.synthesis(Y, framesize, hop, win=win_s)

        if algo_name not in parameters["overdet_algos"]:
            new_ord = np.argsort(np.std(y, axis=0))[::-1]
            y = y[:, new_ord]

        m = np.minimum(y.shape[0] - hop, ref.shape[1])
        synth[:n_targets, :m, 0] = y[hop:m + hop, :n_targets].T
        synth[n_targets, :m, 0] = y[hop:m + hop, 0]

        sdr, sir, sar, perm = bss_eval_sources(
            ref[:n_targets + 1, :m, 0], synth[:, :m, 0]
        )
        SDR.append(sdr[:n_targets].tolist())
        SIR.append(sir[:n_targets].tolist())

        t_out = time.perf_counter()
        eval_time.append(t_out - t_in)

    # store results in a list, one entry per algorithm
    results = []

    # compute the initial values of SDR/SIR
    init_sdr = []
    init_sir = []

    convergence_callback(
        X_mics, X_mics, n_targets, init_sdr, init_sir, [], ref, framesize, win_s, "init"
    )

    for full_name, params in parameters["algorithm_kwargs"].items():

        name = params["algo"]
        kwargs = params["kwargs"]

        if name == "auxiva_pca" and n_targets == 1:
            # PCA doesn't work for the single source scenario
            continue
        elif name in ["ogive", "five"] and n_targets != 1:
            # OGIVE and FIVE are only for a single target
            continue

        results.append(
            {
                "algorithm": full_name,
                "n_targets": n_targets,
                "n_interferers": n_interferers,
                "n_mics": n_mics,
                "rt60": rt60,
                "sinr": sinr,
                "seed": seed,
                "sdr": [],
                "sir": [],  # to store the result
                "runtime": np.nan,
                "eval_time": np.nan,
                "n_samples": n_samples,
            }
        )

        # this is used to keep track of time spent in the evaluation callback
        eval_time = []

        def cb(Y):
            convergence_callback(
                Y,
                X_mics,
                n_targets,
                results[-1]["sdr"],
                results[-1]["sir"],
                eval_time,
                ref,
                framesize,
                win_s,
                name,
            )

        # avoid one computation by using the initial values of sdr/sir
        results[-1]["sdr"].append(init_sdr[0])
        results[-1]["sir"].append(init_sir[0])

        try:
            t_start = time.perf_counter()

            if name == "auxiva":
                # Run AuxIVA
                # this calls full IVA when `n_src` is not provided
                Y = overiva(X_mics, callback=cb, **kwargs)
            elif name == "auxiva_pca":
                # Run AuxIVA after PCA dimensionality reduction
                Y = auxiva_pca(
                    X_mics, n_src=n_targets, callback=cb, proj_back=False, **kwargs
                )
            elif name == "overiva":
                # Run OverIVA
                Y = overiva(
                    X_mics, n_src=n_targets, callback=cb, proj_back=False, **kwargs
                )
            elif name == "overiva2":
                # Run the second OverIVA variant
                Y = overiva(
                    X_mics, n_src=n_targets, callback=cb, proj_back=False, **kwargs
                )
            elif name == "five":
                # Run FIVE (AuxIVE)
                Y = five(X_mics, callback=cb, proj_back=False, **kwargs)
            elif name == "ilrma":
                # Run ILRMA
                Y = pra.bss.ilrma(X_mics, callback=cb, proj_back=False, **kwargs)
            elif name == "ogive":
                # Run OGIVE
                Y = ogive(X_mics, callback=cb, proj_back=False, **kwargs)
            elif name == "pca":
                # Run PCA
                Y = pca_separation(X_mics, n_src=n_targets)
            else:
                continue

            t_finish = time.perf_counter()

            # The last evaluation
            convergence_callback(
                Y,
                X_mics,
                n_targets,
                results[-1]["sdr"],
                results[-1]["sir"],
                [],
                ref,
                framesize,
                win_s,
                name,
            )

            results[-1]["eval_time"] = np.sum(eval_time)
            results[-1]["runtime"] = t_finish - t_start - results[-1]["eval_time"]

        except Exception:
            import json
            import os

            pid = os.getpid()
            # report last sdr/sir as np.nan
            results[-1]["sdr"].append(np.nan)
            results[-1]["sir"].append(np.nan)
            # now write the problem to file
            fn_err = os.path.join(
                parameters["_results_dir"], "error_{}.json".format(pid)
            )
            with open(fn_err, "a") as f:
                f.write(json.dumps(results[-1], indent=4))
            # skip to next iteration
            continue

    # restore former RNG state
    np.random.set_state(rng_state)

    return results
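# Hedged usage sketch (not part of the original script): `one_loop` takes a
# single argument tuple, so a list of simulation conditions can be fanned out
# over a process pool. This assumes a "fork" start method (or an equivalent
# initializer) so that the module-level `parameters` dict is visible in the
# workers; the helper name and worker count are illustrative only.
def _run_all_sketch(all_args, n_workers=4):
    import multiprocessing

    results = []
    with multiprocessing.Pool(n_workers) as pool:
        for res in pool.map(one_loop, all_args):
            results.extend(res)  # each call returns a list of per-algorithm dicts
    return results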
def run(args, parameters):
    """
    This is the core loop of the simulation
    """

    # expand arguments
    sinr, n_targets, n_interf, n_mics, dist_ratio, room_params, seed = args

    n_sources = n_targets + n_interf

    # this is the underdetermined case. We don't do that.
    if n_mics < n_targets:
        return []

    # set the RNG seed
    rng_state = np.random.get_state()
    np.random.seed(seed)

    # get all the signals
    files_absolute = [
        os.path.join(parameters["base_dir"], fn)
        for fn in room_params["wav"][:n_sources]
    ]
    source_signals = wav_read_center(files_absolute, seed=123)

    # create the room
    room = pra.ShoeBox(**room_params["room_kwargs"])
    R = np.array(room_params["mic_array"])
    room.add_microphone_array(pra.MicrophoneArray(R[:, :n_mics], room.fs))
    source_locs = np.array(room_params["sources"])
    for n in range(n_sources):
        room.add_source(source_locs[:, n], signal=source_signals[n, :])

    # compute RIRs and RT60
    room.compute_rir()
    rt60 = np.median(
        [
            pra.experimental.measure_rt60(room.rir[0][n], fs=room.fs)
            for n in range(n_targets)
        ]
    )

    # signals after propagation but before mixing
    # (n_sources, n_mics, n_samples)
    premix = room.simulate(return_premix=True)
    n_samples = premix.shape[-1]

    # create the mix (n_mics, n_samples)
    # this routine will also resize the signals in premix
    mix = callback_noise_mixer(
        premix,
        sinr=sinr,
        n_src=n_targets + n_interf,
        n_tgt=n_targets,
        **parameters["mix_params"]
    )

    # create the reference signals
    # (n_targets + 1, n_samples)
    refs = np.zeros((n_targets + 1, n_samples))
    refs[:-1, :] = premix[:n_targets, parameters["mix_params"]["ref_mic"], :]
    refs[-1, :] = np.sum(premix[n_targets:, 0, :], axis=0)

    # STFT parameters
    framesize = parameters["stft_params"]["framesize"]
    hop = parameters["stft_params"]["hop"]
    if parameters["stft_params"]["window"] == "hamming":
        win_a = pra.hamming(framesize)
    else:  # default is Hann
        win_a = pra.hann(framesize)

    # START BSS
    ###########
    # shape: (n_frames, n_freq, n_mics)
    X_all = pra.transform.analysis(mix.T, framesize, hop, win=win_a)
    X_mics = X_all[:, :, :n_mics]

    # store results in a list, one entry per algorithm
    results = []

    # compute the initial values of SDR/SIR
    init_sdr = []
    init_sir = []

    for full_name, params in parameters["algorithm_kwargs"].items():

        name = params["algo"]
        kwargs = params["kwargs"]

        if not bss.is_determined[name] and bss.is_dual_update[name] and n_targets == 1:
            # Overdetermined algorithms with dual updates cannot be used
            # in the single source case (they extract at least two sources)
            continue
        elif bss.is_single_source[name] and n_targets > 1:
            # single-source algorithms don't work in the multi-source scenario
            continue
        elif bss.is_overdetermined[name] and n_targets == n_mics:
            # don't run the overdetermined algorithms in the determined case
            continue

        results.append(
            {
                "algorithm": full_name,
                "n_targets": n_targets,
                "n_interferers": n_interf,
                "n_mics": n_mics,
                "rt60": rt60,
                "dist_ratio": dist_ratio,
                "sinr": sinr,
                "seed": seed,
                "sdr": [],
                "sir": [],  # to store the result
                "cost": [],
                "runtime": np.nan,
                "eval_time": np.nan,
                "n_samples": n_samples,
            }
        )

        # this is used to keep track of time spent in the evaluation callback
        eval_time = []

        def cb(W, Y, source_model):
            convergence_callback(
                W,
                Y,
                source_model,
                X_mics,
                n_targets,
                results[-1]["sdr"],
                results[-1]["sir"],
                results[-1]["cost"],
                eval_time,
                refs,
                parameters["mix_params"]["ref_mic"],
                parameters["stft_params"],
                name,
                not bss.is_determined[name],
            )

        if "model" not in kwargs:
            local_model = bss.default.model
        else:
            local_model = kwargs["model"]

        cb(np.eye(n_mics)[None, :, :], X_mics, local_model)

        try:
            t_start = time.perf_counter()

            bss.separate(
                X_mics,
                n_src=n_targets,
                algorithm=name,
                callback=cb,
                proj_back=False,
                **kwargs
            )

            t_finish = time.perf_counter()

            results[-1]["eval_time"] = np.sum(eval_time)
            results[-1]["runtime"] = t_finish - t_start - results[-1]["eval_time"]

        except Exception:
            # get the traceback
            tb = traceback.format_exc()

            report = {
                "algorithm": name,
                "n_src": n_targets,
                "kwargs": kwargs,
                "result": results[-1],
                "tb": tb,
            }
            pid = os.getpid()
            # report last sdr/sir as np.nan
            results[-1]["sdr"].append(np.nan)
            results[-1]["sir"].append(np.nan)
            # now write the problem to file
            fn_err = os.path.join(
                parameters["_results_dir"], "error_{}.json".format(pid)
            )
            with open(fn_err, "a") as f:
                f.write(json.dumps(report, indent=4))
                f.write(",\n")
            # skip to next iteration
            continue

    # restore former RNG state
    np.random.set_state(rng_state)

    return results
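# Hedged helper sketch (not part of the original script): the error file above
# is built by appending one JSON object plus a trailing comma and newline per
# failure, so the file as a whole is not valid JSON. Wrapping the content in
# brackets and dropping the final comma recovers a list of report dicts. The
# helper name is illustrative only.
def _load_error_reports_sketch(fn_err):
    import json

    with open(fn_err, "r") as f:
        body = f.read().rstrip().rstrip(",")
    return json.loads("[" + body + "]")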
sig = np.stack([sig_src1, sig_src2], axis=1)

# Resample the original sources (polyphase resampling)
sig_src1 = signal.resample_poly(sig[:, :, 0], fsResample, fs)
sig_src2 = signal.resample_poly(sig[:, :, 1], fsResample, fs)
sig_resample = np.stack([sig_src1, sig_src2], axis=1)

# Create the mixture signals
# For each channel, add up the sources
mix1 = sig_resample[:, 0, 0] + sig_resample[:, 0, 1]  # channel 0 (left)
mix2 = sig_resample[:, 1, 0] + sig_resample[:, 1, 1]  # channel 1 (right)
mixed = np.stack([mix1, mix2], axis=1)

# ### Run the source separation ###

# Analysis window
win_a = pra.hamming(FFT_LENGTH)

# Synthesis window, precomputed from the analysis window
win_s = pra.transform.compute_synthesis_window(win_a, HOP_LENGTH)

# Compute the spectrograms with the short-time Fourier transform (STFT)
X = pra.transform.analysis(mixed, FFT_LENGTH, HOP_LENGTH, win=win_a)

# Apply ILRMA
Y = pra.bss.ilrma(X, n_src=N_SOURCES, n_iter=N_ITER, n_components=N_BASES)

# Back to the time domain with the inverse STFT
y = pra.transform.synthesis(Y, FFT_LENGTH, HOP_LENGTH, win=win_s)

# ### Display the spectrograms ###

# The sources before separation
def convergence_callback(
    Y,
    source_model,
    X,
    n_targets,
    SDR,
    SIR,
    cost_list,
    eval_time,
    ref_sig,
    ref_mic,
    stft_params,
    algo_name,
    algo_is_overdetermined,
):
    global id_wav

    # we will keep track of how long this routine takes
    t_in = time.perf_counter()

    # Compute the current value of the IVA cost function
    cost_list.append(bss.cost_iva(X, Y, model=source_model))

    # prepare STFT parameters
    framesize = stft_params["framesize"]
    hop = stft_params["hop"]
    if stft_params["window"] == "hamming":
        win_a = pra.hamming(framesize)
    else:  # default is Hann
        win_a = pra.hann(framesize)
    win_s = pra.transform.compute_synthesis_window(win_a, hop)

    # projection back
    Y = bss.project_back(Y, X[:, :, ref_mic])

    if Y.shape[2] == 1:
        y = pra.transform.synthesis(Y[:, :, 0], framesize, hop, win=win_s)[:, None]
    else:
        y = pra.transform.synthesis(Y, framesize, hop, win=win_s)
    y = y[framesize - hop:, :].astype(np.float64)

    if not algo_is_overdetermined:
        new_ord = np.argsort(np.std(y, axis=0))[::-1]
        y = y[:, new_ord]

    m = np.minimum(y.shape[0], ref_sig.shape[1])

    synth = np.zeros_like(ref_sig)

    # in the overdetermined case, we also take into account the background for SIR computation
    synth[:n_targets, :m] = y[:m, :n_targets].T
    if synth.shape[0] > y.shape[1]:
        # here we copy the first source to fill the channel of the background
        synth[n_targets, :m] = y[:m, 0]

    if ref_sig.shape[0] > n_targets and np.sum(np.abs(ref_sig[n_targets, :])) < 1e-10:
        sdr, sir, sar, perm = si_bss_eval(ref_sig[:n_targets, :m].T, synth[:-1, :m].T)
    else:
        sdr, sir, sar, perm = si_bss_eval(ref_sig[:, :m].T, synth[:, :m].T)

    SDR.append(sdr[:n_targets].tolist())
    SIR.append(sir[:n_targets].tolist())

    t_out = time.perf_counter()
    eval_time.append(t_out - t_in)
def process(args, config):

    n_channels, room_id, bss_algo = args

    # the name of the algorithm we'll use for bss
    bss_algo_name = config["bss_algorithms"][bss_algo]["name"]

    if mkl_available:
        mkl.set_num_threads_local(1)

    ref_mic = config["ref_mic"]

    metadata_fn = Path(config["metadata_fn"])
    dataset_dir = metadata_fn.parent
    with open(config["metadata_fn"], "r") as f:
        metadata = json.load(f)

    rooms = metadata[f"{n_channels}_channels"]

    # the mixtures
    fn_mix = dataset_dir / rooms[room_id]["mix_filename"]
    fs, mix = load_audio(fn_mix)

    # add the noise
    sigma_src = np.std(mix)
    sigma_n = sigma_src * 10 ** (-config["snr"] / 20)
    mix += np.random.randn(*mix.shape) * sigma_n

    # the reference
    if bss_algo_name in dereverb_algos:
        # for dereverberation algorithms we use the anechoic reference signal
        fn_ref = dataset_dir / rooms[room_id]["anechoic_filenames"][config["ref_mic"]]
    else:
        fn_ref = dataset_dir / rooms[room_id]["src_filenames"][config["ref_mic"]]
    fs, ref = load_audio(fn_ref)

    # STFT parameters
    nfft = config["stft"]["nfft"]
    hop = config["stft"]["hop"]
    if config["stft"]["window"] == "hamming":
        win_a = pra.hamming(nfft)
        win_s = pra.transform.stft.compute_synthesis_window(win_a, hop)
    else:
        raise ValueError("Window not implemented")

    # STFT
    X = stft.analysis(mix, nfft, hop, win=win_a)

    # Separation
    bss_kwargs = config["bss_algorithms"][bss_algo]["kwargs"]
    n_iter_p_ch = config["bss_algorithms"][bss_algo]["n_iter_per_channel"]

    runtime_bss = time.perf_counter()
    if bss_algo_name == "fastmnmf":
        Y = bss_algorithms[bss_algo_name](
            X, n_iter=n_iter_p_ch * n_channels, **bss_kwargs
        )
    elif bss_algo_name in dereverb_algos:
        Y, Y_pb, runtime_pb = bss_algorithms[bss_algo_name](
            X, n_iter=n_iter_p_ch * n_channels, proj_back_both=True, **bss_kwargs
        )
        # adjust the start time to remove the projection back
        runtime_bss += runtime_pb
    else:
        Y = bss_algorithms[bss_algo_name](
            X, n_iter=n_iter_p_ch * n_channels, proj_back=False, **bss_kwargs
        )
    runtime_bss = time.perf_counter() - runtime_bss

    results = [
        {
            "bss_runtime": {
                "bss_algo": bss_algo,
                "runtime": runtime_bss,
            }
        }
    ]

    t = {
        "room_id": room_id,
        "n_channels": n_channels,
        "bss_algo": bss_algo,
        "proj_algo": None,
        "runtime": 0.0,
        "sdr": None,
        "sir": None,
        "p": None,
        "q": None,
        "n_iter": 1,
    }

    # Evaluation of the raw separated signal
    t["proj_algo"] = "None"
    y, sdr, sir, _ = reconstruct_evaluate(
        ref, Y, nfft, hop, win=win_s, si_metric=config["si_metric"]
    )
    t["sdr"], t["sir"] = sdr.tolist(), sir.tolist()
    results.append(t.copy())

    # projection back
    t["proj_algo"] = "projection_back"
    if bss_algo_name in dereverb_algos:
        Z = Y_pb
    else:
        runtime_pb = time.perf_counter()
        Z = bss_scale.projection_back(Y, X[:, :, ref_mic])
        runtime_pb = time.perf_counter() - runtime_pb
    y, sdr, sir, _ = reconstruct_evaluate(
        ref, Z, nfft, hop, win=win_s, si_metric=config["si_metric"]
    )
    t["sdr"], t["sir"] = sdr.tolist(), sir.tolist()
    t["runtime"] = runtime_pb
    results.append(t.copy())

    # minimum distortion
    lo, hi, n_steps = config["minimum_distortion"]["p_list"]
    p_vals = np.linspace(lo, hi, n_steps)
    kwargs = config["minimum_distortion"]["kwargs"]
    for ip, p in enumerate(p_vals):
        for q in p_vals[ip:]:
            t["p"], t["q"], t["proj_algo"] = (
                f"{p:.1f}",
                f"{q:.1f}",
                "minimum_distortion",
            )

            runtime_md = time.perf_counter()
            Z, t["n_iter"] = bss_scale.minimum_distortion(
                Y, X[:, :, ref_mic], p=p, q=q, **kwargs
            )
            runtime_md = time.perf_counter() - runtime_md

            y, sdr, sir, _ = reconstruct_evaluate(
                ref, Z, nfft, hop, win=win_s, si_metric=config["si_metric"]
            )
            t["sdr"] = sdr.tolist()
            t["sir"] = sir.tolist()
            t["runtime"] = runtime_md
            results.append(t.copy())

    return results
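# Hedged usage sketch (not part of the original pipeline): run `process` for a
# single (n_channels, room_id, bss_algo) combination and print the mean SDR of
# each scale-fixing method. The argument values are illustrative assumptions;
# the bss_algo key must exist in `config["bss_algorithms"]`.
def _tabulate_results_sketch(config, args=(2, 0, "auxiva")):
    results = process(args, config)
    for r in results:
        if "proj_algo" in r and r["sdr"] is not None:
            print(r["proj_algo"], r["p"], r["q"], np.mean(r["sdr"]))
    return results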