def convergence_callback(Y, **kwargs):
    """
    Evaluate the current separation result and record SDR/SIR.

    The STFT-domain estimate ``Y`` is rescaled by projection back onto the
    first microphone of ``X_mics``, resynthesized to the time domain, and
    scored with ``mir_eval.separation.bss_eval_sources`` against the
    module-level ``ref`` signals. The scores are appended to the module-level
    lists ``SDR`` and ``SIR``; the wall-clock time spent in this function is
    appended to ``eval_time`` so it can be discounted from the processing time.

    Parameters
    ----------
    Y: ndarray (nframes, nfrequencies, nsources)
        Current STFT-domain source estimates
    **kwargs:
        Accepted and ignored, so that algorithms passing extra keyword
        arguments to their callback can use this function
    """
    global SDR, SIR, ref

    tic_eval = time.perf_counter()

    from mir_eval.separation import bss_eval_sources

    # fix the scale of the estimates with respect to the first microphone
    scale = projection_back(Y, X_mics[:, :, 0])
    Y = Y.copy() * np.conj(scale[None, :, :])

    # back to the time domain; a single source needs an explicit channel axis
    single_source = Y.shape[2] == 1
    if single_source:
        signal = pra.transform.synthesis(
            Y[:, :, 0], framesize, hop, win=win_s
        )[:, None]
    else:
        signal = pra.transform.synthesis(Y, framesize, hop, win=win_s)
    signal = signal[framesize - hop :, :].astype(np.float64)

    # except for blinkiva, order the channels by decreasing standard deviation
    if args.algo != "blinkiva":
        by_decreasing_std = np.argsort(np.std(signal, axis=0))[::-1]
        signal = signal[:, by_decreasing_std]

    # only the first output channel is evaluated (duplicated to two rows)
    n_samples = np.minimum(signal.shape[0], ref.shape[1])
    sdr, sir, sar, perm = bss_eval_sources(
        ref[:, :n_samples], signal[:n_samples, [0, 0]].T
    )
    SDR.append(sdr)
    SIR.append(sir)

    eval_time.append(time.perf_counter() - tic_eval)
def blinkiva(X, U, n_src=None, sparse_reg=0., n_iter=20, n_nmf_sub_iter=4, proj_back=True, W0=None, R0=None, seed=None, print_cost=False, return_filters=False, callback=None): ''' Implementation of BlinkIVA algorithm for BSS presented by Robin Scheibler, Nobutaka Ono at ICASSP 2019, title to be determined Parameters ---------- X: ndarray (nframes, nfrequencies, nchannels) STFT representation of the signal U: ndarray (nframes, nblinkies) The matrix containing the blinky signals n_src: int, optional The number of sources or independent components n_iter: int, optional The number of iterations (default 20) proj_back: bool, optional Scaling on first mic by back projection (default True) W0: ndarray (nfrequencies, nchannels, nchannels), optional Initial value for demixing matrix return_filters: bool If true, the function will return the demixing matrix too callback: func A callback function called every 10 iterations, allows to monitor convergence Returns ------- Returns an (nframes, nfrequencies, nsources) array. Also returns the demixing matrix (nfrequencies, nchannels, nsources) if ``return_values`` keyword is True. 
''' n_frames, n_freq, n_chan = X.shape _, n_blink = U.shape assert n_frames == _, 'Microphones and blinkies do not have the same number of frames ({} != {})'.format( n_frames, _) # default to determined case if n_src is None: n_src = X.shape[2] # initialize the demixing matrices if W0 is None: W = np.array([np.eye(n_chan, n_chan) for f in range(n_freq)], dtype=X.dtype) else: W = W0.copy() # we will threshold entries of R and G that get to small machine_epsilon = np.finfo(float).eps # U normalized by number of frequencies is used U_mean = U / n_freq / 2 I = np.eye(n_chan, n_chan) Y = np.zeros((n_frames, n_freq, n_chan), dtype=X.dtype) V = np.zeros((n_freq, n_chan, n_chan, n_chan), dtype=X.dtype) P = np.zeros((n_frames, n_chan)) if seed is not None: rng_state = np.random.get_state() np.random.seed(seed) # initialize the parts of NMF if R0 is None: R_all = np.ones((n_frames, n_chan)) R = R_all[:, :n_src] # subset tied to NMF of blinkies R[:, :] = 0.1 + 0.9 * np.random.rand(n_frames, n_src) R *= np.mean(np.abs(X[:, :, :n_src])**2, axis=(0, 1)) / R.sum(axis=0) else: R_all = R0.copy() R = R_all[:, :n_src] # subset tied to NMF of blinkies G = 0.1 + 0.9 * np.random.rand(n_src, n_blink) G *= np.mean(U_mean) / np.mean(G) if seed is not None: np.random.set_state(rng_state) # Compute the demixed output def demix(Y, X, W, P, R_all): for f in range(n_freq): Y[:, f, :] = np.dot(X[:, f, :], np.conj(W[f, :, :])) # The sources magnitudes averaged over frequencies # shape: (n_frames, n_src) P[:, :] = np.linalg.norm(Y, axis=1) # copy activations for sources not tied to NMF R_all[:, n_src:] = (P[:, n_src:] / n_freq)**2 def R_update(P, U, R, G): # Update the activations U_hat = np.dot(R, G) U_hat_I = 1. / U_hat R_I = 1. / R num = (0.5 * (P[:, :n_src] / n_freq) * R_I**1.5 + np.dot(U_mean * U_hat_I**2, G.T)) denom = (0.5 * R_I + np.dot(U_hat_I, G.T) + sparse_reg) R *= np.sqrt(num / denom) R[R < machine_epsilon] = machine_epsilon def G_update(U, R, G): U_hat = np.dot(R, G) U_hat_I = 1. 
/ U_hat num = np.dot(R.T, U_mean * U_hat_I**2) denom = np.dot(R.T, U_hat_I) G *= np.sqrt(num / denom) G[G < machine_epsilon] = machine_epsilon def cost_function(Y, U, R, G, W): U_hat = np.dot(R[:, :G.shape[0]], G) cf = -Y.shape[0] * np.sum(np.linalg.slogdet(W)[1]) cf += np.sum(0.5 * Y.shape[1] * np.log(R) + np.linalg.norm(Y, axis=1) / R**0.5) cf += np.sum(Y.shape[1] * np.log(U_hat) + U / U_hat / 2) return cf cost_func_list = [] # initial demixing demix(Y, X, W, P, R_all) # NMF + IVA for epoch in range(n_iter): if callback is not None and epoch % 10 == 0: if proj_back: z = projection_back(Y, X[:, :, 0]) callback(Y * np.conj(z[None, :, :])) else: callback(Y, extra=[W, G, R, X, U]) if print_cost and epoch % 5 == 0: cost_func_list.append(cost_function(Y, U, R_all, G, W)) print('epoch:', epoch, 'cost function ==', cost_func_list[-1]) # several subiteration of NMF for sub_epoch in range(n_nmf_sub_iter): # Update the activations R_update(P, U, R, G) # Update the gains G_update(U, R, G) # Rescale all variables before continuing lmb = 1. / np.mean(R_all, axis=0) R_all *= lmb[None, :] W *= np.sqrt(lmb[None, None, :]) P *= np.sqrt(lmb[None, :]) G /= lmb[:n_src, None] # Compute Auxiliary Variable # shape: (n_freq, n_src, n_mic, n_mic) denom = 4 * P * R_all**0.5 # / n_freq # when I add this, separation improves a lot! denom[denom < 1.] = 1. V = np.mean( (X[:, :, None, :, None] / denom[:, None, :, None, None]) * np.conj(X[:, :, None, None, :]), axis=0, ) # Update the demixing matrices for s in range(n_chan): W_H = np.conj(np.swapaxes(W, 1, 2)) WV = np.matmul(W_H, V[:, s, :, :]) rhs = I[None, :, s][[0] * WV.shape[0], :] W[:, :, s] = np.linalg.solve(WV, rhs) P1 = np.conj(W[:, :, s]) P2 = np.sum(V[:, s, :, :] * W[:, None, :, s], axis=-1) W[:, :, s] /= np.sqrt(np.sum(P1 * P2, axis=1))[:, None] demix(Y, X, W, P, R_all) # Rescale all variables before continuing lmb = 1. 
/ np.mean(R_all, axis=0) R_all *= lmb[None, :] W *= np.sqrt(lmb[None, None, :]) P *= np.sqrt(lmb[None, :]) G /= lmb[:n_src, None] if proj_back: z = projection_back(Y, X[:, :, 0]) Y *= np.conj(z[None, :, :]) if return_filters: return Y, W, G, R_all else: return Y
X_mics, n_iter=n_iter, proj_back=False, model=args.dist, init_eig=(args.init == init_choices[1]), callback=convergence_callback, callback_checkpoints=callback_checkpoints, ) else: raise ValueError("No such algorithm {}".format(args.algo)) # Last evaluation of SDR/SIR convergence_callback(Y) # projection back z = projection_back(Y, X_mics[:, :, 0]) Y *= np.conj(z[None, :, :]) toc = time.perf_counter() tot_eval_time = sum(eval_time) print("Processing time: {:8.3f} s".format(toc - tic - tot_eval_time)) print("Evaluation time: {:8.3f} s".format(tot_eval_time)) # Run iSTFT if Y.shape[2] == 1: y = pra.transform.synthesis(Y[:, :, 0], framesize, hop, win=win_s)[:, None] else: y = pra.transform.synthesis(Y, framesize, hop, win=win_s) y = y[framesize - hop :, :].astype(np.float64)
def auxiva_gauss(X, n_src=None, n_iter=20, proj_back=True, W0=None,
        f_contrast=None, f_contrast_args=[],
        return_filters=False, callback=None):
    '''
    AuxIVA blind source separation with a time-varying Gauss source model.

    The algorithm follows N. Ono, *Stable and fast update rules for
    independent vector analysis based on auxiliary function technique*,
    Proc. IEEE, WASPAA, pp. 189-192, September, 2011.

    Parameters
    ----------
    X: ndarray (nframes, nfrequencies, nchannels)
        STFT representation of the signal
    n_src: int, optional
        The number of sources. Only the determined case is supported, i.e.
        it must equal the number of channels (the default).
    n_iter: int, optional
        The number of iterations (default 20)
    proj_back: bool, optional
        Scaling on first mic by back projection (default True)
    W0: ndarray (nfrequencies, nchannels, nchannels), optional
        Initial value for demixing matrix
    f_contrast: dict of functions
        A dictionary with two elements 'f' and 'df' containing the contrast
        function. When None, ``f_contrasts['norm']`` is looked up. The
        weights of this Gauss variant are computed directly from the
        source power and do not call the contrast function.
    f_contrast_args: list
        Arguments of the contrast function
    return_filters: bool
        If true, the function will return the demixing matrix too
    callback: func
        A callback function called every 10 iterations, allows to monitor
        convergence

    Returns
    -------
    Returns an (nframes, nfrequencies, nsources) array. Also returns
    the demixing matrix (nfrequencies, nchannels, nsources)
    if ``return_filters`` keyword is True.
    '''

    n_frames, n_freq, n_chan = X.shape

    # the determined case is the default, and the only one supported for now
    if n_src is None:
        n_src = X.shape[2]
    assert n_chan == n_src

    # demixing matrices start from the identity unless provided
    if W0 is not None:
        W = W0.copy()
    else:
        W = np.array([np.eye(n_chan, n_src)] * n_freq, dtype=X.dtype)

    if f_contrast is None:
        f_contrast = f_contrasts['norm']
        f_contrast_args = [1, 1]

    identity = np.eye(n_src, n_src)
    Y = np.zeros((n_frames, n_freq, n_src), dtype=X.dtype)
    V = np.zeros((n_freq, n_src, n_chan, n_chan), dtype=X.dtype)
    r = np.zeros((n_frames, n_src))
    G_r = np.zeros((n_frames, n_src))

    def demix(Y, X, W):
        # apply the demixing matrix of every frequency bin
        for f in range(n_freq):
            Y[:, f, :] = np.dot(X[:, f, :], np.conj(W[f, :, :]))

    for epoch in range(n_iter):

        demix(Y, X, W)

        report_now = callback is not None and epoch % 10 == 0
        if report_now:
            if not proj_back:
                callback(Y)
            else:
                z = projection_back(Y, X[:, :, 0])
                callback(Y * np.conj(z[None, :, :]))

        # power of the sources averaged over frequencies
        # shape: (n_frames, n_src)
        r[:, :] = np.mean(np.abs(Y * np.conj(Y)), axis=1)

        # weights of the weighted covariance matrices
        # shape: (n_frames, n_src)
        G_r[:, :] = 1. / r / 2.

        # weighted covariance matrices (auxiliary variable)
        for f in range(n_freq):
            X_f = X[:, f, :]
            for s in range(n_src):
                V[f, s, :, :] = np.dot(G_r[None, :, s] * X_f.T, np.conj(X_f)) / X.shape[0]

        # update of the demixing vectors, one source at a time
        for f in range(n_freq):
            for s in range(n_src):
                V_fs = V[f, s, :, :]
                WV = np.dot(np.conj(W[f, :, :].T), V_fs)
                W[f, :, s] = np.linalg.solve(WV, identity[:, s])

                # in-place normalization through a view of the column
                w_s = W[f, :, s]
                w_s /= np.sqrt(np.inner(np.conj(w_s), np.dot(V_fs, w_s)))

    demix(Y, X, W)

    if proj_back:
        z = projection_back(Y, X[:, :, 0])
        Y *= np.conj(z[None, :, :])

    if not return_filters:
        return Y
    return Y, W
def ogive(
    X,
    n_iter=4000,
    step_size=0.1,
    tol=1e-3,
    update="demix",
    proj_back=True,
    W0=None,
    model="laplace",
    init_eig=False,
    return_filters=False,
    callback=None,
    callback_checkpoints=[],
):
    """
    Implementation of Orthogonally constrained Independent Vector Extraction
    (OGIVE) described in

    Z. Koldovský and P. Tichavský, “Gradient Algorithms for Complex
    Non-Gaussian Independent Component/Vector Extraction, Question of Convergence,”
    IEEE Trans. Signal Process., pp. 1050–1064, Dec. 2018.

    A single source is extracted.

    Parameters
    ----------
    X: ndarray (nframes, nfrequencies, nchannels)
        STFT representation of the signal
    n_iter: int, optional
        The maximum number of iterations (default 4000)
    step_size: float
        The step size of the gradient ascent
    tol: float
        Stop when the gradient is smaller than this number
    update: str
        Selects update of the mixing or demixing matrix, or a switching scheme,
        possible values: "mix", "demix", "switching"
    proj_back: bool, optional
        Scaling on first mic by back projection (default True)
    W0: ndarray (nfrequencies, nchannels, 1), optional
        Initial value for the demixing vectors
    model: str
        The model of source distribution 'gauss' or 'laplace' (default)
    init_eig: bool, optional (default ``False``)
        If ``True``, and if ``W0 is None``, then the weights are initialized
        using the principal eigenvectors of the covariance matrix of the input
        data.
    return_filters: bool
        If true, the function will return the demixing vectors too
    callback: func
        A callback function called at the epochs listed in
        ``callback_checkpoints``, allows to monitor convergence
    callback_checkpoints: list of int
        A list of epoch number when the callback should be called

    Returns
    -------
    Returns an (nframes, nfrequencies, 1) array. Also returns
    the demixing vectors (nfrequencies, nchannels, 1)
    if ``return_filters`` keyword is True.
    """

    n_frames, n_freq, n_chan = X.shape
    n_src = 1

    # covariance matrix of input signal (n_freq, n_chan, n_chan)
    Cx = np.mean(X[:, :, :, None] * np.conj(X[:, :, None, :]), axis=0)
    Cx_inv = np.linalg.inv(Cx)
    Cx_norm = np.linalg.norm(Cx, axis=(1, 2))

    w = np.zeros((n_freq, n_chan, 1), dtype=X.dtype)
    a = np.zeros((n_freq, n_chan, 1), dtype=X.dtype)
    delta = np.zeros((n_freq, n_chan, 1), dtype=X.dtype)
    lambda_a = np.zeros((n_freq, 1, 1), dtype=np.float64)

    def tensor_H(T):
        return np.conj(T).swapaxes(1, 2)

    # initialize A and W
    if W0 is None:
        if init_eig:
            # Initialize the demixing vectors with the principal eigenvector
            # of the input covariance; the decomposition is only computed
            # when it is actually needed
            eigval, eigvec = np.linalg.eig(Cx)
            for f in range(n_freq):
                ind = np.argmax(eigval[f])
                w[f, :, 0] = eigvec[f, :, ind]
        else:
            # Or with identity
            w[:, 0] = 1.0
    else:
        w[:, :] = W0

    def update_a_from_w(I):
        v_new = Cx[I] @ w[I]
        lambda_w = 1.0 / np.real(tensor_H(w[I]) @ v_new)
        a[I, :, :] = lambda_w * v_new

    def update_w_from_a(I):
        v_new = Cx_inv @ a
        lambda_a[:] = 1.0 / np.real(tensor_H(a) @ v_new)
        w[I, :, :] = lambda_a[I] * v_new[I]

    def switching_criterion():
        # Decides, per frequency, whether the mixing (a) or the demixing (w)
        # vector is updated; the masks I_do_a / I_do_w are modified in-place

        a_n = a / a[:, :1, :1]
        b_n = Cx @ a_n
        lmb = b_n[:, :1, :1].copy()  # copy is important here!
        b_n /= lmb

        p1 = np.linalg.norm(a_n - b_n, axis=(1, 2)) / Cx_norm
        Cbb = (lmb * (b_n @ tensor_H(b_n))
               / np.linalg.norm(b_n, axis=(1, 2), keepdims=True)**2)
        p2 = np.linalg.norm(Cx - Cbb, axis=(1, 2))

        kappa = p1 * p2 / np.sqrt(n_chan)

        thresh = 0.1
        I_do_a[:] = kappa >= thresh
        I_do_w[:] = kappa < thresh

    # Compute the demixed output
    def demix(Y, X, W):
        Y[:, :, :] = X @ np.conj(W)

    # The very first update of a
    # (builtin ``bool`` is used: the ``np.bool`` alias was removed from NumPy)
    update_a_from_w(np.ones(n_freq, dtype=bool))

    if update == "mix":
        I_do_w = np.zeros(n_freq, dtype=bool)
        I_do_a = np.ones(n_freq, dtype=bool)
    else:  # default is "demix"
        I_do_w = np.ones(n_freq, dtype=bool)
        I_do_a = np.zeros(n_freq, dtype=bool)

    r_inv = np.zeros((n_frames, n_src))
    r = np.zeros((n_frames, n_src))

    # Things are more efficient when the frequencies are over the first axis
    Y = np.zeros((n_freq, n_frames, n_src), dtype=X.dtype)
    X_ref = X  # keep a reference to input signal
    X = X.swapaxes(0, 1).copy()  # more efficient order for processing

    for epoch in range(n_iter):
        # compute the switching criterion
        if update == "switching" and epoch % 10 == 0:
            switching_criterion()

        # Extract the target signal
        demix(Y, X, w)

        # Now run any necessary callback
        if callback is not None and epoch in callback_checkpoints:
            Y_tmp = Y.swapaxes(0, 1).copy()
            if proj_back:
                z = projection_back(Y_tmp, X_ref[:, :, 0])
                callback(Y_tmp * np.conj(z[None, :, :]))
            else:
                callback(Y_tmp)

        # source activity
        # shape: (n_frames, n_src)
        if model == "laplace":
            r[:, :] = np.linalg.norm(Y, axis=0) / np.sqrt(n_freq)
        elif model == "gauss":
            r[:, :] = (np.linalg.norm(Y, axis=0)**2) / n_freq

        # avoid dividing by zero
        eps = 1e-15
        r[r < eps] = eps

        r_inv[:, :] = 1.0 / r

        # Compute the score function
        psi = r_inv[None, :, :] * np.conj(Y)

        # "Nu" in Algo 3 in [1]
        # shape (n_freq, 1, 1)
        zeta = Y.swapaxes(1, 2) @ psi

        x_psi = (X.swapaxes(1, 2) @ psi) / zeta

        # The w-step
        # shape (n_freq, n_chan, 1)
        delta[I_do_w] = a[I_do_w] - x_psi[I_do_w]
        w[I_do_w] += step_size * delta[I_do_w]

        # The a-step
        # shape (n_freq, n_chan, 1)
        delta[I_do_a] = w[I_do_a] - (
            Cx_inv[I_do_a] @ x_psi[I_do_a]) * lambda_a[I_do_a]
        a[I_do_a] += step_size * delta[I_do_a]

        # Apply the orthogonal constraints
        update_a_from_w(I_do_w)
        update_w_from_a(I_do_a)

        max_delta = np.max(np.linalg.norm(delta, axis=(1, 2)))

        if max_delta < tol:
            break

    # Extract target
    demix(Y, X, w)

    Y = Y.swapaxes(0, 1).copy()

    if proj_back:
        z = projection_back(Y, X_ref[:, :, 0])
        Y *= np.conj(z[None, :, :])

    if return_filters:
        return Y, w
    else:
        return Y
def blinkiva_gauss(X, U, n_src=None, n_iter=20, n_nmf_sub_iter=20, proj_back=True,
        W0=None, R0=None, seed=None, epsilon=0.5, sparse_reg=0.,
        print_cost=False, return_filters=False, callback=None):
    '''
    Implementation of BlinkIVA algorithm for blind source separation using jointly
    microphones and sound power sensors "blinkies". The algorithm was presented in

    R. Scheibler and N. Ono, *Multi-modal Blind Source Separation with Microphones and Blinkies,*
    Proc. IEEE ICASSP, Brighton, UK, May, 2019.  DOI: 10.1109/ICASSP.2019.8682594
    https://arxiv.org/abs/1904.02334

    Parameters
    ----------
    X: ndarray (nframes, nfrequencies, nchannels)
        STFT representation of the signal
    U: ndarray (nframes, nblinkies)
        The matrix containing the blinky signals
    n_src: int, optional
        The number of sources or independent components
    n_iter: int, optional
        The number of iterations (default 20)
    n_nmf_sub_iter: int, optional
        The number of NMF iteration to run between two updates of the demixing
        matrices (default 20)
    proj_back: bool, optional
        Scaling on first mic by back projection (default True)
    W0: ndarray (nfrequencies, nchannels, nchannels), optional
        Initial value for demixing matrix
    R0: ndarray (nframes, nsrc), optional
        Initial value of the activations
    seed: int, optional
        A seed to make deterministic the random initialization of NMF parts,
        when None (default), the random number generator is used in its current state.
        The state of the global generator is restored after the initialization.
    epsilon: float, optional
        A regularization value to prevent too large values after the division
    sparse_reg: float
        A regularization term to make the activation matrix sparse
    print_cost: bool, optional
        Print the value of the cost function every 10 iterations
    return_filters: bool, optional
        If true, the function will return the demixing matrix, gains, and activations too
    callback: func
        A callback function called every 10 iterations, allows to monitor convergence.
        When ``proj_back`` is False, it is also passed the keyword argument
        ``extra=[W, G, R, X, U]``.

    Returns
    -------
    Returns an (nframes, nfrequencies, nchannels) array. Also returns
    the demixing matrix (nfrequencies, nchannels, nchannels),
    gains (nsrc, nblinkies), and activations (nframes, nchannels)
    if ``return_filters`` keyword is True.
    '''

    n_frames, n_freq, n_chan = X.shape
    n_frames_u, n_blink = U.shape

    if n_frames_u != n_frames:
        raise ValueError('The microphones and blinky signals should have the same number of frames')

    # default to determined case
    if n_src is None:
        n_src = X.shape[2]

    # initialize the demixing matrices
    if W0 is None:
        W = np.array([np.eye(n_chan, n_chan) for f in range(n_freq)], dtype=X.dtype)
    else:
        W = W0.copy()

    # we will threshold entries of R and G that get to small
    machine_epsilon = np.finfo(float).eps

    # we will only work with the blinky signal normalized by frequency
    U_mean = U / n_freq / 2.

    # One identity matrix per frequency. Its columns are used as explicit
    # (n_chan, 1) right-hand sides: NumPy >= 2.0 interprets a 2-D ``b`` in
    # ``np.linalg.solve`` as a matrix, not as a stack of vectors.
    eyes = np.tile(np.eye(n_chan, n_chan), (n_freq, 1, 1))

    Y = np.zeros((n_frames, n_freq, n_chan), dtype=X.dtype)
    P = np.zeros((n_frames, n_chan))

    # initialize the parts of NMF
    R_all = np.ones((n_frames, n_chan))
    R = R_all[:, :n_src]  # subset tied to NMF of blinkies

    if seed is not None:
        rng_state = np.random.get_state()
        np.random.seed(seed)

    if R0 is None:
        R[:, :] = 0.1 + 0.9 * np.random.rand(n_frames, n_src)
        R /= np.mean(R, axis=0, keepdims=True)
    else:
        R[:, :] = R0

    G = 0.1 + 0.9 * np.random.rand(n_src, n_blink)
    U_hat = np.dot(R, G)
    G *= np.sum(U_mean, axis=0, keepdims=True) / np.sum(U_hat, axis=0, keepdims=True)

    if seed is not None:
        np.random.set_state(rng_state)

    def cost(Y, W, R, G):
        pwr = np.linalg.norm(Y, axis=1) ** 2
        cf1 = -2 * Y.shape[0] * np.sum(np.linalg.slogdet(W)[1])
        cf2 = np.sum(Y.shape[1] * np.log(R) + pwr / R)
        U_hat = np.dot(R[:, :n_src], G)
        cf3 = np.sum(np.log(U_hat) + U / U_hat / 2)
        return {'iva': cf1 + cf2, 'nmf': cf2 + cf3, 'blinkiva': cf1 + cf2 + cf3}

    def rescale(W, P, R, G):
        # Fix the scale of all variables: every column of R gets unit mean
        lmb = 1. / np.mean(R, axis=0)
        R *= lmb[None, :]
        P *= lmb[None, :]
        W *= np.sqrt(lmb[None, None, :])
        G /= lmb[:G.shape[0], None]

    # Compute the demixed output
    def demix(Y, X, W, P, R_all):
        for f in range(n_freq):
            Y[:, f, :] = np.dot(X[:, f, :], np.conj(W[f, :, :]))

        # The sources power averaged over frequencies
        # shape: (n_frames, n_chan)
        P[:, :] = np.linalg.norm(Y, axis=1) ** 2 / n_freq

        # copy activations for sources not tied to NMF
        R_all[:, n_src:] = P[:, n_src:]

    # initial demixing
    demix(Y, X, W, P, R_all)

    # NMF + IVA joint updates
    for epoch in range(n_iter):

        if callback is not None and epoch % 10 == 0:
            if proj_back:
                z = projection_back(Y, X[:, :, 0])
                callback(Y * np.conj(z[None, :, :]))
            else:
                callback(Y, extra=[W, G, R, X, U])

        if print_cost and epoch % 10 == 0:
            print('Cost function: iva={iva:13.0f} nmf={nmf:13.0f} iva+nmf={blinkiva:13.0f}'.format(
                **cost(Y, W, R_all, G)))

        for sub_epoch in range(n_nmf_sub_iter):

            # Update the activations
            U_hat = np.dot(R, G)
            U_hat_I = 1. / U_hat
            R_I = 1. / R
            R *= np.sqrt(
                    (P[:, :n_src] * R_I ** 2 + np.dot(U_mean * U_hat_I ** 2, G.T))
                    / (R_I + np.dot(U_hat_I, G.T) + sparse_reg)
                    )
            R[R < machine_epsilon] = machine_epsilon

            # Update the gains
            U_hat = np.dot(R, G)
            U_hat_I = 1. / U_hat
            G *= np.sqrt(np.dot(R.T, U_mean * U_hat_I ** 2) / np.dot(R.T, U_hat_I))
            G[G < machine_epsilon] = machine_epsilon

        # Compute Auxiliary Variable
        # shape: (n_freq, n_chan, n_mic, n_mic)
        denom = 2 * R_all
        denom[denom < epsilon] = epsilon  # regularize this part

        # 1) promote all arrays to (n_frames, n_freq, n_chan, n_chan, n_chan)
        # 2) take the outer product (complex) of the last two dimensions
        #    to get the covariance matrix over the microphones
        # 3) average over the time frames (index 0)
        V = np.mean(
                (X[:, :, None, :, None] / denom[:, None, :, None, None])
                * np.conj(X[:, :, None, None, :]),
                axis=0,
                )

        # Update now the demixing matrix
        for s in range(n_chan):
            # W changes at every step, so its hermitian is recomputed
            W_H = np.conj(np.swapaxes(W, 1, 2))
            WV = np.matmul(W_H, V[:, s, :, :])
            W[:, :, s] = np.linalg.solve(WV, eyes[:, :, s:s + 1])[:, :, 0]

            # normalize so that w_s^H V_s w_s = 1
            P1 = np.conj(W[:, :, s])
            P2 = np.sum(V[:, s, :, :] * W[:, None, :, s], axis=-1)
            W[:, :, s] /= np.sqrt(np.sum(P1 * P2, axis=1))[:, None]

        demix(Y, X, W, P, R_all)

        # Rescale all variables before continuing
        rescale(W, P, R_all, G)

    if proj_back:
        z = projection_back(Y, X[:, :, 0])
        Y *= np.conj(z[None, :, :])

    if return_filters:
        return Y, W, G, R_all
    else:
        return Y
def overiva(
    X,
    n_src=None,
    n_iter=20,
    proj_back=True,
    W0=None,
    model="laplace",
    init_eig=False,
    return_filters=False,
    callback=None,
):
    """
    Implementation of overdetermined IVA algorithm for BSS. See
    the following publication for a detailed description of the algorithm.

    R. Scheibler and N. Ono, Independent Vector Analysis with more Microphones than Sources, arXiv, 2019.
    https://arxiv.org/abs/1905.07880

    Parameters
    ----------
    X: ndarray (nframes, nfrequencies, nchannels)
        STFT representation of the signal
    n_src: int, optional
        The number of sources or independent components. When
        ``n_src==nchannels``, the algorithms is identical to AuxIVA. When
        ``n_src==1``, then it is doing independent vector extraction.
    n_iter: int, optional
        The number of iterations (default 20)
    proj_back: bool, optional
        Scaling on first mic by back projection (default True)
    W0: ndarray (nfrequencies, nchannels, nsrc), optional
        Initial value for demixing matrix
    model: str
        The model of source distribution 'gauss' or 'laplace' (default)
    init_eig: bool, optional (default ``False``)
        If ``True``, and if ``W0 is None``, then the weights are initialized
        using the principal eigenvectors of the covariance matrix of the input
        data.
    return_filters: bool
        If true, the function will return the demixing matrix too
    callback: func
        A callback function called every 10 iterations, allows to monitor
        convergence

    Returns
    -------
    Returns an (nframes, nfrequencies, nsources) array. Also returns
    the demixing matrix (nfrequencies, nchannels, nsources)
    if ``return_filters`` keyword is True.
    """

    n_frames, n_freq, n_chan = X.shape

    # default to determined case
    if n_src is None:
        n_src = n_chan

    # covariance matrix of input signal (n_freq, n_chan, n_chan)
    Cx = np.mean(X[:, :, :, None] * np.conj(X[:, :, None, :]), axis=0)

    # W and J are views into the full square matrix W_hat, so that
    # in-place updates of either one are reflected in W_hat
    W_hat = np.zeros((n_freq, n_chan, n_chan), dtype=X.dtype)
    W = W_hat[:, :, :n_src]
    J = W_hat[:, :n_src, n_src:]

    def tensor_H(T):
        return np.conj(T).swapaxes(1, 2)

    def update_J_from_orth_const():
        tmp = np.matmul(tensor_H(W), Cx)
        J[:, :, :] = np.linalg.solve(tmp[:, :, :n_src], tmp[:, :, n_src:])

    # initialize A and W
    if W0 is None:

        if init_eig:
            # Initialize the demixing matrices with the principal
            # eigenvectors of the input covariance
            v, w = np.linalg.eig(Cx)
            for f in range(n_freq):
                ind = np.argsort(v[f])[-n_src:]
                W[f, :, :] = np.conj(w[f][:, ind])

        else:
            # Or with identity
            for f in range(n_freq):
                W[f, :n_src, :] = np.eye(n_src)

    else:
        W[:, :, :] = W0

    # We still need to initialize the rest of the matrix
    if n_src < n_chan:
        update_J_from_orth_const()
        for f in range(n_freq):
            W_hat[f, n_src:, n_src:] = -np.eye(n_chan - n_src)

    eyes = np.tile(np.eye(n_chan, n_chan), (n_freq, 1, 1))
    V = np.zeros((n_freq, n_chan, n_chan), dtype=X.dtype)
    r_inv = np.zeros((n_frames, n_src))
    r = np.zeros((n_frames, n_src))

    # Things are more efficient when the frequencies are over the first axis
    Y = np.zeros((n_freq, n_frames, n_src), dtype=X.dtype)
    X = X.swapaxes(0, 1).copy()

    # Compute the demixed output
    def demix(Y, X, W):
        Y[:, :, :] = X @ np.conj(W)

    for epoch in range(n_iter):

        demix(Y, X, W)

        if callback is not None and epoch % 10 == 0:
            Y_tmp = Y.swapaxes(0, 1)
            if proj_back:
                z = projection_back(Y_tmp, X[:, :, 0].swapaxes(0, 1))
                callback(Y_tmp * np.conj(z[None, :, :]))
            else:
                callback(Y_tmp)

        # source activity
        # shape: (n_frames, n_src)
        if model == 'laplace':
            r[:, :] = (2. * np.linalg.norm(Y, axis=0))
        elif model == 'gauss':
            r[:, :] = (np.linalg.norm(Y, axis=0)**2) / n_freq

        # set the scale of r
        gamma = r.mean(axis=0)
        r /= gamma[None, :]

        if model == 'laplace':
            Y /= gamma[None, None, :]
            W /= gamma[None, None, :]
        elif model == 'gauss':
            g_sq = np.sqrt(gamma[None, None, :])
            Y /= g_sq
            W /= g_sq

        # ensure some numerical stability
        eps = 1e-15
        r[r < eps] = eps

        r_inv[:, :] = 1. / r

        # Update now the demixing matrix
        for s in range(n_src):
            # Compute Auxiliary Variable
            # shape: (n_freq, n_chan, n_chan)
            V[:, :, :] = (X.swapaxes(1, 2) * r_inv[None, None, :, s]) @ np.conj(X) / n_frames

            WV = np.conj(W_hat).swapaxes(1, 2) @ V
            # The right-hand side is passed as an explicit (n_chan, 1) column:
            # NumPy >= 2.0 interprets a 2-D ``b`` as a matrix, not as a
            # stack of vectors.
            W[:, :, s] = np.linalg.solve(WV, eyes[:, :, s:s + 1])[:, :, 0]

            # normalize
            denom = np.conj(W[:, None, :, s]) @ V[:, :, :] @ W[:, :, None, s]
            W[:, :, s] /= np.sqrt(denom[:, :, 0])

            # Update the mixing matrix according to orthogonal constraints
            if n_src < n_chan:
                update_J_from_orth_const()

    demix(Y, X, W)

    Y = Y.swapaxes(0, 1).copy()
    X = X.swapaxes(0, 1)

    if proj_back:
        z = projection_back(Y, X[:, :, 0])
        Y *= np.conj(z[None, :, :])

    if return_filters:
        return Y, W
    else:
        return Y