def compute_mask(mixture, targets_list, mask_type):
    """
    Compute one time-frequency mask per target source.

    Arguments:
        mixture: STFT of the mixture signal (complex result)
        targets_list: python list with the STFT of each target
            signal (complex result)
        mask_type: one of ["irm", "ibm", "iam", "psm"]; any other value
            takes the same path as "iam"
    Return:
        masks_list: python list holding one mask per entry of targets_list
    """
    if mask_type == "ibm":
        # A bin is assigned to the target whose magnitude is largest there.
        magnitudes = np.stack([cmat_abs(target) for target in targets_list])
        dominant = np.argmax(magnitudes, 0)
        return [dominant == idx for idx, _ in enumerate(targets_list)]

    if mask_type == "irm":
        # Normalise by the summed magnitudes of all targets.
        norm = sum([cmat_abs(target) for target in targets_list]) + EPSILON
    else:
        # iam/psm: normalise by the magnitude of the mixture.
        norm = cmat_abs(mixture) + EPSILON

    if mask_type == "psm":
        # Scale each magnitude by the cosine of its phase offset
        # from the mixture.
        mix_angle = np.angle(mixture)
        return [
            cmat_abs(target) * np.cos(mix_angle - np.angle(target)) / norm
            for target in targets_list
        ]

    return [cmat_abs(target) / norm for target in targets_list]
def compute_mask(speech, noise_or_mixture, mask):
    """
    Compute a time-frequency mask for the speech signal.

    For signal model: y = x1 + x2
    def f = STFT(x):
        f(y) = f(x1) + f(x2) => |f(y)| = |f(x1) + f(x2)| <= |f(x1)| + |f(x2)|
    for irm:
        1) M(x1) = |f(x1)| / (|f(x1)| + |f(x2)|)            DongYu
        2) M(x1) = |f(x1)| / sqrt(|f(x1)|^2 + |f(x2)|^2)    DeliangWang
        s.t. 1 >= 2) >= 1) >= 0
        (this function implements variant 2)
    for iam(FFT-mask, smm):
        M(x1) = |f(x1)| / |f(y)| = |f(x1)| / |f(x1) + f(x2)| in [0, oo]
    for psm:
        M(x1) = |f(x1)| * cos(delta_phase) / |f(y)|
    for psa:
        numerator of psm only: |f(x1)| * cos(delta_phase)

    Arguments:
        speech: complex STFT of the target speech
        noise_or_mixture: complex STFT of the second signal; presumably the
            noise for ibm/irm and the mixture for iam/psm/psa, following the
            formulas above (confirm against the caller)
        mask: one of "ibm", "irm", "iam", "psm", "psa"; any other value
            takes the same path as "iam"
    Return:
        the mask as an array; the "ibm" mask is returned as float64

    NOTE: the denominators carry no epsilon guard, so bins where the
    denominator is zero will produce inf/nan.
    """
    speech_abs = cmat_abs(speech)

    if mask == "ibm":
        binary_mask = speech_abs > cmat_abs(noise_or_mixture)
        # np.float was removed in NumPy 1.24; it was an alias of the builtin
        # float, i.e. float64, so the resulting dtype is unchanged.
        return binary_mask.astype(np.float64)

    if mask in ("psm", "psa"):
        # Project the speech magnitude onto the phase of the other signal.
        projected = speech_abs * np.cos(
            np.angle(noise_or_mixture) - np.angle(speech))
        if mask == "psa":
            # keep numerator only
            return projected
        return projected / cmat_abs(noise_or_mixture)

    if mask == "irm":
        # denominator = cmat_abs(speech) + cmat_abs(noise_or_mixture)
        denominator = np.sqrt(
            speech_abs**2 + cmat_abs(noise_or_mixture)**2)
    else:
        # iam (and any unrecognised mask type)
        denominator = cmat_abs(noise_or_mixture)
    return speech_abs / denominator
def compute_mask(tgt, mix, mask):
    """
    Compute a time-frequency mask of the target given the mixture.

    for signal model: y = x1 + x2
    def f = STFT(x):
        f(y) = f(x1) + f(x2) => |f(y)| = |f(x1) + f(x2)| < |f(x1)| + |f(x2)|
    for irm:
        1) M(x1) = |f(x1)| / (|f(x1)| + |f(x2)|)            DongYu
        2) M(x1) = |f(x1)| / sqrt(|f(x1)|^2 + |f(x2)|^2)    Deliang Wang
        s.t. 1 >= 2) >= 1) >= 0
    for iam(FFT-mask, smm):
        M(x1) = |f(x1)| / |f(y)| = |f(x1)| / |f(x1) + f(x2)| in [0, oo]
    for psm:
        M(x1) = |f(x1) / f(y)| = |f(x1)| * cos(delta_phase) / |f(y)|
    for crm:
        M(x1) = f(x1) / f(y)

    Arguments:
        tgt: complex STFT of the target speech
        mix: complex STFT of the mixture
        mask: one of "ibm", "irm", "iam", "psm", "psa", "crm"; any other
            value takes the same path as "iam"
    Return:
        the mask array; for "crm" the real and imaginary parts are each
        passed through the tangent helper and joined with np.hstack
    """
    # Magnitudes of target, mixture and interference (mixture minus target).
    tgt_mag = cmat_abs(tgt)
    mix_mag = cmat_abs(mix)
    inf_mag = cmat_abs(mix - tgt)

    if mask == "ibm":
        # 1 where the target is louder than the interference
        dominant = tgt_mag > inf_mag
        return dominant.astype(np.float32)

    if mask == "irm":
        # variant 2) above; tgt_mag + inf_mag would be variant 1)
        return tgt_mag / np.sqrt(tgt_mag**2 + inf_mag**2 + EPSILON)

    if mask == "crm":
        # complex ratio, then stack real/imag part
        ratio = tgt / (mix + EPSILON)
        return np.hstack(
            [tangent(np.real(ratio)), tangent(np.imag(ratio))])

    if mask == "psm":
        return tgt_mag * np.cos(np.angle(mix) - np.angle(tgt)) / mix_mag

    if mask == "psa":
        # phase sensitive amplitude: psm numerator with the cosine
        # clipped to be non-negative
        clipped_cos = np.maximum(0, np.cos(np.angle(mix) - np.angle(tgt)))
        return tgt_mag * clipped_cos

    # iam
    return tgt_mag / mix_mag
def compute_vad_masks(spectrogram, proportion):
    """
    Mark the lowest-energy bins of a spectrogram as silence.

    The sorted bin values are accumulated from the smallest upwards until
    the running sum exceeds (1 - proportion) of the total, so roughly
    proportion*100% of the energy is kept. The value at which that happens
    becomes the threshold.

    Arguments:
        spectrogram: F x T
        proportion: fraction of the total energy to keep
    Return:
        vad_mask: T x F, True (silence) where the bin is below the threshold
        index: position in the ascending-sorted, flattened values at which
            the threshold was taken; equals the number of bins when the
            running sum never exceeds the limit
    """
    energy_mat = cmat_abs(spectrogram)
    ascending = np.sort(energy_mat.flatten())
    discard_budget = np.sum(ascending) * (1 - proportion)

    threshold = 0
    running_sum = 0
    index = 0
    for index, threshold in enumerate(ascending):
        running_sum += threshold
        if running_sum > discard_budget:
            break
    else:
        # Budget never exceeded (or no bins at all): index is the bin count
        # and threshold stays at the last value seen.
        index = ascending.shape[0]

    # silence if 1
    silence = energy_mat < threshold
    return silence.transpose(), index