def get_istft(spect, wsize=512, tstep=256, L=None):
    """Reshape the spectrum and take the inverse short-time Fourier transform."""
    # NOTE: wsize is accepted for symmetry with the forward transform but is
    # not used here.
    if spect.ndim < 3:
        spect = spect.reshape((1, spect.shape[0], spect.shape[1]))
    if L is not None:
        return np.squeeze(stft.istft(spect, tstep, L))
    return stft.istft(spect, tstep)

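# Hedged usage sketch for get_istft. The stft module referenced above appears
# to follow the mne.time_frequency convention (stft(x, wsize, tstep) returns
# an (n_signals, n_bins, n_frames) array; istft(X, tstep, Tx) inverts it), but
# that is an assumption, not confirmed by this file:
#
#   x = np.random.randn(4096)                     # mono test signal
#   spec = stft.stft(x[np.newaxis, :], 512, 256)  # (1, 257, n_frames)
#   x_rec = get_istft(spec[0], L=x.size)          # 2-D input is promoted to 3-D
#   assert x_rec.shape == x.shape
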
def __fdndlp(self, data):
    """Frequency-domain variance-normalized delayed linear prediction.

    This is the core part of the WPE method. The variance-normalized
    linear prediction algorithm is applied in each frequency bin
    separately. Both the input and output signals are in the time domain.

    Args:
        data: A 2-dimensional numpy array with shape (channels, samples).

    Returns:
        A 2-dimensional numpy array with shape (output_channels, samples).
    """
    freq_data = stft.stft(data / np.abs(data).max(),
                          frame_size=self.frame_size,
                          overlap=self.overlap)
    self.freq_num = freq_data.shape[-1]
    drv_freq_data = freq_data[0:self.out_num].copy()
    # Apply delayed linear prediction independently in every frequency bin.
    for i in range(self.freq_num):
        xk = freq_data[:, :, i].T
        dk = self.__ndlp(xk)
        drv_freq_data[:, :, i] = dk.T
    drv_data = stft.istft(drv_freq_data,
                          frame_size=self.frame_size,
                          overlap=self.overlap)
    return drv_data / np.abs(drv_data).max()

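# A minimal sketch of what the per-bin __ndlp step in WPE typically does
# (variance-normalized delayed linear prediction), assuming it dereverberates
# the reference channel only. The names p_order (filter taps), delay, and the
# iteration count are assumptions; the actual __ndlp may differ and may return
# several output channels.
import numpy as np

def ndlp_sketch(xk, p_order=30, delay=3, eps=1e-8, n_iter=2):
    """xk: (frames, channels) complex STFT of one frequency bin.
    Returns the dereverberated reference channel, shape (frames,)."""
    frames, channels = xk.shape
    dk = xk[:, 0].copy()
    for _ in range(n_iter):
        # Per-frame variance estimate of the desired signal.
        lam = np.abs(dk) ** 2 + eps
        # Build the delayed multichannel regression matrix.
        X = np.zeros((frames, p_order * channels), dtype=complex)
        for tau in range(p_order):
            shift = delay + tau
            X[shift:, tau * channels:(tau + 1) * channels] = xk[:frames - shift]
        # Weighted least squares: minimize sum_n |x0_n - (X w)_n|^2 / lam_n.
        Xw = X / lam[:, None]
        w = np.linalg.solve(Xw.conj().T @ X + eps * np.eye(X.shape[1]),
                            Xw.conj().T @ xk[:, 0])
        dk = xk[:, 0] - X @ w
    return dk
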
def predict_channel(audio):
    length = np.shape(audio)[0]
    m = resample(audio, 44100, 22050)
    M = stft(m.reshape(-1, 1), hop_size, win_size, fft_size)
    Mmag = np.abs(M).T
    spec_frames, n_bins = Mmag.shape

    # Zero-pad, then build a sliding window of n_frames spectral frames per
    # example via stride tricks (no copy).
    pad_size = int((n_frames - 1) / 2)
    Mmag = np.concatenate((np.zeros((pad_size, n_bins)),
                           Mmag,
                           np.zeros((pad_size, n_bins))))
    new_strides = (Mmag.strides[0], Mmag.strides[0], Mmag.strides[1])
    Mmag = as_strided(Mmag, (spec_frames, n_frames, n_bins), new_strides)
    Mmag = Mmag[:, np.newaxis, :, :]

    vocals = np.zeros(M.T.shape)
    bass = np.zeros(M.T.shape)
    drums = np.zeros(M.T.shape)
    other = np.zeros(M.T.shape)
    for i in range(spec_frames):
        X = Mmag[i, :, :, :]
        in_data = torch.from_numpy(X.astype(np.float32)[np.newaxis, :, :, :])
        if torch.cuda.is_available():
            in_data = in_data.cuda()
        i_result = model(Variable(in_data)).cpu().data.numpy()
        # The network emits the four masks concatenated along the bin axis.
        vocals[i, :] = i_result[0, :n_bins]
        drums[i, :] = i_result[0, n_bins:2 * n_bins]
        bass[i, :] = i_result[0, 2 * n_bins:3 * n_bins]
        other[i, :] = i_result[0, 3 * n_bins:4 * n_bins]

    # Normalize the four masks so they sum to one in every TF bin.
    all_masks = vocals + bass + drums + other
    vocals = vocals / all_masks
    bass = bass / all_masks
    drums = drums / all_masks
    other = other / all_masks

    vocal_est = resample(istft(M * vocals.T, hop_size, win_size, 22050),
                         22050, 44100, 0)[:length, :]
    bass_est = resample(istft(M * bass.T, hop_size, win_size, 22050),
                        22050, 44100, 0)[:length, :]
    drums_est = resample(istft(M * drums.T, hop_size, win_size, 22050),
                         22050, 44100, 0)[:length, :]
    other_est = resample(istft(M * other.T, hop_size, win_size, 22050),
                         22050, 44100, 0)[:length, :]
    return (vocal_est, bass_est, drums_est, other_est)

def pvoc(x, sr, factor, Hs=512, window=signal.windows.hann(1024, sym=False),
         phase_lock=False):
    """Phase vocoder: time-stretch x by `factor` with synthesis hop Hs."""
    in_size = x.shape[0]
    win_len = window.shape[0]
    win_len_half = int(np.round(win_len / 2))
    out_size = int(np.ceil(factor * in_size))

    # Map synthesis frame positions back to analysis positions.
    anchor_points = np.array([[0, 0], [in_size - 1, out_size - 1]])
    syn_positions = np.arange(0, out_size + win_len_half, Hs)
    an_positions = np.round(
        np.interp(syn_positions, anchor_points[:, 1], anchor_points[:, 0]))
    an_hops = np.concatenate(([0], an_positions[1:] - an_positions[:-1]))

    y = np.zeros((out_size + 2 * win_len))
    x = np.concatenate((np.zeros((win_len_half)), x,
                        np.zeros((win_len + int(an_hops[1])))))
    X = stft.stft(x, sr, an_positions, window, win_len)
    Y = np.zeros_like(X)
    Y[:, 0] = X[:, 0]  # assuming columns are frames

    k = np.arange(win_len_half + 1).T
    omega = 2 * np.pi * k / win_len
    for i in range(1, X.shape[1]):
        dphi = omega * an_hops[i]
        current_phase = np.angle(X[:, i])
        prev_phase = np.angle(X[:, i - 1])
        # Heterodyned phase increment, wrapped to [-pi, pi].
        phase_inc = current_phase - prev_phase - dphi
        phase_inc = phase_inc - 2 * np.pi * np.round(phase_inc / (2 * np.pi))
        # Instantaneous frequency and its advance over one synthesis hop.
        ipa_sample = omega + phase_inc / an_hops[i]
        ipa_hop = ipa_sample * Hs
        syn_phase = np.angle(Y[:, i - 1])
        if not phase_lock:
            theta = syn_phase + ipa_hop - current_phase
            phasor = np.exp(1j * theta)
        else:
            # Identity phase locking: propagate phase only at spectral peaks
            # and lock the surrounding bins to their peak.
            p, v = get_peaks(np.abs(X[:, i]))
            theta = np.zeros_like(Y[:, i])
            for j in range(len(p)):
                theta[v[j]:v[j + 1]] = (syn_phase[p[j]] + ipa_hop[p[j]]
                                        - current_phase[p[j]])
            phasor = np.exp(1j * theta)
        Y[:, i] = phasor * X[:, i]

    y = stft.istft(Y, Hs, window)
    return y

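# Hedged usage sketch for pvoc: stretch a tone to 1.5x its duration without
# changing its pitch. The stft module's (x, sr, positions, window, win_len)
# signature above is project-specific, so this is illustrative rather than
# runnable against a known library:
#
#   sr = 22050
#   t = np.arange(sr) / sr
#   x = np.sin(2 * np.pi * 440.0 * t)                     # 1 s, 440 Hz tone
#   y = pvoc(x, sr, factor=1.5)                           # ~1.5 s output
#   y_locked = pvoc(x, sr, factor=1.5, phase_lock=True)   # identity phase locking
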
def image2audio(image_filename, audio_filename):
    X_image = read_image(image_filename)
    print("show result of stft.")
    print(X_image.dtype, X_image.shape)
    # Undo the normalization applied when the STFT was saved as an image;
    # the two channels are interpreted in polar form via P2R.
    r = inv_normal(X_image[:, :, 0].astype('float64') / MAX)
    g = inv_normal(X_image[:, :, 1].astype('float64') / MAX)
    X = P2R(r, g)
    # Alternative: treat the two channels as real/imaginary parts instead:
    # X = np.zeros(X_image.shape[:2], 'complex128')
    # X.real = r
    # X.imag = g
    print(np.max(X.real), np.min(X.real))
    print(np.max(X.imag), np.min(X.imag))

    # Compute the ISTFT and write the result as 16-bit PCM.
    xhat = stft.istft(X, fs, T, hop_length)
    xhat = float2pcm(xhat)
    scipy.io.wavfile.write(audio_filename, fs, xhat)

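# P2R is not defined in this file. A common definition (assumed here) converts
# polar coordinates (radius, angle) into a complex rectangular representation:
import numpy as np

def P2R(radii, angles):
    """Polar -> rectangular: z = r * exp(j * theta). Assumed helper."""
    return radii * np.exp(1j * angles)
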
def tf_agc(d, sr, t_scale=0.5, f_scale=1.0, causal_tracking=True, plot=False):
    """
    Perform frequency-dependent automatic gain control on an auditory
    frequency axis.
    d is the input waveform (at sampling rate sr);
    y is the output waveform with approximately constant energy in each
    time-frequency patch.
    t_scale is the "scale" for smoothing in time (default 0.5 sec).
    f_scale is the frequency "scale" (default 1.0 "mel").
    causal_tracking == 0 selects traditional infinite-attack, exponential release.
    causal_tracking == 1 selects symmetric, non-causal Gaussian-window smoothing.
    D returns actual STFT used in analysis.  E returns the smoothed amplitude
    envelope divided out of D to get gain control.
    """
    hop_size = 0.032  # in seconds

    # Make STFT on ~32 ms grid
    ftlen = int(2 ** np.round(np.log(hop_size * sr) / np.log(2.)))
    winlen = ftlen
    hoplen = winlen // 2
    D = stft(d, winlen, hoplen)  # using my code
    ftsr = sr / hoplen
    ndcols = D.shape[1]

    # Smooth in frequency on ~ mel resolution
    # Width of mel filters depends on how many you ask for,
    # so ask for fewer for larger f_scales
    nbands = max(10, 20 / f_scale)  # 10 bands, or more for very fine f_scale
    mwidth = f_scale * nbands / 10  # will be 2.0 for small f_scale
    (f2a_tmp, _) = fft2melmx(ftlen, sr, int(nbands), mwidth)
    f2a = f2a_tmp[:, :ftlen // 2 + 1]
    audgram = np.dot(f2a, np.abs(D))

    if causal_tracking:
        # traditional attack/decay smoothing
        fbg = np.zeros(audgram.shape)
        # state = zeros(size(audgram,1),1);
        state = np.zeros(audgram.shape[0])
        alpha = np.exp(-(1. / ftsr) / t_scale)
        for i in range(audgram.shape[1]):
            state = np.maximum(alpha * state, audgram[:, i])
            fbg[:, i] = state
    else:
        # noncausal, time-symmetric smoothing
        # Smooth in time with tapered window of duration ~ t_scale
        tsd = np.round(t_scale * ftsr) / 2
        htlen = int(6 * tsd)  # go out to 6 sigma
        twin = np.exp(-0.5 * (((np.arange(-htlen, htlen + 1)) / tsd) ** 2)).T
        # reflect ends to get smooth stuff
        AD = audgram
        x = np.hstack((np.fliplr(AD[:, :htlen]),
                       AD,
                       np.fliplr(AD[:, -htlen:]),
                       np.zeros((AD.shape[0], htlen))))
        fbg = signal.lfilter(twin, 1, x, 1)
        # strip "warm up" points
        fbg = fbg[:, twin.size + np.arange(ndcols)]

    # map back to FFT grid, flatten bark loop gain
    sf2a = np.sum(f2a, 0)
    sf2a_fix = sf2a
    sf2a_fix[sf2a == 0] = 1.
    E = np.dot(np.dot(np.diag(1. / sf2a_fix), f2a.T), fbg)
    # Remove any zeros in E (shouldn't be any, but who knows?)
    E[E <= 0] = np.min(E[E > 0])

    # invert back to waveform
    y = istft(D / E, winlen, hoplen, window=np.ones(winlen))  # using my code

    if plot:
        try:
            import matplotlib.pyplot as plt
            plt.subplot(3, 1, 1)
            plt.imshow(20. * np.log10(np.flipud(np.abs(D))))
            plt.subplot(3, 1, 2)
            plt.imshow(20. * np.log10(np.flipud(np.abs(E))))
            A = stft(y, winlen, hoplen)  # using my code
            plt.subplot(3, 1, 3)
            plt.imshow(20. * np.log10(np.flipud(np.abs(A))))
            plt.show()
        except Exception as e:
            print("Failed to plot results")
            print(e)

    return y, D, E

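# Hedged usage sketch for tf_agc, assuming the (y, D, E) outputs named in its
# docstring and the local stft/istft helpers it references:
#
#   sr = 16000
#   d = np.random.randn(sr)   # 1 s of noise as a stand-in signal
#   y, D, E = tf_agc(d, sr, t_scale=0.5, f_scale=1.0, causal_tracking=True)
#   # y has roughly constant energy per time-frequency patch; D / E is the
#   # gain-normalized STFT that was inverted to produce it.
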
# (fragment: vocals, bass and drums were initialized like `other` above)
other = np.zeros(M.T.shape)
for i in range(spec_frames):
    X = Mmag[i, :, :, :]
    in_data = torch.from_numpy(X.astype(np.float32)[np.newaxis, :, :, :])
    if torch.cuda.is_available():
        in_data = in_data.cuda()
    i_result = model(Variable(in_data)).cpu().data.numpy()
    # Hard (binary) masks: each bin is assigned to its argmax class.
    vocals[i, :] = np.argmax(i_result, 1) == 0
    drums[i, :] = np.argmax(i_result, 1) == 1
    bass[i, :] = np.argmax(i_result, 1) == 2
    other[i, :] = np.argmax(i_result, 1) == 3

sr = 22050

np.save("results/vocals/" + filename + "_mask", vocals.T)
vocal_est = istft(M * vocals.T, hop_size, win_size, sr)
wavfile.write("results/vocals/" + filename + "_target.wav", sr, vocal_est)

np.save("results/bass/" + filename + "_mask", bass.T)
bass_est = istft(M * bass.T, hop_size, win_size, sr)
wavfile.write("results/bass/" + filename + "_target.wav", sr, bass_est)

np.save("results/drums/" + filename + "_mask", drums.T)
drums_est = istft(M * drums.T, hop_size, win_size, sr)
wavfile.write("results/drums/" + filename + "_target.wav", sr, drums_est)

np.save("results/other/" + filename + "_mask", other.T)
other_est = istft(M * other.T, hop_size, win_size, sr)
wavfile.write("results/other/" + filename + "_target.wav", sr, other_est)

def plot_mel_masks(args):
    # Arguments & parameters
    workspace = args.workspace
    holdout_fold = args.holdout_fold
    scene_type = args.scene_type
    snr = args.snr
    iteration = args.iteration
    model_type = args.model_type
    cuda = args.cuda

    labels = config.labels
    classes_num = len(labels)
    sample_rate = config.sample_rate
    window_size = config.window_size
    overlap = config.overlap
    hop_size = window_size - overlap
    mel_bins = config.mel_bins
    seq_len = config.seq_len
    ix_to_lb = config.ix_to_lb

    thres = 0.1
    batch_size = 24

    # Paths
    hdf5_path = os.path.join(workspace, 'features', 'logmel',
                             'scene_type={},snr={}'.format(scene_type, snr),
                             'development.h5')

    model_path = os.path.join(workspace, 'models', 'main_pytorch',
                              'model_type={}'.format(model_type),
                              'scene_type={},snr={}'.format(scene_type, snr),
                              'holdout_fold{}'.format(holdout_fold),
                              'md_{}_iters.tar'.format(iteration))

    yaml_path = os.path.join(workspace, 'mixture.yaml')

    audios_dir = os.path.join(workspace, 'mixed_audios',
                              'scene_type={},snr={}'.format(scene_type, snr))

    sep_wavs_dir = os.path.join(workspace, 'separated_wavs', 'main_pytorch',
                                'model_type={}'.format(model_type),
                                'scene_type={},snr={}'.format(scene_type, snr),
                                'holdout_fold{}'.format(holdout_fold))
    create_folder(sep_wavs_dir)

    # Load yaml file
    load_yaml_time = time.time()
    with open(yaml_path, 'r') as f:
        meta = yaml.load(f, Loader=yaml.FullLoader)
    print('Load yaml file time: {:.3f} s'.format(time.time() - load_yaml_time))

    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        overlap=overlap,
                                        mel_bins=mel_bins)
    inverse_melW = feature_extractor.get_inverse_melW()

    # Load model
    Model = get_model(model_type)
    model = Model(classes_num, seq_len, mel_bins, cuda)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Data generator
    generator = InferenceDataGenerator(hdf5_path=hdf5_path,
                                       batch_size=batch_size,
                                       holdout_fold=holdout_fold)

    generate_func = generator.generate_validate(data_type='validate',
                                                shuffle=False,
                                                max_iteration=None)

    # Evaluate on mini-batch
    for (iteration, data) in enumerate(generate_func):

        (batch_x, batch_y, batch_audio_names) = data
        batch_x = move_data_to_gpu(batch_x, cuda)

        # Predict
        with torch.no_grad():
            model.eval()
            (batch_output, batch_bottleneck) = model(batch_x,
                                                     return_bottleneck=True)

        batch_output = batch_output.data.cpu().numpy()
        '''(batch_size, classes_num)'''

        batch_bottleneck = batch_bottleneck.data.cpu().numpy()
        '''(batch_size, classes_num, seq_len, mel_bins)'''

        batch_pred_sed = np.mean(batch_bottleneck, axis=-1)
        batch_pred_sed = np.transpose(batch_pred_sed, (0, 2, 1))
        '''(batch_size, seq_len, classes_num)'''

        batch_gt_masks = []

        for n in range(len(batch_audio_names)):
            curr_meta = search_meta_by_mixture_name(meta, batch_audio_names[n])
            curr_events = curr_meta['events']

            pred_indexes = np.where(batch_output[n] > thres)[0]
            gt_indexes = get_ground_truth_indexes(curr_events)

            gt_sed = get_sed_from_meta(curr_events)  # (seq_len, classes_num)

            pred_sed = np.zeros((seq_len, classes_num))
            pred_sed[:, pred_indexes] = batch_pred_sed[n][:, pred_indexes]
            # (seq_len, classes_num)

            (events_stft, scene_stft, _) = \
                generator.get_events_scene_mixture_stft(batch_audio_names[n])
            events_stft = np.dot(events_stft, feature_extractor.melW)
            scene_stft = np.dot(scene_stft, feature_extractor.melW)

            gt_mask = ideal_binary_mask(events_stft, scene_stft)
            # (seq_len, fft_size)

            gt_masks = gt_mask[:, :, None] * gt_sed[:, None, :]
            # (seq_len, fft_size, classes_num)
            gt_masks = gt_masks.astype(np.float32)
            batch_gt_masks.append(gt_masks)

            pred_masks = batch_bottleneck[n].transpose(1, 2, 0)
            # (seq_len, mel_bins, classes_num)

            # Save out separated audio
            if True:
                curr_audio_name = curr_meta['mixture_name']
                audio_path = os.path.join(audios_dir, curr_audio_name)
                (mixed_audio, fs) = read_audio(audio_path,
                                               target_fs=sample_rate,
                                               mono=True)

                out_wav_path = os.path.join(sep_wavs_dir, curr_audio_name)
                write_audio(out_wav_path, mixed_audio, sample_rate)

                window = np.hamming(window_size)
                mixed_stft_cmplx = stft(x=mixed_audio,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        window=window,
                                        mode='complex')
                mixed_stft_cmplx = mixed_stft_cmplx[0:seq_len, :]
                mixed_stft = np.abs(mixed_stft_cmplx)

                for k in gt_indexes:
                    # Project the mel-domain mask back to the FFT grid, then
                    # apply it to the mixture magnitude.
                    masked_stft = np.dot(pred_masks[:, :, k],
                                         inverse_melW) * mixed_stft
                    masked_stft_cmplx = real_to_complex(masked_stft,
                                                        mixed_stft_cmplx)
                    frames = istft(masked_stft_cmplx)
                    cola_constant = get_cola_constant(hop_size, window)
                    sep_audio = overlap_add(frames, hop_size, cola_constant)

                    sep_wav_path = os.path.join(
                        sep_wavs_dir,
                        '{}_{}.wav'.format(
                            os.path.splitext(curr_audio_name)[0],
                            ix_to_lb[k]))
                    write_audio(sep_wav_path, sep_audio, sample_rate)
                    print('Audio written to {}'.format(sep_wav_path))

        # Visualize learned representations
        if True:
            for n in range(len(batch_output)):
                # Plot segmentation masks (00013.wav is used for the plot in
                # the paper).
                print('audio_name: {}'.format(batch_audio_names[n]))
                print('target: {}'.format(batch_y[n]))
                target_labels = target_to_labels(batch_y[n], labels)
                print('target labels: {}'.format(target_labels))

                (events_stft, scene_stft, _) = \
                    generator.get_events_scene_mixture_stft(batch_audio_names[n])

                fig, axs = plt.subplots(7, 7, figsize=(15, 10))
                for k in range(classes_num):
                    axs[k // 6, k % 6].matshow(batch_bottleneck[n, k].T,
                                               origin='lower', aspect='auto',
                                               cmap='jet')
                    if labels[k] in target_labels:
                        color = 'r'
                    else:
                        color = 'k'
                    axs[k // 6, k % 6].set_title(labels[k], color=color)
                    axs[k // 6, k % 6].xaxis.set_ticks([])
                    axs[k // 6, k % 6].yaxis.set_ticks([])
                    axs[k // 6, k % 6].set_xlabel('time')
                    axs[k // 6, k % 6].set_ylabel('mel bins')

                axs[6, 5].matshow(np.log(events_stft + 1e-8).T,
                                  origin='lower', aspect='auto', cmap='jet')
                axs[6, 5].set_title('Spectrogram (in log scale)')
                axs[6, 5].xaxis.set_ticks([0, 310])
                axs[6, 5].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 5].xaxis.tick_bottom()
                axs[6, 5].yaxis.set_ticks([0, 1024])
                axs[6, 5].yaxis.set_ticklabels(['0', '1025'])
                axs[6, 5].set_xlabel('time')
                axs[6, 5].set_ylabel('FFT bins')

                axs[6, 6].matshow(
                    np.log(np.dot(events_stft, feature_extractor.melW) + 1e-8).T,
                    origin='lower', aspect='auto', cmap='jet')
                axs[6, 6].set_title('Log mel spectrogram')
                axs[6, 6].xaxis.set_ticks([0, 310])
                axs[6, 6].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 6].xaxis.tick_bottom()
                axs[6, 6].yaxis.set_ticks([0, 63])
                axs[6, 6].yaxis.set_ticklabels(['0', '64'])
                axs[6, 6].set_xlabel('time')
                axs[6, 6].set_ylabel('mel bins')

                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()

                # Plot frame-wise SED
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                score_mat = []
                for k in range(classes_num):
                    score = np.mean(batch_bottleneck[n, k], axis=-1)
                    score_mat.append(score)
                score_mat = np.array(score_mat)
                ax.matshow(score_mat, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Frame-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()

                # Plot event-wise SED
                est_event_list = get_est_event_list(batch_pred_sed[n:n + 1],
                                                    batch_audio_names[n:n + 1],
                                                    labels)
                event_mat = event_list_to_matrix(est_event_list)
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Event-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()

                # Plot event-wise ground truth
                ref_event_list = get_ref_event_list(meta,
                                                    batch_audio_names[n:n + 1])
                event_mat = event_list_to_matrix(ref_event_list)
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Event-wise ground truth')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()

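# The ideal_binary_mask helper used above is not defined in this file. A
# minimal sketch under the usual definition (the mask is 1 wherever the target
# magnitude dominates the interference); the exact thresholding the original
# code uses is an assumption:
import numpy as np

def ideal_binary_mask_sketch(events_stft, scene_stft):
    """Return a (seq_len, n_bins) float32 mask that is 1 where the event
    (target) magnitude exceeds the scene (interference) magnitude."""
    return (np.abs(events_stft) > np.abs(scene_stft)).astype(np.float32)
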
def inference_wiener(args):
    workspace = args.workspace
    iteration = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load one model per target (speech and noise).
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
                                  "md_%d_iters.tar" % iteration)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()
        model.eval()
        model_dict[e] = model

    # Load scaler (normalization statistics).
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) // mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data: pad the borders, then stack context frames.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter: scale the mixture magnitude by the ratio of the
            # predicted speech magnitude to predicted speech plus noise.
            pred_mag_sp = pred_dict['speech'] / (
                pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T, origin='lower',
                               aspect='auto', cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                               origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                               origin='lower', aspect='auto', cmap='jet')
                plt.show()

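# The Wiener-style mask above combines the two magnitude estimates per
# time-frequency bin. A self-contained sketch of that step with dummy arrays
# (shapes and values are illustrative only):
import numpy as np

speech_mag = np.array([[3.0, 1.0], [2.0, 0.5]])  # predicted |speech|
noise_mag = np.array([[1.0, 1.0], [2.0, 1.5]])   # predicted |noise|
mix_mag = np.array([[4.0, 2.0], [4.0, 2.0]])     # observed mixture magnitude

mask = speech_mag / (speech_mag + noise_mag)     # in [0, 1] per bin
enhanced_mag = mask * mix_mag
print(enhanced_mag)  # [[3.  1. ] [2.  0.5]]
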
def test_invertable(self):
    x = sawtooth(numpy.linspace(0, 1, 44100) * 2 * numpy.pi * 10)
    X = stft(x, 1024)
    xi = istft(X, len(x))
    self.assertEqual(len(xi), len(x))
    self.assertTrue(numpy.allclose(xi, x, 1e-01))

def test_istft(self):
    """Test that istft inverts stft for a range of configurations."""
    # (signal, segment_length, shift_length, segment_length_padded,
    #  window_function, p)
    cases = [
        (np.ones(16), 4, 2, 4, boxcar, 1),
        (np.random.randn(16), 4, 2, 4, boxcar, 1),
        (np.random.randn(16, 2, 2), 4, 2, 4, boxcar, 1),
        (np.ones(16), 4, 2, 4, hann, 1),
        (np.ones(16), 4, 4, 4, hann, 2),
        (np.ones(16), 4, 2, 4, boxcar, 2),
        (np.ones(16), 4, 2, 4, hann, 2),
        (np.random.randn(16), 4, 2, 4, hann, 2),
        (np.random.randn(16, 3), 4, 2, 4, hann, 2),
        (np.ones(16), 4, 2, 7, boxcar, 1),
    ]
    for (x, segment_length, shift_length, segment_length_padded,
         window_function, p) in cases:
        original_size = x.shape
        x_stft, start_list, stop_list = stft(x, segment_length,
                                             segment_length_padded,
                                             shift_length, window_function)
        x_out = istft(x_stft, segment_length, segment_length_padded,
                      start_list, stop_list, original_size,
                      window_function, p)
        self.assertTrue(np.allclose(x_out, x))

def render_estimate(est, sr, name="test.wav"):
    y = stft.istft(est, 256)
    write_mono(y, "result/" + name, sr)

def process(self):

    if self.signals is None or len(self.signals) == 0:
        raise NameError('No signal to beamform')

    if self.processing == 'FrequencyDomain':

        # create window function
        win = np.concatenate((np.zeros(self.zpf),
                              windows.hann(self.L),
                              np.zeros(self.zpb)))

        # do real STFT of first signal
        tfd_sig = stft.stft(self.signals[0],
                            self.L,
                            self.hop,
                            zp_back=self.zpb,
                            zp_front=self.zpf,
                            transform=np.fft.rfft,
                            win=win) * np.conj(self.weights[0])
        for i in range(1, self.M):
            tfd_sig += stft.stft(self.signals[i],
                                 self.L,
                                 self.hop,
                                 zp_back=self.zpb,
                                 zp_front=self.zpf,
                                 transform=np.fft.rfft,
                                 win=win) * np.conj(self.weights[i])

        # now reconstruct the signal
        output = stft.istft(tfd_sig,
                            self.L,
                            self.hop,
                            zp_back=self.zpb,
                            zp_front=self.zpf,
                            transform=np.fft.irfft)

        # remove the zero padding from output signal
        if self.zpb == 0:
            output = output[self.zpf:]
        else:
            output = output[self.zpf:-self.zpb]

    elif self.processing == 'TimeDomain':

        # go back to time domain and shift DC to center
        tw = np.sqrt(self.weights.shape[1]) * np.fft.irfft(
            np.conj(self.weights), axis=1)
        tw = np.concatenate((tw[:, self.N // 2:], tw[:, :self.N // 2]), axis=1)

        from scipy.signal import fftconvolve

        # filter and sum the signals in the time domain
        output = fftconvolve(tw[0], self.signals[0])
        for i in range(1, len(self.signals)):
            output += fftconvolve(tw[i], self.signals[i])

    elif self.processing == 'Total':

        # build the full conjugate-symmetric weight spectrum
        W = np.concatenate((self.weights,
                            np.conj(self.weights[:, -2:0:-1])), axis=1)
        W[:, 0] = np.real(W[:, 0])
        W[:, self.N // 2] = np.real(W[:, self.N // 2])

        F_sig = np.zeros(self.signals.shape[1], dtype=complex)
        for i in range(self.M):
            F_sig += np.fft.fft(self.signals[i]) * np.conj(W[i, :])

        f_sig = np.fft.ifft(F_sig)
        print(np.abs(np.imag(f_sig)).mean())
        print(np.abs(np.real(f_sig)).mean())

        output = np.real(np.fft.ifft(F_sig))

    return output

# (fragment: this block begins mid-statement; the wav_read call that loads
#  clean_audio_2 is truncated and ended with: line + '.wav'))
clean_audio_2 = clean_audio_2.astype('float32') / np.power(2, 15)
sr, mix_audio = wav_read(wav_folders + 'mix/' + line + '.wav')
mix_audio = mix_audio.astype('float32') / np.power(2, 15)

# Compute time-domain estimated signals
RES_1 = est_1r + 1j * est_1i
RES_2 = est_2r + 1j * est_2i
# Rebuild the full conjugate-symmetric spectra from the half spectra.
RES_1 = np.concatenate((RES_1, np.conj(RES_1[:, ::-1][:, 1:-1])), axis=1)
RES_2 = np.concatenate((RES_2, np.conj(RES_2[:, ::-1][:, 1:-1])), axis=1)
res_1 = istft(RES_1, len(clean_audio_1))
res_2 = istft(RES_2, len(clean_audio_2))
res_1 = res_1.astype('float32')
res_2 = res_2.astype('float32')

# Save mixture, clean signals and estimates in the file folder for evaluation.
s_res = np.concatenate((res_1.reshape(-1, 1), res_2.reshape(-1, 1)), 1)
s_c = np.concatenate((clean_audio_1.reshape(-1, 1),
                      clean_audio_2.reshape(-1, 1)), 1)

# Pad or crop according to the clean source
if s_res.shape[0] > s_c.shape[0]:
    s_res = s_res[:s_c.shape[0], :]
else:
    s_res = np.concatenate((s_res,  # (fragment truncated here)

def process(self, FD=False):

    if self.signals is None or len(self.signals) == 0:
        raise NameError('No signal to beamform')

    if FD:

        # STFT processing
        if self.weights is None and self.filters is not None:
            self.weightsFromFilters()
        elif self.weights is None and self.filters is None:
            raise NameError('Beamforming weights or filters need to be '
                            'computed first.')

        # create window function
        win = np.concatenate((np.zeros(self.zpf),
                              windows.hann(self.L),
                              np.zeros(self.zpb)))

        # do real STFT of first signal
        tfd_sig = stft.stft(self.signals[0],
                            self.L,
                            self.hop,
                            zp_back=self.zpb,
                            zp_front=self.zpf,
                            transform=np.fft.rfft,
                            win=win) * np.conj(self.weights[0])
        for i in range(1, self.M):
            tfd_sig += stft.stft(self.signals[i],
                                 self.L,
                                 self.hop,
                                 zp_back=self.zpb,
                                 zp_front=self.zpf,
                                 transform=np.fft.rfft,
                                 win=win) * np.conj(self.weights[i])

        # now reconstruct the signal
        output = stft.istft(tfd_sig,
                            self.L,
                            self.hop,
                            zp_back=self.zpb,
                            zp_front=self.zpf,
                            transform=np.fft.irfft)

        # remove the zero padding from output signal
        if self.zpb == 0:
            output = output[self.zpf:]
        else:
            output = output[self.zpf:-self.zpb]

    else:

        # TD processing
        if self.weights is not None and self.filters is None:
            self.filtersFromWeights()
        elif self.weights is None and self.filters is None:
            raise NameError('Beamforming weights or filters need to be '
                            'computed first.')

        from scipy.signal import fftconvolve

        # filter and sum the signals in the time domain
        output = fftconvolve(self.filters[0], self.signals[0])
        for i in range(1, len(self.signals)):
            output += fftconvolve(self.filters[i], self.signals[i])

    return output

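# A hedged sketch of the filtersFromWeights conversion suggested by the
# time-domain branch of the earlier process() variant: take the inverse real
# FFT of the conjugated frequency-domain weights and rotate the result so zero
# delay sits at the filter center. The sqrt scaling and the value of N are
# assumptions carried over from that variant.
import numpy as np

def filters_from_weights_sketch(weights):
    """weights: (n_mics, n_freq_bins) complex rFFT-domain beamforming weights.
    Returns (n_mics, N) real FIR filters, with N = 2 * (n_freq_bins - 1)."""
    N = 2 * (weights.shape[1] - 1)
    tw = np.sqrt(weights.shape[1]) * np.fft.irfft(np.conj(weights), axis=1)
    # shift DC to the center so the filters are causal-symmetric
    return np.concatenate((tw[:, N // 2:], tw[:, :N // 2]), axis=1)
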