def deconv_a_file(filename): song_id = filename.split('.')[0] path_out_here = path_results + song_id + '/' # path_img_here = path_results + song_id + '_img/' SRC = np.load(path_SRC + filename) if os.path.exists(path_out_here): print '%s might be done already, I skip this.' % song_id print 'remove %s and %s to proceed.' % (path_out_here, path_img_here) if not os.path.exists(path_out_here): os.makedirs(path_out_here) # if not os.path.exists(path_img_here): # os.makedirs(path_img_here) filename_out = '%s_a_original.wav' % (song_id) librosa.output.write_wav(path_out_here + filename_out, librosa.istft(SRC, hop_length=N_FFT/2), sr=SAMPLE_RATE, norm=True) for depth in depths: print '--- deconve! ---' deconvedMASKS = auralise.get_deconve_mask(W[:depth], layer_names, SRC, depth) # size can be smaller than SRC due to downsampling print 'result; %d masks with size of %d, %d' % deconvedMASKS.shape for deconved_feature_ind, deconvedMASK_here in enumerate(deconvedMASKS): MASK = np.zeros(SRC.shape) MASK[0:deconvedMASK_here.shape[0], 0:deconvedMASK_here.shape[1]] = deconvedMASK_here deconvedSRC = np.multiply(SRC, MASK) filename_out = '%s_deconved_from_depth_%d_feature_%d.wav' % (song_id, depth, deconved_feature_ind) librosa.output.write_wav(path_out_here + filename_out, librosa.istft(deconvedSRC, hop_length=N_FFT/2), sr=SAMPLE_RATE, norm=True)
def audio_to_chroma_and_onset_strength(audio, fs = 22050, hop = 512):
    '''
    Chromagram of the harmonic part and onset strength of the percussive part.

    Input:
        audio - audio time series passed to librosa.stft
        fs - sampling rate handed to the onset strength computation, default 22050
        hop - base hop length, default 512; onset strength uses hop // 4
    Output:
        chroma_gram - chromagram of the harmonic component
        audio_onset_strength - onset strength of the percussive component
    '''
    # Harmonic/percussive split in the STFT domain, then back to audio
    H, P = librosa.decompose.hpss(librosa.stft(audio))
    audio_harmonic = librosa.istft(H)
    audio_percussive = librosa.istft(P)
    chroma_gram = librosa.feature.chromagram(audio_harmonic)
    # Floor division: a hop length must stay an integer under true division
    audio_onset_strength = librosa.onset.onset_strength(audio_percussive,
                                                        hop_length=hop // 4,
                                                        sr=fs)
    return chroma_gram, audio_onset_strength
def audio_to_cqt_and_onset_strength(audio, fs=22050, hop=512):
    '''
    Feature extraction for audio data.
    Gets a power CQT of harmonic component and onset strength signal of percussive.

    Input:
        audio - audio time series passed to librosa.stft
        fs - sampling rate of the audio, default 22050
        hop - hop length for cqt, default 512, onset strength hop will be 1/4 of this
    Output:
        audio_gram - CQT of audio data
        audio_onset_strength - onset strength signal
    '''
    # Use harmonic part for gram, percussive part for onsets
    H, P = librosa.decompose.hpss(librosa.stft(audio))
    audio_harmonic = librosa.istft(H)
    audio_percussive = librosa.istft(P)
    # Power CQT (squared magnitude) of the harmonic component
    audio_gram = np.abs(librosa.cqt(y=audio_harmonic,
                                    sr=fs,
                                    hop_length=hop,
                                    fmin=librosa.midi_to_hz(36),
                                    n_bins=60))**2
    # Onset strength at 4x the hop rate; floor division keeps the hop an
    # integer under true division
    audio_onset_strength = librosa.onset.onset_strength(audio_percussive,
                                                        hop_length=hop // 4,
                                                        sr=fs)
    return audio_gram, audio_onset_strength
def extend_dataset(y, sr):
    """Return ``y`` plus time-stretched variants, all cut to a common length.

    The result is ``(y, y_fast, y_slow1, y_slow2)``: the input, a 2x-speed
    version repeated twice, and the two halves of a half-speed version.
    ``sr`` is accepted but not used.
    """
    hop = 512
    spectrum = librosa.stft(y, n_fft=2048, hop_length=hop)

    def _stretched(rate):
        # Phase-vocoder stretch of the shared spectrum, back to audio.
        warped = librosa.phase_vocoder(spectrum, rate, hop_length=hop)
        return librosa.istft(warped, hop_length=hop)

    # 2x faster, played twice back to back
    quick = _stretched(2.0)
    y_fast = append(quick, quick)

    # 2x slower, cut into two halves
    y_slow1, y_slow2 = split(_stretched(0.5), 2)

    variants = (y, y_fast, y_slow1, y_slow2)
    samples = min(len(v) for v in variants)
    return tuple(v[:samples] for v in variants)
def hpss(y):
    """Return magnitude spectrograms of the harmonic and percussive parts of ``y``.

    The separation uses ``KERNEL_SIZE`` and ``HPSS_P``; each part is inverted
    to audio and re-analysed with ``N_FFT`` / ``HOP``.
    """
    harmonic, percussive = librosa.decompose.hpss(librosa.stft(y),
                                                  kernel_size=KERNEL_SIZE,
                                                  power=HPSS_P)

    def _magnitude(component):
        # Back to the time domain, then STFT with the module's parameters.
        audio = librosa.istft(component)
        return np.abs(librosa.stft(audio, n_fft=N_FFT, hop_length=HOP))

    return _magnitude(harmonic), _magnitude(percussive)
def test_istft_bad_window():
    """Call ``librosa.istft`` with a window half as long as the implied FFT size.

    NOTE(review): presumably this is expected to raise; the expectation
    (decorator or context manager) is not visible in this block.
    """
    n_bins, n_frames = 1025, 10
    D = np.zeros((n_bins, n_frames), dtype=np.complex64)

    implied_n_fft = 2 * (D.shape[0] - 1)
    short_window = np.ones(implied_n_fft // 2)

    librosa.istft(D, window=short_window)
def get_hpss(y, PARAMETERS):
    '''Split an audio time series into harmonic and percussive signals.

    ``PARAMETERS['stft']`` is forwarded to ``librosa.stft`` (its
    ``'hop_length'`` entry is reused for the inverse) and
    ``PARAMETERS['hpss']`` to ``librosa.decompose.hpss``.
    '''
    stft_kwargs = PARAMETERS['stft']
    spectrum = librosa.stft(y, **stft_kwargs)
    harmonic, percussive = librosa.decompose.hpss(spectrum, **PARAMETERS['hpss'])

    y_h = librosa.istft(harmonic, hop_length=stft_kwargs['hop_length'])
    y_p = librosa.istft(percussive, hop_length=stft_kwargs['hop_length'])
    return y_h, y_p
def __test(infile):
    """Check ``librosa.istft`` against the reference result stored in ``infile``."""
    DATA = load(infile)

    def _int_scalar(key):
        # Stored values are read from element [0, 0] and cast to int.
        return DATA[key][0, 0].astype(int)

    Dinv = librosa.istft(DATA['D'],
                         n_fft=_int_scalar('nfft'),
                         hann_w=_int_scalar('hann_w'),
                         hop_length=_int_scalar('hop_length'))
    assert numpy.allclose(Dinv, DATA['Dinv'])
def istft(file, stft_mat, frame_length=1024, frame_shift=256, center=False, window="hann", transpose=True, norm=None, fs=16000, nsamps=None):
    """Invert an STFT matrix and write the result to ``file`` as int16 wav.

    Args:
        file: output path; its directory is created when missing.
        stft_mat: STFT matrix; transposed first when ``transpose`` is true.
        frame_length, frame_shift, center, window: forwarded to
            ``audio_lib.istft``.
        transpose: transpose ``stft_mat`` before inversion.
        norm: when given, rescale the samples so their peak absolute value
            equals ``norm``; when ``None`` (or 0) the samples are left as is.
        fs: sample rate written to the file.
        nsamps: forwarded as ``length`` to ``audio_lib.istft``.
    """
    if transpose:
        stft_mat = np.transpose(stft_mat)
    samps = audio_lib.istft(stft_mat,
                            frame_shift,
                            frame_length,
                            window=window,
                            center=center,
                            length=nsamps)
    # renorm if needed.  The original test was inverted (`if not norm`), which
    # multiplied by None for the default and ignored a real norm.
    if norm:
        samps_norm = np.linalg.norm(samps, np.inf)
        # An all-zero signal has no peak to scale; leave it untouched.
        if samps_norm > 0:
            samps = samps * norm / samps_norm
    samps_int16 = (samps * MAX_INT16).astype(np.int16)
    fdir = os.path.dirname(file)
    if fdir and not os.path.exists(fdir):
        os.makedirs(fdir)
    audio_lib.output.write_wav(file, samps_int16, fs)
def compute_cqt(filename):
    """Return the CQT magnitude of the harmonic part of an audio file.

    The file is loaded at ``SR``; the CQT uses hop length ``HOP``.
    """
    audio, sr = librosa.load(filename, sr=SR)
    # Only the harmonic component is kept.
    harmonic_spectrum, _ = librosa.decompose.hpss(librosa.stft(audio))
    harmonic_audio = librosa.istft(harmonic_spectrum)
    return np.abs(librosa.cqt(harmonic_audio, sr=sr, hop_length=HOP, real=False))
def stretch_demo(input_file, output_file, speed): '''Phase-vocoder time stretch demo function. :parameters: - input_file : str path to input audio - output_file : str path to save output (wav) - speed : float > 0 speed up by this factor ''' N_FFT = 2048 HOP_LENGTH = N_FFT /4 # 1. Load the wav file, resample print 'Loading ', input_file y, sr = librosa.load(input_file) # 2. generate STFT @ 2048 samples print 'Computing short-time fourier transform... ' D = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH) print 'Playing back at %3.f%% speed' % (speed * 100) D_stretch = librosa.phase_vocoder(D, speed, hop_length=HOP_LENGTH) y_stretch = librosa.istft(D_stretch, hop_length=HOP_LENGTH) print 'Saving stretched audio to: ', output_file librosa.output.write_wav(output_file, y_stretch, sr)
def mfcc_clustering(file_name, n_clusters):
    """Split a mixture into ``n_clusters`` sources by clustering component MFCCs.

    Components come from ``find_template``; their MFCCs (coefficients 1..13)
    are clustered with KMeans, and each cluster's components are turned into
    audio via ``extract_template`` and an inverse STFT.  An image of the MFCCs
    is saved next to the input as ``<name>_mfcc.png``.

    :return: ``np.array`` of the per-cluster audio signals
    """
    clusterer = KMeans(n_clusters=n_clusters)
    print(file_name)

    mix, sr = librosa.load(file_name)
    mix_stft = librosa.stft(mix)
    comps, acts = find_template(mix_stft, sr, 100, 101, 0, mix_stft.shape[1])

    cluster_comps = librosa.feature.mfcc(S=comps)[1:14]
    save_mfcc_img(file_name[:-4] + "_mfcc.png", np.flipud(cluster_comps))

    clusterer.fit_transform(cluster_comps.T)
    labels = clusterer.labels_

    sources = []
    for cluster_index in range(n_clusters):
        members = np.where(labels == cluster_index)[0]
        template, residual = extract_template(comps[:, members], mix_stft)
        sources.append(librosa.istft(template))
    return np.array(sources)
def decompose_into_harmonic_and_percussive(filepath, kernel_size=(7,15), n_fft = 4096, hop_length = 1024):
    """
    Performs Harmonic/Percussive Source Separation on an audio file
    by applying median filters and returns each filtered
    version as an audio signal

    ARGS
        filepath: fullpath of audio file <str>
        kernel_size: tuple sized of (harmonic, percussive) filters (<int>,<int>)
        n_fft: FFT size <int>
        hop_length : hop length <int>

    RETURNS
        (signal_harm, signal_perc): harmonic and percussive audio signals
    """
    signal, sr = load_signal(filepath)
    D = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
    # Honour the caller's kernel_size instead of a hard-coded (7,15).
    H, P = librosa.decompose.hpss(D, kernel_size=kernel_size)
    # Invert with the same hop length the forward transform used.
    signal_harm = librosa.istft(H, hop_length=hop_length)
    signal_perc = librosa.istft(P, hop_length=hop_length)
    return signal_harm, signal_perc
def find_template(music_stft, sr, min_t, n_components, start, end):
    """Factorise a slice of an STFT with NMF and return the first fit.

    :param music_stft: STFT matrix; columns ``start:end`` are used
    :param sr: not used here
    :param min_t: smallest number of NMF components tried
    :param n_components: exclusive upper bound on the component count
    :param start: first column of the slice
    :param end: column after the last one in the slice
    :return: ``(components, activations)`` of the fit with ``min_t`` components
    """
    template_stft = music_stft[:, start:end]

    # Computed as in the original code; the value is not used further.
    layer = librosa.istft(template_stft)
    layer_rms = np.sqrt(np.mean(layer * layer))

    fits = []
    for rank in range(min_t, n_components):
        model = NMF(n_components=rank)
        basis = model.fit_transform(np.abs(template_stft))
        fits.append((basis, model.components_, model.reconstruction_err_))

    # Knee detection on the reconstruction errors is disabled; always take
    # the first fit.
    knee = 0
    chosen_comps, chosen_acts, _ = fits[knee]
    return chosen_comps, chosen_acts
def percussive(y):
    '''Return the percussive component of an audio time series as audio.'''
    _, percussive_spectrum = librosa.decompose.hpss(librosa.stft(y))[:2]
    return librosa.istft(percussive_spectrum)
def midi_to_cqt(midi, sf2_path=None, fs=22050, hop=512):
    '''
    Synthesize MIDI data and return a power CQT of its harmonic component.

    Input:
        midi - object providing fluidsynth(fs=..., sf2_path=...)
        sf2_path - path to .sf2 file to pass to fluidsynth
        fs - sampling rate to synthesize audio at, default 22050
        hop - hop length for cqt, default 512
    Output:
        midi_gram - Simulated CQT of the midi data
    '''
    # Render the MIDI to audio with the supplied soundfont
    synthesized = midi.fluidsynth(fs=fs, sf2_path=sf2_path)

    # Keep only the harmonic part of the rendered signal
    harmonic_spectrum, _ = librosa.decompose.hpss(librosa.stft(synthesized))
    harmonic_audio = librosa.istft(harmonic_spectrum)

    # Squared-magnitude CQT starting at MIDI note 36
    cqt = librosa.cqt(y=harmonic_audio,
                      sr=fs,
                      hop_length=hop,
                      fmin=librosa.midi_to_hz(36),
                      n_bins=60,
                      tuning=0.0)
    return np.abs(cqt)**2
def reverse_channel(a, b, n_fft=2**13, win_length=2**12, hop_length=2**10):
    '''
    Filter b so that its channel distortion relative to a is reduced

    :parameters:
        - a : np.ndarray
            Reference signal
        - b : np.ndarray
            Signal with channel distortion relative to a
        - n_fft : int
            Number of samples in each FFT computation, default 2**13
        - win_length : int
            Number of samples in each window, default 2**12
        - hop_length : int
            Number of samples between successive FFT computations, default 2**10

    :returns:
        - b_filtered : np.ndarray
            The signal b, filtered to reduce channel distortion
    '''
    stft_kwargs = dict(n_fft=n_fft, win_length=win_length, hop_length=hop_length)
    a_spec = librosa.stft(a, **stft_kwargs)
    b_spec = librosa.stft(b, **stft_kwargs)

    # Per-bin filter estimated from the two spectrograms, applied by plain
    # multiplication in the frequency domain (aliasing is ignored).
    corrected = best_filter_coefficients(a_spec, b_spec) * b_spec

    return librosa.istft(corrected, win_length=win_length, hop_length=hop_length)
def render_audio(new_stft, sr, fpath, y_orig, mix=False):
    """Invert ``new_stft`` (hop ``HOP``) and write it to ``fpath`` at rate ``sr``.

    With ``mix`` set, the result is added to ``y_orig`` after both are cut to
    the shorter of the two lengths.  The STFT must not contain NaN.
    """
    assert not np.any(np.isnan(new_stft))
    rendered = librosa.istft(new_stft, hop_length=HOP)
    if mix:
        common = min(len(rendered), len(y_orig))
        rendered = rendered[:common] + y_orig[:common]
    librosa.output.write_wav(fpath, rendered, sr)
def get_freq_component(y, k=4):
    """Return the audio of component ``k`` from ``decompose(y)``.

    The component's spectrogram is the outer product of column ``k`` of the
    components with row ``k`` of the activations, multiplied by the phase
    before inversion.
    """
    components, activations, phase = decompose(y)
    basis_k = components[:, k]
    gains_k = activations[k]
    spectrogram_k = np.multiply.outer(basis_k, gains_k)
    return librosa.istft(spectrogram_k * phase)
def reconstruct(components, activations, phase):
    """Return audio rebuilt from all components.

    The spectrogram is ``components.dot(activations)``; it is multiplied by
    ``phase`` and inverted with ``librosa.istft``.
    """
    full_spectrogram = components.dot(activations)
    return librosa.istft(full_spectrogram * phase)
def mix_by_chromagram(src_path, tgt_paths, n_fft = 4096, hop_length = 1024): print "create source sound" src_sound = Sound(src_path) targets = {} print "create target sounds" if isinstance(tgt_paths, list): for path in tgt_paths: tgt_sound = Sound(tgt_path) targets[path] = tgt_sound else: tgt_sound = Sound(tgt_paths) targets[tgt_paths] = tgt_sound #zeros chromagram zeros = src_sound.getChromagram()[0]*0 #IMPLEMENT cut all arrays such that they have same length! print "create temporary magnitude and phase containers" tmp_mag = None tmp_phase = None ratio = len(src_sound.getSpectra().getMagnitude()) / len(src_sound.getChromagram()[0]) print "block size", ratio #Compute distances print "compute distances" for i in range(len(src_sound.getChromagram()[0]) -1): print "computing frame block", i distance = None closest = zeros for target in targets.values(): try: new_dist = norm(np.transpose(target.getChromagram())[i] - np.transpose(src_sound.getChromagram())[i]) if new_dist < distance or distance == None: distance = new_dist closest = target.spectra except IndexError: print 'IDX Error' try: cap = min(len(closest.getMagnitude(i)) , len(src_sound.spectra.getMagnitude(i))) #Add magnitudes and phases for j in range(ratio): if tmp_mag == None: tmp_mag = src_sound.spectra.getMagnitude(i*ratio + j)[:cap] + closest.getMagnitude(i*ratio+j)[:cap] tmp_phase = src_sound.spectra.getPhase(i*ratio +j)[:cap] + closest.getPhase(i*ratio + j)[:cap] else: tmp_mag = np.vstack((tmp_mag, src_sound.spectra.getMagnitude(i*ratio +j)[:cap] + closest.getMagnitude(i*ratio+j)[:cap])) tmp_phase = np.vstack((tmp_phase, src_sound.spectra.getPhase(i*ratio +j)[:cap] + closest.getPhase(i*ratio + j)[:cap])) except AttributeError: print 'Attribute Error' #Average magnitudes and phases tmp_mag *= 0.5 tmp_phase *= 0.5 signal = librosa.istft(tmp_mag * tmp_phase) librosa.output.write_wav(src_sound.path[:-4]+"-mix.wav", signal, 2*src_sound.sr)
def specs_to_wavs_istft_batch(magnitudes, phases, hop_length):
    """Invert a batch of magnitude/phase spectrograms to waveforms.

    ``combine_magnitdue_phase`` builds the STFT matrices; each matrix along
    the first axis is inverted on its own with ``librosa.istft``.

    Returns:
        ``np.array`` holding one waveform per STFT matrix.
    """
    stft_matrices = combine_magnitdue_phase(magnitudes = magnitudes, phases = phases)
    # Invert one matrix per item; the original passed the whole batch to
    # istft on every iteration.
    wavs = [
        librosa.istft(stft_matrix, hop_length = hop_length)
        for stft_matrix in stft_matrices
    ]
    return np.array(wavs)
def griffinlim(spectrogram, n_iter=50, window='hann', n_fft=2048, win_length=2048, hop_length=-1, verbose=False):
    """Estimate a waveform from a magnitude spectrogram by iterated phase updates.

    Starts from random phases and, ``n_iter`` times, inverts the current
    estimate, re-analyses it and keeps only the new phase.

    Args:
        spectrogram: spectrogram whose magnitude is kept fixed.
        n_iter: number of phase-update iterations.
        window, n_fft, win_length: STFT/ISTFT parameters.
        hop_length: hop length; ``-1`` selects ``n_fft // 4``.
        verbose: show a progress bar with the Frobenius norm of the
            magnitude error.

    Returns:
        The waveform inverted from the final phase estimate.
    """
    if hop_length == -1:
        hop_length = n_fft // 4

    # `np.complex` was only an alias of the builtin and has been removed from
    # NumPy; the magnitude is loop-invariant, so compute it once.
    magnitude = np.abs(spectrogram).astype(complex)
    angles = np.exp(2j * np.pi * np.random.rand(*spectrogram.shape))

    t = tqdm(range(n_iter), ncols=100, mininterval=2.0, disable=not verbose)
    for i in t:
        inverse = librosa.istft(magnitude * angles, hop_length = hop_length, win_length = win_length, window = window)
        rebuilt = librosa.stft(inverse, n_fft = n_fft, hop_length = hop_length, win_length = win_length, window = window)
        angles = np.exp(1j * np.angle(rebuilt))

        if verbose:
            diff = np.abs(spectrogram) - np.abs(rebuilt)
            t.set_postfix(loss=np.linalg.norm(diff, 'fro'))

    return librosa.istft(magnitude * angles, hop_length = hop_length, win_length = win_length, window = window)
def spectrogram2wav(mag, n_fft, win_length, hop_length, num_iters, phase_angle=None, length=None):
    """Turn a magnitude spectrogram into audio by iterating phase estimates.

    When ``phase_angle`` is ``None`` it is drawn as ``pi * rand``.  Each
    iteration inverts ``mag * exp(1j * phase_angle)``; every iteration but
    the last re-analyses the audio to refresh the phase.  Requires
    ``num_iters > 0``.  Returns the audio from the last inversion.
    """
    assert(num_iters > 0)
    if phase_angle is None:
        phase_angle = np.pi * np.random.rand(*mag.shape)
    spec = mag * np.exp(1.j * phase_angle)

    final_step = num_iters - 1
    for step in range(num_iters):
        wav = librosa.istft(spec, win_length=win_length, hop_length=hop_length, length=length)
        if step == final_step:
            break
        reanalysed = librosa.stft(wav, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
        _, phase = librosa.magphase(reanalysed)
        phase_angle = np.angle(phase)
        spec = mag * np.exp(1.j * phase_angle)
    return wav
def hpss_demo(input_file, output_harmonic, output_percussive): '''HPSS demo function. :parameters: - input_file : str path to input audio - output_harmonic : str path to save output harmonic (wav) - output_percussive : str path to save output harmonic (wav) ''' N_FFT = 2048 HOP_LENGTH = N_FFT /4 # 1. Load the wav file, resample print 'Loading ', input_file y, sr = librosa.load(input_file) # 2. generate STFT @ 2048 samples print 'Computing short-time fourier transform... ' D = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH) # 3. HPSS. The default kernel size isn't necessarily optimal, but works okay enough print 'Separating harmonics and percussives... ' harmonic, percussive = librosa.decompose.hpss(D) # 4. Invert STFT print 'Inverting harmonics and percussives... ' y_harmonic = librosa.istft(harmonic, hop_length=HOP_LENGTH) y_percussive = librosa.istft(percussive, hop_length=HOP_LENGTH) # 5. Save the results print 'Saving harmonic audio to: ', output_harmonic librosa.output.write_wav(output_harmonic, y_harmonic, sr) print 'Saving percussive audio to: ', output_percussive librosa.output.write_wav(output_percussive, y_percussive, sr)
def decompose_save(filepath, kernel_size=(5,17), n_fft = 4096, hop_length = 1024):
    """
    Performs Harmonic/Percussive Source Separation on an audio file
    by applying median filters and saves each filtered file and
    a mix of them as an audio file.

    Output files are written next to the input with the last four characters
    of ``filepath`` replaced by ``-harm.wav``, ``-perc.wav`` and ``-mix.wav``.

    ARGS
        filepath: fullpath of audio file <str>
        kernel_size: tuple sized of (harmonic, percussive) filters (<int>,<int>)
        n_fft: FFT size <int>
        hop_length : hop length <int>
    """
    signal, sr = load_signal(filepath)
    D = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
    # Honour the caller's kernel_size instead of a hard-coded (5,17).
    H, P = librosa.decompose.hpss(D, kernel_size=kernel_size)
    # Invert with the same hop length the forward transform used.
    signal_harm = librosa.istft(H, hop_length=hop_length)
    signal_perc = librosa.istft(P, hop_length=hop_length)
    signal_mix = librosa.istft(D, hop_length=hop_length)
    stem = filepath[:-4]
    librosa.output.write_wav(stem+"-harm.wav", signal_harm, sr)
    librosa.output.write_wav(stem+"-perc.wav", signal_perc, sr)
    librosa.output.write_wav(stem+"-mix.wav", signal_mix, sr)
def extract_all_layers(music_path, parameters=None, n_components = 8, beats = None): music, sr, music_stft = load_file(music_path) if beats == 'quantize': beats = quantize_track(music, sr) original_rms = np.sqrt(np.mean(music*music)) layers = [] boundaries = [] template, residual, errors, beats, inflection_point, beat, template_error, start = get_layer(music, sr, 0, beats=beats, parameters=parameters, n_components=n_components) while True: layers.append(librosa.istft(template)) boundaries.append(beats[beat]) print 'LAYER: ' + str(len(layers)) if np.sqrt(np.mean(music*music)) < original_rms/5: print 'Residual rms too low, terminating' break if beat >= len(beats)-8: print 'Went to end of file, terminating' break music = librosa.istft(residual) start = beat template, residual, errors, beats, inflection_point, beat, template_error, start = get_layer(music, sr, start, beats, parameters=parameters) return layers, boundaries, music_stft
def reconstruct_from_magnitude(self, stft_mag, it=100):
    """Estimate a signal whose STFT magnitude is ``stft_mag``.

    Starts from Gaussian noise and runs ``it`` rounds of: analyse, keep the
    phase, combine it with ``stft_mag``, invert.  Uses ``self.hop_length``;
    with ``self.verbose`` the change between successive estimates is printed.
    """
    n_fft = (stft_mag.shape[0] - 1) * 2
    n_samples = (stft_mag.shape[1] - 1) * self.hop_length
    x = np.random.randn(n_samples)

    for _ in range(it):
        current_phase = np.angle(lbr.stft(x, n_fft=n_fft, hop_length=self.hop_length))
        candidate = stft_mag * np.exp(1.0j * current_phase)
        if self.verbose:
            previous = x
        x = lbr.istft(candidate, hop_length=self.hop_length)
        if self.verbose:
            mse = np.sqrt(np.square(x - previous).sum())
            print('MSE between sub- and ultimate iteration: {}'.format(mse))
    return x
def spec_to_wav_batch(stft_matrices, hop_length = None):
    """Invert each STFT matrix of a 3-D batch and return the waveforms as an array.

    The input must have exactly three dimensions; a single matrix should be
    passed to ``librosa.istft`` directly.  Matrices may be complex.
    """
    assert (stft_matrices.ndim == 3), 'Single stft maxtrix uses librosa.istft() directly'
    return np.array([
        librosa.istft(single, hop_length = hop_length)
        for single in stft_matrices
    ])
def __test(infile):
    """Check ``librosa.istft`` against the reference result stored in ``infile``.

    A stored ``hann_w`` of 0 selects ``np.ones`` as window with a window
    length of ``2 * (rows - 1)``; otherwise the default window is used with
    ``hann_w`` as its length.
    """
    DATA = load(infile)
    hann_w = DATA['hann_w'][0,0]

    if hann_w == 0:
        window, win_length = np.ones, 2 * (DATA['D'].shape[0] - 1)
    else:
        window, win_length = None, hann_w

    Dinv = librosa.istft(DATA['D'],
                         hop_length=DATA['hop_length'][0,0].astype(int),
                         win_length=win_length,
                         window=window)
    assert np.allclose(Dinv, DATA['Dinv'])
def SaveAudio(file_path, mag, phase) :
    """Invert ``mag * phase`` and write it, normalised, to ``file_path``.

    Uses the module-level ``window_size``, ``hop_length`` and ``SR``.
    """
    spectrum = mag * phase
    waveform = librosa.istft(spectrum, win_length=window_size, hop_length=hop_length)
    librosa.output.write_wav(file_path, waveform, SR, norm=True)
    print(file_path + " Save complete!!")
def ac_tempogram(y: np.ndarray) -> np.ndarray:
    """Inverse-STFT the squared ``delta_spectral(y)`` and reshape to 1025 rows.

    Window and hop length are both 2048.
    """
    frame = 2048
    squared = delta_spectral(y)**2
    inverted = librosa.istft(squared, win_length=frame, hop_length=frame)
    return inverted.reshape(1025, -1)
def decode(self, A):
    """Return audio for activations ``A``: ``self.comps.dot(A)`` with randomised phase, inverted."""
    spectrogram = self.comps.dot(A)
    with_phase = randomise_phase(spectrogram)
    return librosa.istft(with_phase)
def create_audio_from_spectrogram(spec):
    """Evaluate the transposed tensor ``spec`` and invert it with ``Config.hop_length``."""
    transposed = tf.transpose(spec)
    spec_transposed = transposed.eval()
    return librosa.istft(spec_transposed, Config.hop_length)
def _istft(y):
    """Inverse STFT of ``y`` using the hop length from ``get_hop_size()``."""
    hop = get_hop_size()
    return librosa.istft(y, hop_length=hop)
optimizer=keras.optimizers.Adam(), metrics=['accuracy']) model.fit(noisyInput, cleanOutput, epochs=20, validation_split=0.2) #-------------------------------------------------------------------------------------------------------------------- #-------------------------------------------------------reconstruct-------------------------------------------------- #-------------------------------------------------------------------------------------------------------------------- reconstructed = model.predict(noisyInput) reconstructed = reconstructed.reshape(reconstructed.shape[0] // 626, 626, 155) noisy_phase = noisy_phase.reshape(noisy_phase.shape[0] // 129, 129, 626) #reconstructed=cleanOutput print(reconstructed.shape) for k in range(reconstructed.shape[0]): suma = [] for i in range(reconstructed.shape[1]): for j in range(129): reconstructed[k][i][j] = math.sqrt(math.exp( reconstructed[k][i][j])) suma.append(reconstructed[k][i]) suma = np.array(suma) suma = suma.T the_real_STFT = suma[:-26:] print('the_rere', the_real_STFT.shape, the_real_STFT) the_rec_stft = the_real_STFT * noisy_phase[k] the_rec_signal = librosa.istft(the_rec_stft, hop_length=128) # all_sounds[k]=the_rec_signal scipy.io.wavfile.write('recSignal_{}.wav'.format(k), 8000, the_rec_signal)
grad_values = np.copy(self.grad_values) self.loss_value = None self.grad_values = None return grad_values evaluator = Evaluator() # run scipy-based optimization (L-BFGS) over the pixels of the generated image # so as to minimize the neural style loss x = base_array for i in range(iterations): print('Start of iteration', i) print('sr:') print(base_sr) start_time = time.time() x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x.flatten(), fprime=evaluator.grads, maxfun=20) print('Current loss value:', min_val) # save current generated image img = deprocess_image(x.copy(), base_phases, img_nrows, img_ncols) out = librosa.istft(img) fname = result_prefix + '_at_iteration_%d.wav' % i pysndfile.sndio.write(fname, out, rate=base_sr, format='wav', enc='pcm16') end_time = time.time() print('Image saved as', fname) print('Iteration %d completed in %ds' % (i, end_time - start_time))
def _istft(self, y):
    """Inverse STFT of ``y`` with this object's hop and window lengths."""
    settings = {
        'hop_length': self.hop_length,
        'win_length': self.win_length,
    }
    return librosa.istft(y, **settings)
def to_wav(mag, phase, len_hop=ModelConfig.L_HOP):
    """Combine ``mag`` and ``phase`` with ``get_stft_matrix`` and invert each item.

    Returns an ``np.array`` with one waveform per matrix.
    """
    stft_matrix = get_stft_matrix(mag, phase)
    waveforms = [librosa.istft(single, hop_length=len_hop) for single in stft_matrix]
    return np.array(waveforms)
def main():
    """Separate mixed speech with a trained Audio_Visual_Net and write wav files.

    Parses command-line arguments, loads the model weights, builds a test set
    from the mixture audio and the face CSV files, runs the model over
    consecutive segments and writes two separated signals per item to
    ``--result_dir``.
    """
    # ===== Arguments ===== #
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", "-g", type=int, default=-1, help="specify GPU")
    parser.add_argument("--model_path", "-m", type=str)
    parser.add_argument("--units", "-u", type=int, default=5000, help="# of FC units")
    parser.add_argument("--data_visual", type=str, default=DATA_DIR_VISUAL)
    parser.add_argument("--data_speech", type=str, default=DATA_DIR_SPEC)
    parser.add_argument("--result_dir", type=str, default="RESULT/separation/")
    args = parser.parse_args()
    # ===== GPU or CPU ===== #
    # xp is the array module used below: cupy on GPU, numpy otherwise.
    if args.gpu >= 0:
        xp = cuda.cupy
        cuda.get_device(args.gpu).use()
    else:
        xp = np
    # ===== Load model ===== #
    print("loading model...")
    model = Audio_Visual_Net(spec_len=SPEC_LEN, gpu=args.gpu, num_fusion_units=args.units)
    if args.gpu >= 0:
        model.to_gpu(args.gpu)
    # Paths containing "snapshot" are loaded from the updater sub-path.
    if args.model_path.find("snapshot") > -1:
        chainer.serializers.load_npz(args.model_path, model, path="updater/model:main/")
    else:
        chainer.serializers.load_npz(args.model_path, model)
    # ===== Load test data ===== #
    print("loading test data...")
    spec_input = sorted(glob.glob(os.path.join(args.data_speech, "*.npz")))
    vis_input = sorted(glob.glob(os.path.join(args.data_visual, "*")))
    #assert len(spec_input)==len(vis_input), "# of files are different between faces and audios."
    l_input = len(spec_input)
    test = []
    # The globbed lists are replaced by the fixed names 0..4; l_input keeps
    # the globbed count.
    spec_input = [ os.path.join(args.data_speech, "{}.npz".format(i)) for i in range(5) ]
    vis_input = [ os.path.join(args.data_visual, "{}".format(i)) for i in range(5) ]
    for i in range(5):
        _num = int(os.path.basename(spec_input[i]).split(".")[0])
        _spec_input_mix, _phase = LoadAudio( fname=os.path.join(DATA_DIR_MIX, "{}.wav".format(_num)))
        # Add a leading axis to the transposed magnitude and phase.
        _mag = _spec_input_mix.T[np.newaxis, :, :]
        _phase = _phase.T[np.newaxis, :, :]
        # NOTE(review): both CSVs are read from vis_input[0] on every
        # iteration, not vis_input[i] -- confirm this is intended.
        _vis_input1 = xp.array( pd.read_csv(os.path.join(vis_input[0], "speech1.csv"), header=None)).astype(xp.float32) / 255.
        _vis_input2 = xp.array( pd.read_csv(os.path.join(vis_input[0], "speech2.csv"), header=None)).astype(xp.float32) / 255.
        _vis_input1 = _vis_input1.T[:, :, np.newaxis]
        _vis_input2 = _vis_input2.T[:, :, np.newaxis]
        test.append((_mag, _vis_input1, _vis_input2, _phase))
    # ===== Separate mixed speeches ===== #
    print("start saparating...")
    if not os.path.exists(args.result_dir):
        os.makedirs(args.result_dir)
    with chainer.using_config("train", False):
        # NOTE(review): iterates over l_input items although `test` holds
        # exactly 5 entries; this raises IndexError when more than 5 .npz
        # files were globbed -- confirm.
        for i in range(l_input):
            print("{}/{}".format(i + 1, l_input))
            # NOTE(review): `//` already floors, so math.ceil has no effect
            # and a trailing partial segment is not processed.
            loop = int(math.ceil(test[i][0].shape[1] // SPEC_LEN))
            speech1 = []
            speech2 = []
            phase = xp.array(test[i][3][0, :, :].T)
            for l in range(loop):
                # we have to reshape test data because we must add batch size dimension
                _spec = test[i][0][np.newaxis, :, (SPEC_LEN * l):(SPEC_LEN * (l + 1)), :]
                _face1 = test[i][1][np.newaxis, :, (FACE_LEN * l):(FACE_LEN * (l + 1)), :]
                _face2 = test[i][2][np.newaxis, :, (FACE_LEN * l):(FACE_LEN * (l + 1)), :]
                y = model.separateSpectrogram(spec=_spec, face1=_face1, face2=_face2)
                y = y.data
                # The last axis of the output is split at index 257 into the
                # two per-speaker parts.
                mask1 = xp.array(y[0, :, :257].T)
                mask2 = xp.array(y[0, :, 257:].T)
                _phase = phase[:, (SPEC_LEN * l):(SPEC_LEN * (l + 1))]
                # Multiply by the mixture phase segment and move to host memory.
                d1 = chainer.cuda.to_cpu(mask1 * _phase)
                d2 = chainer.cuda.to_cpu(mask2 * _phase)
                speech1.append( istft(d1, hop_length=HOP_LEN, win_length=FFT_SIZE))
                speech2.append( istft(d2, hop_length=HOP_LEN, win_length=FFT_SIZE))
            # Join the per-segment audio into one signal per speaker.
            speech1 = np.concatenate(speech1)
            speech2 = np.concatenate(speech2)
            write_wav(path="{}/{}-speech1.wav".format(args.result_dir, i), y=speech1, sr=SR, norm=True)
            write_wav(path="{}/{}-speech2.wav".format(args.result_dir, i), y=speech2, sr=SR, norm=True)
    print("done!!")
plt.subplot(3, 1, 2) background = librosa.amplitude_to_db(S_background, ref=np.max) librosa.display.specshow(background, y_axis='log', sr=sr) plt.title('Background') plt.colorbar() plt.subplot(3, 1, 3) foreground = librosa.amplitude_to_db(S_foreground, ref=np.max) librosa.display.specshow(foreground, y_axis='log', x_axis='time', sr=sr) plt.title('Foreground') plt.colorbar() plt.tight_layout() plt.show() full_audio = librosa.istft(S_full) foreground_audio = librosa.istft(S_foreground) background_audio = librosa.istft(S_background) #################################################### # Print out some metadata of the original audio and the 3 derived streams print("sr: {}".format(sr)) print("orig({}) max {} power {}: {}".format(len(source_audio), audioop.max(source_audio, 2), audioop.rms(source_audio, 2), source_audio)) print("full({}) max {} power {}: {}".format(len(full_audio), audioop.max(background_audio, 2), audioop.rms(full_audio, 2), full_audio)) print("foreground({}) max {} power {}: {}".format(
margin_i * (S_full - S_filter), power=power) mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power) # Once we have the masks, simply multiply them with the input spectrum # to separate the components S_foreground = mask_v * S_full S_background = mask_i * S_full # get audio from the foreground audio d_foreground = S_foreground * phase y_hat = librosa.istft(d_foreground) librosa.output.write_wav(dest_vocal_filename, y_hat, sr=sr) # get audio from the background audio d_background = S_background * phase y_hat = librosa.istft(d_background) librosa.output.write_wav(dest_bg_filename, y_hat, sr=sr) # sphinx_gallery_thumbnail_number = 2 plt.figure(figsize=(12, 8)) plt.subplot(3, 1, 1) librosa.display.specshow(librosa.amplitude_to_db(S_full[:, idx], ref=np.max), y_axis='log', sr=sr) plt.title('Full spectrum')
def core(input_path, output_path, output_sr=48000, inter_sr=1, test_mode=False, opti_mode=True, dyn_protect=True, harmonic_hpfc=6000, harmonic_sft=16000, harmonic_gain=1.2, percussive_hpfc=6000, percussive_stf=16000, percussive_gain=2.5, update=None, msgbox=None):
    """Process an audio file per channel with HPSS-based spectrum shifting and write it.

    Each channel is resampled to ``output_sr * inter_sr`` and split into
    harmonic and percussive spectra; both are high-pass filtered, shifted
    along the frequency axis and scaled by ``hpd_n_shift``, then added back
    to the channel.  The result is resampled to ``output_sr`` and written to
    ``output_path``.  Unless ``opti_mode`` is false, ``optimizer`` is called
    afterwards.

    ``update`` and ``msgbox`` are used through ``.emit(...)``.
    NOTE(review): both default to None, but ``update.emit`` is called
    unconditionally inside ``hpd_n_shift`` -- callers presumably always pass
    them; confirm.
    """
    def hpd_n_shift(data, lpf, sft, gain):
        # High-pass filter `data` (an STFT matrix), shift its bins by `sft`
        # Hz, high-pass again and scale by `gain`.
        sr = output_sr * inter_sr
        # High-pass filter
        b, a = signal.butter(3, lpf / (sr / 2), 'high')
        data = librosa.stft(signal.filtfilt(b, a, librosa.istft(data)))
        # Copy the spectrum
        for i in range(data.shape[1]):
            update.emit(i / data.shape[1])
            shift = sft
            shift_point = round(shift / (sr / data.shape[0]))
            # Modulation
            # NOTE(review): `chan` is not a parameter; it resolves to the
            # loop variable of the enclosing `for chan in stft_list` and is
            # used here only for its column length.
            for p in reversed(range(len(chan[:, i]))):
                data[:, i][p] = data[:, i][p - shift_point]
        # High-pass filter
        data = librosa.stft(signal.filtfilt(b, a, librosa.istft(data)))
        data *= gain
        return data
    # Dyn Protect Tips
    if dyn_protect:
        msgbox.emit("提示", "动态范围保护特性已启用\n", 1)
    # Load the audio
    y, sr = librosa.load(input_path, mono=False, sr=None)
    if test_mode:
        # Reload only 5 seconds starting at the middle of the file.
        y, sr = librosa.load(input_path, mono=False, sr=None, offset=round(len(y[0]) / sr / 2), duration=5)
    y = resampy.resample(y, sr, output_sr * inter_sr, filter='kaiser_fast')
    # Generate the STFT spectra
    stft_list = [librosa.stft(chan) for chan in y]
    # Harmonic enhancement mode
    for chan in stft_list:
        D_harmonic, D_percussive = librosa.decompose.hpss(chan, margin=4)
        D_harmonic = hpd_n_shift(D_harmonic, harmonic_hpfc, harmonic_sft, harmonic_gain)
        D_percussive = hpd_n_shift(D_percussive, percussive_hpfc, percussive_stf, percussive_gain)
        # `chan` is updated in place so that stft_list sees the change.
        if not dyn_protect:
            chan += D_harmonic + D_percussive
        else:
            # Dynamic range protection
            adp = D_harmonic + D_percussive
            adp_power = np.mean(np.abs(adp))
            src_power = np.mean(np.abs(chan))
            # Weight of the source shrinks as the added part's mean
            # magnitude grows relative to it.
            src_f = 1 - (adp_power / src_power)
            adp += src_f * chan
            chan *= 0
            chan += adp
    # Merge the output
    istft_list = [librosa.istft(chan) for chan in stft_list]
    final_data = resampy.resample(np.array(istft_list), output_sr * inter_sr, output_sr, filter='kaiser_fast')
    try:
        librosa.output.write_wav(output_path, final_data, output_sr)
    except PermissionError:
        msgbox.emit("警告", "无法写入文件,请检查目标路径写入权限" \
                    "以及文件是否已被其他程序开启。", 0)
    # Parameter optimization
    # NOTE(review): layout reconstructed from collapsed source; read as
    # "return early unless opti_mode, otherwise run optimizer" -- confirm.
    if not opti_mode:
        return
    optimizer(input_path, output_path, percussive_hpfc, percussive_stf, percussive_gain, msgbox)
noisy_speech_time, sr = librosa.load(NOISY_PATH, sr=None) noisy_speech = 10 * np.log10( np.abs( librosa.stft( noisy_speech_time, n_fft=512, hop_length=160, win_length=320))) noisy_speech = normalize(noisy_speech, test_min, test_max) noisy_phase = librosa.stft(noisy_speech_time, n_fft=512, hop_length=160, win_length=320) print('Saving predicted....') predicted = model.predict(noisy_speech.T) librosa.output.write_wav( '/N/u/anakuzne/Carbonate/dl_for_speech/HW3_II/py/models/normIRM_predicted.wav', librosa.istft(predicted.T * noisy_speech + np.angle(noisy_phase)), sr, norm=False) print('Saving figure...') fig1 = plt.figure(figsize=(10, 5)) plt.imshow(np.abs(predicted), aspect="auto", origin="lowest", extent=[0, 311, 0, 8000]) plt.xlabel("No. of samples") plt.ylabel("Frequency") plt.title("normIRM") plt.savefig( '/N/u/anakuzne/Carbonate/dl_for_speech/HW3_II/py/models/norm_irm_predicted.png',
import scipy.io.wavfile as wave import numpy as np import librosa (rate, x) = wave.read('spring_16k.wav') print np.sum(x) x = x.T print np.sum(librosa.istft(librosa.stft(x), dtype=np.int32))
def to_wav_from_spec(stft_maxrix, len_hop=ModelConfig.L_HOP):
    """Invert every STFT matrix in ``stft_maxrix`` and return the waveforms as an array."""
    waveforms = [librosa.istft(single, hop_length=len_hop) for single in stft_maxrix]
    return np.array(waveforms)
# Evaluation script: predict magnitudes for the stored test spectrograms,
# rebuild audio with the mixture phase, score it with bss_eval_sources and
# save the estimate.
data = np.load('te_data_and_lable.npz')
test_data = data['a']
# Scale so the largest absolute value is 1.
test_data = test_data / np.max(np.abs(test_data))
phase_of_test_data = np.angle(test_data)
#test_lable=data['d']
#phase_of_test_data=np.angle(test_lable)
model = load_model('my_modle.h5')
model.load_weights('./best_weights.hdf5')
estimated_magnitude = model.predict(np.abs(test_data))
#estimated_magnitude= np.abs(test_lable)
#phase=np.angle(test_data)
# Predicted magnitude combined with the input phase.
pre = estimated_magnitude * np.exp(1j * phase_of_test_data)
# Merge the first two axes, then transpose for istft.
estimate = np.reshape(pre, (pre.shape[0] * pre.shape[1], pre.shape[2]))
#estimate1=np.reshape(pre,(pre.shape[2],pre.shape[0]*pre.shape[1]))
estimate = librosa.istft(estimate.T, hop_length=512)
#sd.play(estimate)
# NOTE(review): te_lable and te_data are not defined in this excerpt;
# presumably time-domain target and mixture defined elsewhere -- confirm.
te_lable = te_lable[:len(estimate)]
te_data = te_data[:len(estimate)]
# Row 0: target; row 1: mixture minus target.
groundtruth = np.zeros((2, len(estimate)))
groundtruth[0, :] = te_lable
groundtruth[1, :] = te_data - te_lable
# Same layout for the estimate.
estim = np.zeros((2, len(estimate)))
estim[0, :] = estimate
estim[1, :] = te_data - estimate
(sdr, sir, sar, perm) = separation.bss_eval_sources(groundtruth, estim)
print("sdr={},sar={}".format(sdr, sar))
librosa.output.write_wav('estimate_test_data.wav', estimate, 44100)
import librosa.display
import scipy
from scipy import signal
import matplotlib.pyplot as plt
import wave
import struct
import os


def show_spect(spect, fs, file):
    """Draw ``spect`` with ``specshow`` and save it as ``<file stem>.png``."""
    librosa.display.specshow(spect, sr=fs)
    # The stem is everything before the first '.' in `file`.
    plt.savefig(file.split('.')[0] + '.png')


file = "test.npy"
spect = np.load(file)
spect = np.reshape(spect, (257, 301))
show_spect(spect, 16000, file)
# Griffin-Lim implementation
A = librosa.db_to_amplitude(spect)
# Start from zero phase.
theta = 0
X = A * np.cos(theta) + A * np.sin(theta) * 1j
for i in range(100):
    x = librosa.istft(X, hop_length=160, win_length=400)
    X = librosa.stft(x, n_fft=512, hop_length=160, win_length=400)
    # Keep the new phase, restore the target amplitude.
    # NOTE(review): divides by np.abs(X) with no guard for zero bins.
    X = A * X / np.abs(X)
librosa.output.write_wav(file.split('.')[0] + '-reconstruct.wav', x, 16000)
def _istft(self, y):
    """Inverse STFT of ``y`` with the hop and window lengths from ``self._stft_parameters()``."""
    parameters = self._stft_parameters()
    _, hop, win = parameters
    return librosa.istft(y, hop_length=hop, win_length=win)
def _istft(y, n_fft, hop_length, win_length, use_tensorflow=False):
    """Inverse STFT of ``y`` via librosa, or via ``_istft_tensorflow`` when requested.

    The TensorFlow path receives ``y.T`` and additionally ``n_fft``.
    """
    if not use_tensorflow:
        return librosa.istft(y, hop_length, win_length)
    return _istft_tensorflow(y.T, n_fft, hop_length, win_length)
def invert_spectrogram(spectrogram):
    '''Inverse STFT with hop 160, window length 320 and a Hamming window.

    spectrogram: [f, t]
    '''
    hop = 160
    return librosa.istft(spectrogram, hop, win_length=320, window="hamming")
def _istft_librosa(y, hop_length, win_length):
    """Inverse STFT of ``y`` through ``librosa.istft``."""
    waveform = librosa.istft(y, hop_length, win_length)
    return waveform
def _istft(y, hparams):
    """Inverse STFT of ``y`` with hop ``get_hop_size(hparams)`` and window ``hparams.win_size``."""
    hop = get_hop_size(hparams)
    return librosa.istft(y, hop_length=hop, win_length=hparams.win_size)
def hp_sep(y):
    """Return the harmonic and percussive parts of ``y`` as audio signals."""
    harmonic_spec, percussive_spec = librosa.decompose.hpss(librosa.stft(y))
    harmonic_audio = librosa.istft(harmonic_spec)
    percussive_audio = librosa.istft(percussive_spec)
    return harmonic_audio, percussive_audio
def reconstruct_wave(magnitude, phase):
    """Return the inverse STFT of ``magnitude * phase``."""
    return librosa.istft(magnitude * phase)
def phase_restore(mag, random_phases, N):
    """Refine a phase estimate for ``mag`` over ``N`` invert/re-analyse rounds.

    Starts from ``exp(1j * random_phases)``; each round inverts ``mag * p``
    and takes the phase of its STFT (``config.N_FFT``).  Returns the final
    phase factor.
    """
    phase = np.exp(1j * (random_phases))
    for _ in range(N):
        audio = librosa.istft(mag * phase)
        reanalysed = librosa.stft(audio, n_fft=config.N_FFT)
        _, phase = librosa.magphase(reanalysed)
    return phase
def invert_spectrogram(spectrogram):
    '''Inverse STFT with ``hp.hop_length``, ``hp.win_length`` and a Hann window.

    Args:
      spectrogram: [1+n_fft//2, t]
    '''
    hop = hp.hop_length
    return librosa.istft(spectrogram, hop, win_length=hp.win_length, window="hann")
def _istft(y, sr):
    """Inverse STFT of ``y`` with the hop and window lengths from ``_stft_parameters(sr)``."""
    parameters = _stft_parameters(sr)
    _, hop, win = parameters
    return librosa.istft(y, hop_length=hop, win_length=win)
def istft_transform_clean(teX, IBM):
    """Return the inverse STFT (hop 512) of ``teX * IBM``."""
    masked = teX * IBM
    return librosa.istft(masked, hop_length=512)
def get_istft(X, time_shape, hop_length=256, **kwargs):
    """Inverse STFT of ``X`` with output length ``time_shape``.

    Extra keyword arguments are forwarded to ``librosa.istft``.
    """
    options = dict(hop_length=hop_length, length=time_shape)
    return librosa.istft(X, **options, **kwargs)