def encode(data_file, output_file, key_file=None):
    print('* * encoding message in audio file...')
    data_file_size = os.path.getsize(data_file)
    if key_file is not None:
        signal, sr = librosa.load(key_file, sr=RATE)
    else:
        signal = make_sinewave(1, math.ceil(data_file_size / 20.), RATE)
    spec = stft(signal, WINDOW_LENGTH, HOP_SIZE)
    print('data file size:', data_file_size)
    print('spec shape', spec.shape)
    # Read the payload one byte at a time; each byte value picks the
    # frequency bin to boost in the corresponding STFT frame.
    with open(data_file, 'rb') as dfile:
        d = dfile.read(1)
        i = 0
        while d:
            h = d[0]  # byte value 0-255 (Python 3 replacement for int(d.encode("hex"), 16))
            peak = np.max(np.abs(spec[:, i]))
            if key_file is not None:
                spec[h][i] = peak + 200
            else:
                spec[h][i] = peak * 200
            # Suppress the neighboring bins so the marker stands out
            spec[h - 1][i] = 0
            spec[h + 1][i] = 0
            d = dfile.read(1)
            i += 1
    spec = spec[:, :i]
    spec = add_start_stop(spec)
    wavwrite(output_file, istft(spec, 1024, 2048), RATE)

def audio2spec(audio_list, window_size, window_overlap, n_fft):
    """
    Args:
        audio_list: a numpy array of audio samples, with dimensions (naudio, nsample)
        window_size: the size of the stft window, in samples
        window_overlap: amount of window overlap, in samples
        n_fft: size of windowed signal after zero padding

    Returns:
        spec_tens: np array of spectrograms,
            shape=(naudio, 1 + n_fft // 2, 1 + nsamples // window_overlap)
    """
    naudio, nsamples = audio_list.shape
    spec_tens = np.zeros(shape=(naudio, 1 + n_fft // 2,
                                1 + nsamples // window_overlap))
    for idx, audio in enumerate(tqdm(audio_list)):
        # Compute the STFT once per clip and store its magnitude
        # (the original computed the same STFT twice).
        stf = lc.stft(audio, win_length=window_size,
                      hop_length=window_overlap, n_fft=n_fft)
        spec_tens[idx] = np.abs(stf)
    return spec_tens

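# Usage sketch for audio2spec (illustrative parameter values; assumes `lc`
# is librosa and the function above is in scope). Builds a small batch of
# sine tones and checks the resulting tensor shape.
import numpy as np
import librosa as lc
from tqdm import tqdm

sr = 22050
t = np.arange(sr) / sr
batch = np.stack([np.sin(2 * np.pi * f * t) for f in (220, 440, 880, 1760)])
specs = audio2spec(batch, window_size=2048, window_overlap=512, n_fft=2048)
print(specs.shape)  # (4, 1025, 44): 1 + 2048//2 bins, 1 + 22050//512 frames
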
def main(argv):
    os.makedirs(FLAGS.output_dir, exist_ok=True)

    # Initialize model
    unet = Unet()
    restore(net=unet, ckpt_path=FLAGS.ckpt_path)

    # Load data
    mix_wav, _ = load(FLAGS.original_wav, sr=SAMPLE_RATE)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]

    # Load ground truth
    if FLAGS.gt:
        gt_wav, _ = load(FLAGS.original_gt, sr=SAMPLE_RATE)
        gt_wav_mag, gt_wav_phase = magphase(
            stft(gt_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
        gt_wav_mag = gt_wav_mag[:, START:END]
        gt_wav_phase = gt_wav_phase[:, START:END]

    # Save input spectrogram image and ground truth
    write_wav(FLAGS.output_dir + 'original_mix.wav',
              istft(mix_wav_mag * mix_wav_phase,
                    win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
              SAMPLE_RATE, norm=True)
    spectogram_librosa(FLAGS.output_dir + 'original_mix.wav', 0)
    if FLAGS.gt:
        write_wav(FLAGS.output_dir + 'gt.wav',
                  istft(gt_wav_mag * gt_wav_phase,
                        win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
                  SAMPLE_RATE, norm=True)
        spectogram_librosa(FLAGS.output_dir + 'gt.wav', 0)

    # Run the network: drop the DC bin so the input is 512 x 128
    inputs = mix_wav_mag[1:].reshape(1, 512, 128, 1)
    mask = unet(inputs).numpy().reshape(512, 128)
    predict = inputs.reshape(512, 128) * mask

    # Evaluation metrics
    if FLAGS.gt:
        expand_pre = np.expand_dims(predict.flatten(), axis=0)
        expand_gt = np.expand_dims(gt_wav_mag[1:].flatten(), axis=0)
        expand_input = np.expand_dims(inputs.flatten(), axis=0)
        (SDR, SIR, SAR, _) = mir_eval.separation.bss_eval_sources(expand_gt, expand_pre)
        (SDR2, _, _, _) = mir_eval.separation.bss_eval_sources(expand_gt, expand_input)
        NSDR = SDR - SDR2  # SDR(Se, Sr) - SDR(Sm, Sr)
        metrics = ('*****SDR = ' + str(SDR) + ', SIR = ' + str(SIR) +
                   ', SAR = ' + str(SAR) + ', NSDR = ' + str(NSDR) + '*****')
        print(metrics)
        with open(FLAGS.output_dir + 'metrics.txt', 'a') as fout:
            fout.write(metrics)

    # Convert model output to target magnitude: restore the dropped DC bin
    target_pred_mag = np.vstack((np.zeros(128), predict))

    # Write vocal prediction audio file
    write_wav(FLAGS.output_dir + 'pred_vocal.wav',
              istft(target_pred_mag * mix_wav_phase,
                    win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
              SAMPLE_RATE, norm=True)
    spectogram_librosa(FLAGS.output_dir + 'pred_vocal.wav', 1)

def gl_rec(mag_stft, hop, wlen, init_rec, n_iter=40):
    """Griffin-Lim reconstruction: iteratively re-estimate phase from a
    magnitude STFT, starting from the phase of init_rec.
    Note: `nfft` is read from module scope."""
    rec = 1.0 * init_rec
    rec_stft = core.stft(rec, n_fft=nfft, hop_length=hop, win_length=wlen)
    angles = rec_stft / np.abs(rec_stft)
    for i in range(n_iter):
        # Keep the (slightly compressed) target magnitude, update the phase
        rec = core.istft(np.abs(mag_stft**1.2) * angles, hop, wlen)
        rec_stft = core.stft(rec, n_fft=nfft, hop_length=hop, win_length=wlen)
        angles = rec_stft / np.abs(rec_stft)
    return rec

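# Usage sketch for gl_rec: reconstruct a tone from its magnitude STFT,
# starting from white noise. `nfft` must exist at module scope because
# gl_rec reads it as a global (parameter values here are hypothetical).
import numpy as np
from librosa import core

nfft, hop, wlen = 1024, 256, 1024
sr = 16000
y = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
mag = np.abs(core.stft(y, n_fft=nfft, hop_length=hop, win_length=wlen))
rec = gl_rec(mag, hop, wlen, np.random.randn(len(y)), n_iter=60)
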
def comp_lsd(ref_file, pred_file):
    """Log-spectral distance between two audio files.
    Note: `sr`, `nfft`, `hop`, and `wlen` are read from module scope."""
    ref = core.load(ref_file, sr=sr)[0]
    pred = core.load(pred_file, sr=sr)[0]
    stft_ref = np.abs(
        core.stft(ref, n_fft=nfft, hop_length=hop, win_length=wlen))
    stft_pred = np.abs(
        core.stft(pred, n_fft=nfft, hop_length=hop, win_length=wlen))
    logstft_ref = np.log(0.1 + stft_ref)
    logstft_pred = np.log(0.1 + stft_pred[:, :stft_ref.shape[1]])
    # Average the per-frame spectral distance over frequency bins 7-219
    lsd = np.mean(
        np.sqrt(np.sum((logstft_ref[7:220] - logstft_pred[7:220])**2, axis=0)))
    return lsd

def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100):
    """Extract features and save them as (1, 512, PATCH_LENGTH) patches."""
    y_mix = resample(y_mix, original_sr, C.SR)
    y_vocal = resample(y_vocal, original_sr, C.SR)
    y_inst = resample(y_inst, original_sr, C.SR)
    S_mix = np.abs(stft(y_mix, n_fft=C.FFT_SIZE,
                        hop_length=C.H)).astype(np.float32)
    S_vocal = np.abs(stft(y_vocal, n_fft=C.FFT_SIZE,
                          hop_length=C.H)).astype(np.float32)
    S_inst = np.abs(stft(y_inst, n_fft=C.FFT_SIZE,
                         hop_length=C.H)).astype(np.float32)
    # Normalize all three spectrograms by the mixture's peak
    norm = S_mix.max()
    S_mix /= norm
    S_vocal /= norm
    S_inst /= norm
    # np.savez(os.path.join(C.PATH_FFT, fname+".npz"),
    #          mix=S_mix, vocal=S_vocal, inst=S_inst)
    # Generate sequences of shape (1, 512, PATCH_LENGTH) and save
    # (the DC bin is dropped with [1:, ...])
    cnt = 1
    i = 0
    while i + C.PATCH_LENGTH < S_mix.shape[1]:
        mix_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        # vocal_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        inst_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        mix_spec[0, :, :] = S_mix[1:, i:i + C.PATCH_LENGTH]
        # vocal_spec[0, :, :] = S_vocal[1:, i:i + C.PATCH_LENGTH]
        inst_spec[0, :, :] = S_inst[1:, i:i + C.PATCH_LENGTH]
        np.savez(os.path.join(C.VAL_PATH_FFT, fname + str(cnt) + ".npz"),
                 data=mix_spec, label=inst_spec)
        i += C.PATCH_LENGTH
        cnt += 1
    # Final patch: take the last PATCH_LENGTH frames so the tail is covered
    if S_mix.shape[1] >= 128:
        mix_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        # vocal_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        inst_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        mix_spec[0, :, :] = S_mix[1:, S_mix.shape[1] - C.PATCH_LENGTH:S_mix.shape[1]]
        # vocal_spec[0, :, :] = S_vocal[1:, S_mix.shape[1] - C.PATCH_LENGTH:S_mix.shape[1]]
        inst_spec[0, :, :] = S_inst[1:, S_mix.shape[1] - C.PATCH_LENGTH:S_mix.shape[1]]
        np.savez(os.path.join(C.VAL_PATH_FFT, fname + str(cnt) + ".npz"),
                 data=mix_spec, label=inst_spec)
        cnt += 1

def dual_stft(signal_0, signal_1, window_size, hop_percentage):
    """STFT of two signals with a hop given as a percentage of the window."""
    hop_length = int(hop_percentage * window_size / 100)
    Zxx_0 = lc.stft(signal_0, n_fft=window_size, hop_length=hop_length)
    Zxx_1 = lc.stft(signal_1, n_fft=window_size, hop_length=hop_length)
    n_frames = Zxx_0.shape[1]
    n_freqs = Zxx_0.shape[0]
    print('Number of frames: {}'.format(n_frames))
    print('Frequency resolution: {}'.format(n_freqs))
    return Zxx_0, Zxx_1, n_freqs, n_frames, hop_length

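# Usage sketch for dual_stft on two synthetic channels (assumes `lc` is
# librosa and dual_stft is in scope; values are illustrative).
import numpy as np

sr = 22050
t = np.arange(sr) / sr
left = np.sin(2 * np.pi * 440 * t)
right = np.sin(2 * np.pi * 660 * t)
Zxx_0, Zxx_1, n_freqs, n_frames, hop = dual_stft(
    left, right, window_size=2048, hop_percentage=25)
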
def pre_sff(self):
    feature_path = os.path.join(self.dataset['feature_path'], 'pre_sff')
    if not os.path.exists(feature_path):
        os.mkdir(feature_path)
    x_train = []
    y_train = []
    f_train = []
    for i, row in self.dataset.train_data.iterrows():
        print('[Train] {}) Getting pre_sff from {}...'.format(
            i, row['cur_name']), end='')
        wav_name = os.path.join(self.dataset['data_path'], row['cur_name'])
        sr, wav_data = wavfile.read(wav_name)
        spec = stft(buf_to_float(wav_data), n_fft=800, hop_length=160,
                    win_length=320)[:200, :]  # keep bins up to 4 kHz
        spec = np.log(np.abs(spec) + 1e-10)
        spec -= np.min(spec)
        x_train.append(spec)
        y_train.append(self._build_multilabel(row))
        f_train.append(row['cur_name'])
        print('done.')
    x_test = []
    y_test = []
    f_test = []
    for i, row in self.dataset.test_data.iterrows():
        print('[Test] {}) Getting sff from {}...'.format(
            i, row['cur_name']), end='')
        wav_name = os.path.join(self.dataset['data_path'], row['cur_name'])
        sr, wav_data = wavfile.read(wav_name)
        spec = stft(buf_to_float(wav_data), n_fft=800, hop_length=160,
                    win_length=320)[:200, :]  # keep bins up to 4 kHz
        spec = np.log(np.abs(spec) + 1e-10)
        spec -= np.min(spec)
        x_test.append(spec)
        y_test.append(self._build_multilabel(row))
        f_test.append(row['cur_name'])
        print('done.')
    self._save_pickles(feature_path, x_train, y_train, f_train,
                       x_test, y_test, f_test)

def Savespec(y_mix, y_inst, fname):
    S_mix = np.abs(
        stft(y_mix, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    S_inst = np.abs(
        stft(y_inst, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    # Estimate the vocal magnitude as the rectified residual mix - inst
    S_vocal = np.maximum(0, S_mix - S_inst)
    # y_vocal = istft(S_vocal*phase, hop_length=C.H, win_length=C.FFT_SIZE)
    # write_wav(os.path.join("Audiocheck", fname+".wav"), y_vocal, C.SR)
    norm = S_mix.max()
    S_mix /= norm
    S_inst /= norm
    S_vocal /= norm
    np.savez(os.path.join(C.PATH_FFT, fname + ".npz"),
             vocal=S_vocal, mix=S_mix, inst=S_inst)

def phase_MISI(inst_esti, vocal_esti, mix):
    """One MISI step: distribute the mixture residual equally between the
    two source estimates, then return their updated unit-magnitude phases."""
    delta = mix - (inst_esti + vocal_esti)
    inst = inst_esti + delta / 2
    vocal = vocal_esti + delta / 2
    S_inst = stft(inst, n_fft=C.FFT_SIZE, hop_length=C.H,
                  win_length=C.FFT_SIZE)
    S_vocal = stft(vocal, n_fft=C.FFT_SIZE, hop_length=C.H,
                   win_length=C.FFT_SIZE)
    P_inst = np.exp(1.j * np.angle(S_inst))
    P_vocal = np.exp(1.j * np.angle(S_vocal))
    return P_inst, P_vocal

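# Hedged sketch of how phase_MISI might be driven in a loop (a standard MISI
# iteration), assuming the same C.FFT_SIZE / C.H config as above. M_inst and
# M_vocal are estimated magnitude spectrograms and mix is the mixture
# waveform; all of these names are hypothetical.
from librosa.core import istft

def misi_refine(M_inst, M_vocal, P_inst, P_vocal, mix, n_iter=10):
    # Alternate between resynthesis and the mixture-consistency phase update;
    # mix is trimmed to the resynthesis length before each update.
    for _ in range(n_iter):
        inst = istft(M_inst * P_inst, hop_length=C.H, win_length=C.FFT_SIZE)
        vocal = istft(M_vocal * P_vocal, hop_length=C.H, win_length=C.FFT_SIZE)
        P_inst, P_vocal = phase_MISI(inst, vocal, mix[:len(inst)])
    return inst, vocal
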
def compute_features(data_loc='../data/genres/'):
    file_names = glob.glob(data_loc + '*/*.au')
    file_names.sort()
    assert len(file_names) == 1000, \
        "ERROR: Couldn't read files properly. Is your data_loc correct?"
    # Setup some vars
    sampleRate = 22050
    n_fft = 1024
    X = []
    genres_list = list(song_labels_dic.keys())
    genres_list.sort()
    genre_flag = 0
    if not os.path.exists('../ckpt'):
        os.makedirs('../ckpt')
    for file in file_names:
        song, _ = lc.load(file)  # loads at librosa's default 22050 Hz
        song_dft = np.abs(lc.stft(song, n_fft=n_fft))
        X.append(song_dft)
        # Flush every 100 files (one genre's worth) to its own pickle
        if len(X) == 100:
            print('Writing: ' + genres_list[genre_flag] + '.pkl file...')
            with open('../ckpt/' + genres_list[genre_flag] + '.pkl', 'wb') as f:
                pickle.dump(X, f)
            X = []
            genre_flag = genre_flag + 1
    return True

def psd(audio, preprocess=False):
    # Drop the Nyquist bin so the magnitude has 256 frequency rows
    audioSTFT = stft(audio, n_fft=512, hop_length=128, win_length=512)[:-1, :]
    Mag, Phase = np.abs(audioSTFT), np.angle(audioSTFT)
    # Pad the frame axis up to a multiple of 256 with low-level noise
    nframes = int(256 * np.ceil(np.shape(Mag)[1] / 256))
    pad_size = nframes - np.shape(Mag)[1]
    variance = (np.mean(Mag[:10]) if np.mean(Mag[:10]) < 0.01 else 0.01)
    pad_seq = variance * np.random.randn(256, pad_size)
    Mag = np.hstack((Mag, pad_seq))
    Phase = np.hstack((Phase, 0.0 * pad_seq))
    if preprocess:
        Mag_smooth = mag2dB(norm(optimal_smoothing(Mag)))
        Mag_smooth[Mag_smooth < -120] = -120
        minmax_smooth = [np.min(Mag_smooth), np.max(Mag_smooth)]
        Mag_smooth_norm = np.interp(Mag_smooth, minmax_smooth, [-1, 1])
    # Convert to dB, floor at -120 dB, and rescale to [-1, 1]
    Mag = mag2dB(norm(Mag))
    Mag[Mag < -120] = -120
    minmax = [np.min(Mag), np.max(Mag)]
    Mag_norm = np.interp(Mag, minmax, [-1, 1])
    psd = {}
    if preprocess:
        psd['MagdB_smooth'] = Mag_smooth_norm
        psd['Norm_smooth'] = minmax_smooth
    psd['MagdB'] = Mag_norm
    psd['Phase'] = Phase
    psd['Norm'] = minmax
    return psd

def librosaSpec(data):
    from librosa.core import resample, stft
    from librosa import amplitude_to_db, magphase
    spectrum = stft(data)
    mag, ph = magphase(spectrum)
    # magphase returns phase as unit complex numbers; np.angle recovers radians
    return amplitude_to_db(mag), np.angle(ph)

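# An approximate inverse for librosaSpec (a sketch, not part of the original
# module): db_to_amplitude undoes amplitude_to_db only up to its top_db
# clipping of very quiet bins, so the round trip is lossy.
import numpy as np
from librosa import db_to_amplitude
from librosa.core import istft

def librosaInvSpec(mag_db, phase_angle, length=None):
    # Rebuild the complex STFT from dB magnitude and phase angle, then invert
    spectrum = db_to_amplitude(mag_db) * np.exp(1j * phase_angle)
    return istft(spectrum, length=length)
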
def audio_to_array(audio):
    # Extract audio data and sampling rate from file
    data, fs = sf.read(audio)
    # Convert to a wav file at the correct sampling rate
    sf.write(audio, data, fs)
    # Read the audio sample
    audio = read(audio)
    # [removed]
    # y, sr = load(audio, offset=30, duration=5)
    # audio_arr = mfcc(y=y, sr=sr)
    # Convert the audio to an array
    audio_arr = np.array(audio[1], dtype=float)
    # Normalize
    audio_arr = normalize(audio_arr, np.inf, 0)
    # Short-time Fourier transform
    audio_arr = np.abs(stft(audio_arr))
    # [removed]
    # Mel-frequency cepstral coefficients (MFCCs)
    # audio_arr = np.abs(mfcc(audio_arr))
    # audio_arr = mfcc(audio_arr, sr=44100)
    # Reduce the number of dimensions
    pca = PCA(n_components=5)
    audio_arr = pca.fit_transform(audio_arr)
    return audio_arr

def find_peaks(y, size):
    sgram = np.abs(stft(y, n_fft=512, hop_length=256))
    # sgram = np.log(np.maximum(sgram, np.max(sgram) / 1e6))
    # sgram = sgram - np.mean(sgram)
    # A point is a peak if it equals the local maximum and clears a threshold
    sgram_max = ndi.maximum_filter(sgram, size=size, mode="constant")
    maxima = (sgram == sgram_max) & (sgram > 0.2)
    return maxima

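# Usage sketch for find_peaks ('track.wav' is a hypothetical file, and the
# function's own dependencies, stft and ndi, must already be in scope). The
# mask has shape (257, n_frames) for n_fft=512; nonzero() yields landmark
# (frequency_bin, frame) coordinates.
import numpy as np
from librosa.core import load

y, sr = load('track.wav', sr=None)
maxima = find_peaks(y, size=10)
peak_freqs, peak_frames = np.nonzero(maxima)
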
def pncc(audio_wave, n_fft=1024, sr=16000, window="hamming", n_mels=40,
         n_pncc=13, weight_N=4, power=2, dct=True):
    pre_emphasis_signal = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    stft_pre_emphasis_signal = np.abs(
        stft(pre_emphasis_signal, n_fft=n_fft, window=window))**power
    mel_filter = np.abs(filters.mel(sr, n_fft=n_fft, n_mels=n_mels))**power
    power_stft_pre_signal = np.dot(stft_pre_emphasis_signal.T, mel_filter.T)
    q_ = medium_time_power_calculation(power_stft_pre_signal)
    q_le = asymmetric_lawpass_filtering(q_, 0.999, 0.5)
    pre_q_0 = q_ - q_le
    q_0 = halfwave_rectification(pre_q_0)
    q_f = asymmetric_lawpass_filtering(q_0)
    q_th = temporal_masking(q_0)
    r_sp = after_temporal_masking(q_th, q_f)
    r_ = switch_excitation_or_non_excitation(r_sp=r_sp, q_f=q_f, q_le=q_le,
                                             q_power_stft_pre_signal=q_)
    s_ = weight_smoothing(r_=r_, q_=q_, N=weight_N)
    t_ = time_frequency_normalization(p_=power_stft_pre_signal, s_=s_)
    u_ = mean_power_normalization(t_, r_)
    v_ = power_function_nonlinearity(u_)
    dct_v = np.dot(filters.dct(n_pncc, v_.shape[1]), v_.T)
    if dct:
        return dct_v.T
    else:
        return v_.T

def time_stretch_hpss(audio, f):
    if f == 1.0:
        return audio
    stft = core.stft(audio)
    # Perform HPSS
    stft_harm, stft_perc = decompose.hpss(stft, kernel_size=31)  # original kernel size 31
    # OLA the percussive part
    y_perc = librosa.util.fix_length(core.istft(stft_perc, dtype=audio.dtype),
                                     len(audio))
    y_perc = time_stretch_sola(y_perc, f)
    #~ # Phase-vocode the harmonic part
    #~ stft_stretch = core.phase_vocoder(stft_harm, 1.0/f)
    #~ # Inverse STFT of harmonic
    #~ y_harm = librosa.util.fix_length(core.istft(stft_stretch, dtype=y_perc.dtype), len(y_perc))
    # This plain OLA copy was immediately overwritten below, so it is disabled:
    # y_harm = librosa.util.fix_length(core.istft(stft_harm, dtype=audio.dtype), len(audio))
    # Stretch the harmonic part with WSOLA instead of the phase vocoder
    y_harm = librosa.util.fix_length(
        time_stretch_sola(core.istft(stft_harm, dtype=audio.dtype), f, wsola=True),
        len(y_perc))
    # Add them together
    return y_harm + y_perc

def LoadAudio(fname):
    y, sr = load(fname, sr=C.SR)
    spec = stft(y, n_fft=C.FFT_SIZE, hop_length=C.H, win_length=C.FFT_SIZE)
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase

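# A plausible counterpart for writing a (mask-processed) spectrogram back to
# disk: a sketch under the same C config, not necessarily the project's own
# save routine. librosa.output.write_wav matches the pre-0.8 librosa used
# elsewhere in these snippets.
from librosa.core import istft
from librosa.output import write_wav

def SaveAudio(fname, mag, phase):
    # Recombine magnitude with the unit-modulus phase, invert, and write
    y = istft(mag * phase, hop_length=C.H, win_length=C.FFT_SIZE)
    write_wav(fname, y, C.SR, norm=True)
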
def __call__(self, data):
    s = stft(data, n_fft=self.n_fft, win_length=self.win_length,
             hop_length=self.hop_length, window=self.window)
    # Return the magnitude when is_abs is set, otherwise the complex STFT
    return np.abs(s) if self.is_abs else s

def load_audio(fname):
    y = load(fname, sr=16000)[0]
    spec = stft(y, n_fft=1024, hop_length=512, win_length=1024)
    # Zero-pad the frame axis up to the next multiple of 1024
    spec = np.pad(spec, [(0, 0), (0, 1024 - spec.shape[1] % 1024)], 'constant')
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase, y.shape[0]

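# Usage sketch: the third return value (the original sample count) lets a
# caller trim the frame padding off after resynthesis ('input.wav' is a
# hypothetical path; the scale lost to magnitude normalization is not restored).
from librosa.core import istft

mag, phase, n_samples = load_audio('input.wav')
y = istft(mag * phase, hop_length=512, win_length=1024, length=n_samples)
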
def audio_to_spectrogram(input_signal, n_fft, hop_length, win_length,
                         window='hann', center=True):
    return audio.stft(np.asarray(input_signal), n_fft, hop_length,
                      win_length, window, center)

def separate(PATH_INPUT, PATH_OUTPUT, MODEL, SR=16000, FFT_SIZE=1024, H=512):
    if os.path.isdir(PATH_INPUT):
        # If the input is a directory, build a file list
        filelist_mixdown = find_files(PATH_INPUT, ext="wav", case_sensitive=True)
    else:
        # The input is a single file
        filelist_mixdown = [PATH_INPUT]
    print('number of mixdown file', len(filelist_mixdown))

    # Create the output directory if it does not exist
    _, path_output_ext = os.path.splitext(PATH_OUTPUT)
    print('path_output_ext', path_output_ext)
    if len(path_output_ext) == 0 and not os.path.exists(PATH_OUTPUT):
        os.mkdir(PATH_OUTPUT)

    # Load the model
    unet = train.UNet()
    chainer.serializers.load_npz(MODEL, unet)
    config.train = False
    config.enable_backprop = False

    # Load each mixture and try to separate the vocal (speech)
    for fmixdown in filelist_mixdown:
        # If audioread raises an error, fall back to scipy
        try:
            y_mixdown, _ = load(fmixdown, sr=SR, mono=True)
        except:
            sr_mixdown, y_mixdown = read(fmixdown)
            if not sr_mixdown == SR:
                y_mixdown = resample(y_mixdown, sr_mixdown, SR)

        # Compute the short-time spectrum of the input and normalize it
        spec = stft(y_mixdown, n_fft=FFT_SIZE, hop_length=H, win_length=FFT_SIZE)
        mag = np.abs(spec)
        mag /= np.max(mag)
        phase = np.exp(1.j * np.angle(spec))
        print('mag.shape', mag.shape)
        start = 0
        # Must not exceed the number of input frames; choose a value that
        # fits the network definition.
        end = 128 * (mag.shape[1] // 128)

        # Estimate the mask that separates the speech (vocal)
        mask = unet(mag[:, start:end][np.newaxis, np.newaxis, 1:, :]).data[0, 0, :, :]
        mask = np.vstack((np.zeros(mask.shape[1], dtype="float32"), mask))

        # Apply the mask to the input spectrum and resynthesize with the inverse FFT
        mag2 = mag[:, start:end] * mask
        phase2 = phase[:, start:end]
        y = istft(mag2 * phase2, hop_length=H, win_length=FFT_SIZE)

        # Save the separated speech (vocal) to the output file
        if len(path_output_ext) == 0:
            # Output into the directory
            foutname, _ = os.path.splitext(os.path.basename(fmixdown))
            fname = os.path.join(PATH_OUTPUT, (foutname + '.wav'))
        else:
            # Output to the specified file
            fname = PATH_OUTPUT
        print('saving... ', fname)
        write_wav(fname, y, SR, norm=True)

def pncc(audio_wave, n_fft=512, sr=16000, winlen=0.020, winstep=0.010,
         n_mels=128, n_pncc=13, weight_N=4, power=2):
    pre_emphasis_signal = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    mono_wave = to_mono(pre_emphasis_signal.T)
    stft_pre_emphasis_signal = np.abs(
        stft(mono_wave, n_fft=n_fft, hop_length=int(sr * winstep),
             win_length=int(sr * winlen), window=np.ones(int(sr * winlen)),
             center=False))**power
    mel_filter = np.abs(filters.mel(sr, n_fft=n_fft, n_mels=n_mels))**power
    power_stft_signal = np.dot(stft_pre_emphasis_signal.T, mel_filter.T)
    medium_time_power = medium_time_power_calculation(power_stft_signal)
    lower_envelope = asymmetric_lawpass_filtering(medium_time_power, 0.999, 0.5)
    subtracted_lower_envelope = medium_time_power - lower_envelope
    rectified_signal = halfwave_rectification(subtracted_lower_envelope)
    floor_level = asymmetric_lawpass_filtering(rectified_signal)
    temporal_masked_signal = temporal_masking(rectified_signal)
    final_output = switch_excitation_or_non_excitation(
        temporal_masked_signal, floor_level, lower_envelope, medium_time_power)
    spectral_weight_smoothing = weight_smoothing(
        final_output, medium_time_power, L=n_mels)
    transfer_function = time_frequency_normalization(
        power_stft_signal, spectral_weight_smoothing)
    normalized_power = mean_power_normalization(
        transfer_function, final_output, L=n_mels)
    power_law_nonlinearity = power_function_nonlinearity(normalized_power)
    dct = np.dot(power_law_nonlinearity,
                 filters.dct(n_pncc, power_law_nonlinearity.shape[1]).T)
    return dct

def find_peaks(data):
    sgram = np.abs(stft(data, n_fft=512, window='hamming'))
    # Dilate a cross-shaped structuring element to define the neighborhood
    neighborhood = sp.ndimage.morphology.iterate_structure(
        sp.ndimage.morphology.generate_binary_structure(2, 1), 8)
    sgram_max = sp.ndimage.maximum_filter(sgram, footprint=neighborhood,
                                          mode='constant')
    # => (peaks_freq, peaks_time)
    return np.asarray((sgram == sgram_max) & (sgram > 0.2)).nonzero()

def to_stft(seq, nfft):
    """
    :param seq: Raw audio
    :param nfft: parameter of STFT
    :return: STFT of the input seq, broken down into magnitude in one channel
        and phase in the other.
    """
    nfft_padlen = int(len(seq) + nfft / 2)
    stft = lc.stft(fix_length(seq, nfft_padlen), n_fft=nfft)
    return np.array([np.abs(stft), np.angle(stft)]).transpose(1, 2, 0)

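# A matching inverse for to_stft under the same packing convention
# (channel 0 magnitude, channel 1 phase); `lc` is librosa as above. A sketch,
# not part of the original module; the default hop matches lc.stft's default.
import numpy as np
import librosa as lc

def from_stft(packed, hop_length=None):
    # packed has shape (freq, frames, 2)
    mag, phase = packed[..., 0], packed[..., 1]
    return lc.istft(mag * np.exp(1j * phase), hop_length=hop_length)
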
def __call__(self, x):
    if isinstance(x, torch.Tensor):
        x = x.numpy()
    X = stft(x, n_fft=self.n_fft)
    X_mag = np.abs(X)[:, :, None]
    if self.logpower:
        X_mag = np.log(X_mag ** 2)
    X_pha = np.angle(X)[:, :, None]
    # Stack magnitude (possibly log-power) and phase along the channel axis
    return np.concatenate((X_mag, X_pha), axis=2)

def __magphase(y, n_fft, hop_length, win_length):
    spec = stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length,
                window=C.WINDOW)
    mag = np.abs(spec).astype(np.float32)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase

def GetMag(sig, rate, winlen, winstep, NFFT, fuc_name='Rect'):
    """Compute the spectrogram of the input audio."""
    mag = stft(np.asfortranarray(sig), n_fft=NFFT,
               hop_length=int(winstep * rate), win_length=int(winlen * rate),
               window=fuc_name)
    # By convention the spectrum is displayed along the y-axis, hence the rotation
    return mag

def make_mag_spec(filelist, args):
    batch_length = args.batch_length
    for filename in filelist:
        basename = os.path.splitext(os.path.basename(filename))[0]
        # Load wav (two time-aligned channels: vocal and mix)
        wav = load(filename, args.fs, mono=False)[0]
        vocal_wav = wav[0].copy()
        mix_wav = wav[1].copy()
        # Make magnitude spectrograms (the DC bin is dropped below)
        vocal_spec = stft(vocal_wav, args.frame_size, args.shift_size)
        mix_spec = stft(mix_wav, args.frame_size, args.shift_size)
        spec = np.stack((vocal_spec, mix_spec))
        mag_spec = np.abs(spec[:, 1:, :]).copy()
        # Split the frame axis into fixed-length segments and save each
        for seg in range(mag_spec.shape[-1] // args.batch_length):
            seg_filename = basename + '_seg{}.npy'.format(seg)
            seg_mag_spec = \
                mag_spec[..., seg * batch_length:(seg + 1) * batch_length]
            np.save(os.path.join(args.dst_dir, seg_filename), seg_mag_spec)

def process(self, data):
    """
    Returns a 3-d matrix of size [257, 301, 2]: real part in channel 0,
    imaginary part in channel 1.

    :param data:
    :return:
    """
    spectr = stft(data, n_fft=512, hop_length=160)
    return np.concatenate(
        (spectr.real[:, :, np.newaxis], spectr.imag[:, :, np.newaxis]),
        axis=2)

def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100):
    y_mix = resample(y_mix, original_sr, C.SR)
    y_vocal = resample(y_vocal, original_sr, C.SR)
    y_inst = resample(y_inst, original_sr, C.SR)
    S_mix = np.abs(
        stft(y_mix, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    S_vocal = np.abs(
        stft(y_vocal, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    S_inst = np.abs(
        stft(y_inst, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    # Normalize all three spectrograms by the mixture's peak
    norm = S_mix.max()
    S_mix /= norm
    S_vocal /= norm
    S_inst /= norm
    np.savez(os.path.join(C.PATH_FFT, fname + ".npz"),
             mix=S_mix, vocal=S_vocal, inst=S_inst)

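# Usage sketch for SaveSpectrogram: the stem paths are hypothetical; the
# three waveforms are assumed time-aligned and of equal length, and
# C.PATH_FFT must exist before saving.
from librosa.core import load

y_mix, sr = load('mix.wav', sr=44100)
y_vocal, _ = load('vocal.wav', sr=44100)
y_inst, _ = load('inst.wav', sr=44100)
SaveSpectrogram(y_mix, y_vocal, y_inst, 'track001', original_sr=sr)
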
# -*- coding: utf-8 -*-
"""
Created on Sat May  7 13:51:42 2016

@author: parallels
"""
import numpy as np
from librosa.core import load, stft
import matplotlib.pyplot as plt
from librosa.display import specshow
import functions
# from scipy.spatial.distance import euclidean

y, sr = load("wiwym.wav")
rec, sr = load("recording.wav")
y = y[:sr * 30]
spec = np.abs(stft(y, n_fft=4960, hop_length=512))
query = np.abs(stft(rec, n_fft=4960, hop_length=512))
# NOTE: find_peak and overlap are not defined in this script; they are
# presumably provided by the local `functions` module imported above.
maximum_spec = find_peak(spec, 30)
maximum_query = find_peak(query, 30)
plt.plot(overlap)

def plotStructure(fullpath, order=1, sr=4, cutoff=.1, n_singv=3,
                  window=8, step_size=2, feature='chroma', dim_red='SVD',
                  as_diff=0, round_to=0, normalize=1, scale=1, medfil_len=0):
    print('Analyzing {}'.format(fullpath))

    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)
    feats = {}
    feats[feature], beat_times = extractFeature(
        filename, file_ext, feature, scale, round_to, normalize)

    # apply low-pass filter and running mean on featgram
    feats['LPF'] = lpf(feats[feature], cutoff, sr, order)

    # perform dimensionality reduction (NMF or SVD)
    if dim_red == 'NMF':
        print('\tNon-Negative Matrix Factorization for {}'.format(feature))
        feats['NMF'] = NMF(n_singv).fit_transform(feats[feature].astype(float))
        feats['NMF(LPF)'] = NMF(n_singv).fit_transform(feats['LPF'])
        feats['LPF(NMF)'] = lpf(feats['NMF'], cutoff, sr, order)
    elif dim_red == 'SVD':  # the original tested 'NMF' twice, shadowing this branch
        print('\tSingular Vector Decomposition')
        feats['SVD'] = svd(feats[feature], n_singv, inc_proj=False)
        feats['SVD(LPF)'] = svd(feats['LPF'], n_singv, inc_proj=False)
        feats['LPF(SVD)'] = lpf(feats['SVD'], cutoff, sr, order)
    else:
        raise Exception(
            "{} is not a supported dimensionality reduction".format(dim_red))

    if round_to:
        feats['LPF'] = np.round(
            lpf(feats[feature], cutoff, sr, order) / round_to) * round_to
        feats[dim_red] = np.round(
            dim_red_fn(dim_red, feats[feature], n_singv) / round_to) * round_to
        feats['{}(LPF)'.format(dim_red)] = np.round(
            dim_red_fn(dim_red, feats['LPF'], n_singv) / round_to) * round_to
        feats['LPF({})'.format(dim_red)] = np.round(
            lpf(feats[dim_red], cutoff, sr, order) / round_to) * round_to
    else:
        feats['LPF'] = lpf(feats[feature], cutoff, sr, order)
        feats[dim_red] = dim_red_fn(dim_red, feats[feature], n_singv)
        feats['{}(LPF)'.format(dim_red)] = dim_red_fn(
            dim_red, feats['LPF'], n_singv)
        feats['LPF({})'.format(dim_red)] = lpf(feats[dim_red], cutoff, sr, order)

    # FFT on all features (iterate over a snapshot since new keys are added)
    n_fft = 8
    hop_length = 1
    for k, v in list(feats.items()):
        data = np.array([stft(f, n_fft, hop_length)[1:, :] for f in v.T])
        data = data.T
        data = data.reshape(data.shape[0], data.shape[1] * data.shape[2])
        feats['FFT({})'.format(k)] = np.abs(data) ** 2

    def compute_distance(i, X, window, step_size):
        return np.sqrt(np.sum(
            (X[i:i + window] - X[i + step_size:i + step_size + window]) ** 2))

    distances = {}
    for k, v in feats.items():
        distances[k] = np.array([
            compute_distance(i, X=v, window=window, step_size=step_size)
            for i in range(0, len(v) + 1 - window * 2)])

    if as_diff:
        print("\tComputing features as difference")
        for k, v in feats.items():
            feats[k] = np.append([0], np.diff(v))

    if medfil_len:
        print("\tApplying median filter {} to distances".format(medfil_len))
        for k, v in distances.items():
            distances[k] = medfilt(v, medfil_len)

    i = 0
    j = 0
    gs = mpl.gridspec.GridSpec(len(feats), 2, width_ratios=[1, 1])
    fig = plt.figure(figsize=(36, 18))
    for k in feats.keys():
        ts = np.arange(0, len(feats[k]))
        step_size = max(4, int(len(ts) * .02))
        data = feats[k]
        if data.shape[1] == 3:
            data = data.reshape(1, data.shape[0], data.shape[1])
        else:
            data = data.T
        if 'FFT' in k:
            step = hop_length * 2
        else:
            step = step_size
        ax = fig.add_subplot(gs[i, j])
        ax.set_title(k)
        ax.imshow(data, interpolation='nearest', origin='lower', aspect='auto',
                  cmap=plt.cm.Oranges)
        ax.set_xticks(ts[::step])
        ax.set_xticklabels(beat_times[::step], rotation=60)
        ax.grid(False)
        ax = fig.add_subplot(gs[i + 1, j], sharex=ax)
        ax.set_title('{} Distances'.format(k))
        ax.plot(distances[k])
        ax.set_xticks(ts[::step])
        ax.set_xticklabels(beat_times[::step], rotation=60)
        ax.grid(False)
        # Advance to the next pair of grid rows after filling both columns
        if j == 1:
            i += 2
        j = (j + 1) % 2
    plt.tight_layout()
    plt.savefig("{}_{}_{}_asdiff_{}_wab_{}_r_{}_n_{}_s_{}_{}.png".format(
        filename, feature, cutoff, as_diff, window, round_to, normalize,
        scale, dim_red))
    plt.close(fig)

#!/usr/bin/env python
from librosa.core import stft, istft
import numpy as np
import scipy.signal

y = np.random.rand(44032)
stft_matrix = stft(y, window=scipy.signal.hann(2048), hop_length=1024)
# Note: the synthesis window (all ones) differs from the hann analysis
# window, so the round trip is not an identity and the error is nonzero.
y_hat = istft(stft_matrix, window=np.ones(2048), hop_length=1024)
diff = y - y_hat
print(np.dot(diff, diff))