def tst_sig_chunks():
    """Print how a short random signal is chunked under four option sets.

    A (1, 13) array of normally distributed samples is generated and printed,
    then ``utils_sig.cut_sig_into_chunks`` is called with a chunk size of 8
    for each of the cases below. The shape and content of every result are
    printed; nothing is returned or asserted.

    Cases, in order:
        0: overlap_size=4, pad_zeros=False
        1: overlap_size=4
        2: no optional arguments
        3: pad_zeros=False
    """
    sig_len = 13
    chunk_size = 8
    olap = 4

    sig = numpy.random.randn(1, sig_len)
    print(sig.shape)
    print(sig)

    # One keyword set per case; the index in this tuple is the case number.
    case_kwargs = (
        {'overlap_size': olap, 'pad_zeros': False},
        {'overlap_size': olap},
        {},
        {'pad_zeros': False},
    )
    for kwargs in case_kwargs:
        res = utils_sig.cut_sig_into_chunks(sig, chunk_size, **kwargs)
        print(res.shape)
        print(res)
def run_main():
    """Dump a scipy spectrogram and the project's spectral envelopes of a wav file.

    The wav path is read from ``sys.argv[1]``. The samples are reshaped to a
    single column, padded with ``utils_sig.pad_to_multiple_of`` to a multiple
    of the 256-sample frame length and cut into frames. The frames go through
    ``utils_sp.get_spec_envelopes``; the same padded signal goes through
    ``sig.spectrogram`` (boxcar window, no overlap, 'spectrum' scaling,
    'magnitude' mode). Both results are written with ``.tofile`` so they can
    be compared offline.

    Output files:
        ./tmp/py_sgram.bin: transposed output of ``sig.spectrogram``.
        ./tmp/my_sgram.bin: output of ``utils_sp.get_spec_envelopes``.

    Raises:
        Exception: if no command-line argument is given or the file does not
            exist.
    """
    if len(sys.argv) <= 1:
        raise Exception("Need to specify input wav-file to process")

    wav_path = sys.argv[1]
    if not os.path.exists(wav_path):
        raise Exception(
            "Specified wavfile {0} does not seem to exist!".format(wav_path))
    print("Will process file {0}".format(wav_path))

    (rate, samples) = wav.read(wav_path)
    # NOTE: computed but not used further in this script.
    sampleperiod = 1.0 / rate
    samples = samples.reshape((-1, 1))

    frame_len = 256

    padded = utils_sig.pad_to_multiple_of(samples, frame_len, 0.0)
    frames = utils_sig.cut_sig_into_chunks(padded.T, frame_len)
    own_envelopes = utils_sp.get_spec_envelopes(frames)

    # Reference spectrogram: rectangular window and zero overlap, so the
    # scipy frames line up with the non-overlapping frames cut above.
    boxcar = sig.get_window('boxcar', frame_len)
    _, _, ref_sgram = sig.spectrogram(padded.squeeze(),
                                      fs=rate,
                                      window=boxcar,
                                      nperseg=frame_len,
                                      noverlap=0,
                                      nfft=frame_len,
                                      scaling='spectrum',
                                      mode='magnitude')
    ref_sgram = ref_sgram.T

    ref_sgram.tofile('./tmp/py_sgram.bin')
    own_envelopes.tofile('./tmp/my_sgram.bin')
def run_main_sgram_env():
    """Plot a wav signal together with a curve estimated from its filter-bank envelopes.

    The wav path is read from ``sys.argv[1]``. The samples are reshaped to a
    single column, padded with ``utils_sig.pad_to_multiple_of`` to a multiple
    of the 64-sample frame length and cut into frames. The frames are turned
    into spectral envelopes (``utils_sp.get_spec_envelopes``) and then into
    15 filter-bank curves (``utils_sp.get_mel_fb_curves``).
    ``estimate_sc_from_envelopes`` is run on those curves and its first
    output is plotted against the signal with ``utils_plot.plot_curves``.

    Raises:
        Exception: if no command-line argument is given or the file does not
            exist.
    """
    if len(sys.argv) <= 1:
        raise Exception("Need to specify input wav-file to process")

    wavname = sys.argv[1]
    if not os.path.exists(wavname):
        raise Exception(
            "Specified wavfile {0} does not seem to exist!".format(wavname))
    print("Will process file {0}".format(wavname))

    (samplerate, signal) = wav.read(wavname)
    sampleperiod = 1.0 / samplerate
    signal = signal.reshape((-1, 1))

    fft_size = 64
    nfilters = 15

    signal = utils_sig.pad_to_multiple_of(signal, fft_size, 0.0)
    sigchunks = utils_sig.cut_sig_into_chunks(signal.T, fft_size)
    spec_envs = utils_sp.get_spec_envelopes(sigchunks)
    fbank_envs = utils_sp.get_mel_fb_curves(spec_envs, samplerate, nfilters)

    # NOTE(review): reference filter bank from ``psf``; its result is not
    # used below. Kept so the call is still exercised - confirm whether it
    # can be dropped.
    timestep = float(fft_size) / float(samplerate)
    (fbank_envs_py, _) = psf.fbank(signal,
                                   samplerate=samplerate,
                                   winlen=timestep,
                                   winstep=timestep,
                                   nfilt=nfilters,
                                   nfft=fft_size,
                                   lowfreq=0,
                                   highfreq=None,
                                   preemph=0)

    # Build the time axis from integer sample indices. A float-step
    # numpy.arange(0, n * dt, dt) may produce n + 1 points because of
    # rounding, which would no longer match the signal length.
    sig_x = numpy.arange(signal.shape[0]) * sampleperiod

    dfb, dfb_x, _ = estimate_sc_from_envelopes(fbank_envs, samplerate,
                                               fft_size)

    utils_plot.plot_curves([signal, dfb], [sig_x, dfb_x])
def run_main():
    """Dump per-frame crest values and spectral flatness of a wav file.

    The wav path is read from ``sys.argv[1]``. Crest values come from
    ``utils_td.get_crest_from_chunks`` applied to 2048-sample frames of the
    padded signal. Flatness comes from ``utils_sp.calc_spec_gram_flatness``
    applied to the transposed ``sig.spectrogram`` of the unpadded signal
    (1024-sample boxcar frames, no overlap, FFT size rounded up to a power
    of two).

    Output files:
        ./tmp/crest_vals.bin: crest values.
        ./tmp/flatness_vals.bin: flatness values.

    Raises:
        Exception: if no command-line argument is given or the file does not
            exist.
    """
    if len(sys.argv) <= 1:
        raise Exception("Need to specify input wav-file to process")

    wav_path = sys.argv[1]
    if not os.path.exists(wav_path):
        raise Exception(
            "Specified wavfile {0} does not seem to exist!".format(wav_path))
    print("Will process file : {0}".format(wav_path))

    (rate, samples) = wav.read(wav_path)
    # NOTE: computed but not used further in this script.
    sampleperiod = 1.0 / rate
    samples = samples.reshape((-1, 1))

    crest_frame_size = 2048
    sflat_frame_size = 1024
    # Smallest power of two that is >= the flatness frame size.
    sflat_fft_size = int(2**numpy.ceil(numpy.log2(sflat_frame_size)))

    # --- crest values (time domain) ---
    padded = utils_sig.pad_to_multiple_of(samples, crest_frame_size, 0.0)
    crest_frames = utils_sig.cut_sig_into_chunks(padded.T, crest_frame_size)
    crest_vals = utils_td.get_crest_from_chunks(crest_frames)

    # --- spectral flatness (frequency domain, unpadded signal) ---
    boxcar = sig.get_window('boxcar', sflat_frame_size)
    _, _, sgram = sig.spectrogram(samples.squeeze(),
                                  fs=rate,
                                  window=boxcar,
                                  nperseg=sflat_frame_size,
                                  noverlap=0,
                                  nfft=sflat_fft_size,
                                  scaling='spectrum',
                                  mode='magnitude')
    flatness = utils_sp.calc_spec_gram_flatness(sgram.T)

    crest_vals.tofile('./tmp/crest_vals.bin')
    flatness.tofile('./tmp/flatness_vals.bin')
def run_main():
    """Dump the project's filter-bank curves and the ``psf.fbank`` output for a wav file.

    The wav path is read from ``sys.argv[1]``. The samples are reshaped to a
    single column, padded with ``utils_sig.pad_to_multiple_of`` to a multiple
    of the 256-sample frame length and cut into frames. The frames are turned
    into spectral envelopes and then into 15 filter-bank curves with
    ``utils_sp.get_mel_fb_curves``. The same padded signal is passed to
    ``psf.fbank`` with window length and step both equal to one frame. Shapes
    and dtypes of both results are printed before they are written out.

    Output files:
        ./tmp/my_fbank.bin: output of ``utils_sp.get_mel_fb_curves``.
        ./tmp/py_fbank.bin: first output of ``psf.fbank``.

    Raises:
        Exception: if no command-line argument is given or the file does not
            exist.
    """
    if len(sys.argv) <= 1:
        raise Exception("Need to specify input wav-file to process")

    wav_path = sys.argv[1]
    if not os.path.exists(wav_path):
        raise Exception(
            "Specified wavfile {0} does not seem to exist!".format(wav_path))
    print("Will process file {0}".format(wav_path))

    (rate, samples) = wav.read(wav_path)
    # NOTE: computed but not used further in this script.
    sampleperiod = 1.0 / rate
    samples = samples.reshape((-1, 1))

    frame_len = 256
    n_filters = 15

    padded = utils_sig.pad_to_multiple_of(samples, frame_len, 0.0)
    frames = utils_sig.cut_sig_into_chunks(padded.T, frame_len)
    envelopes = utils_sp.get_spec_envelopes(frames)
    own_fbank = utils_sp.get_mel_fb_curves(envelopes, rate, n_filters)

    # One frame expressed in seconds; used as both window length and step.
    frame_seconds = float(frame_len) / float(rate)
    (ref_fbank, _) = psf.fbank(padded,
                               samplerate=rate,
                               winlen=frame_seconds,
                               winstep=frame_seconds,
                               nfilt=n_filters,
                               nfft=frame_len,
                               lowfreq=0,
                               highfreq=None,
                               preemph=0)

    print(own_fbank.shape)
    print(ref_fbank.shape)
    print(own_fbank.dtype)
    print(ref_fbank.dtype)

    own_fbank.tofile('./tmp/my_fbank.bin')
    ref_fbank.tofile('./tmp/py_fbank.bin')
def run_emp_detect_type2(wavfile, config, silent=True):
    """Build a per-sample detection mask from peak-to-spectral-change, voicing and f0 extremes.

    NOTE: the original author marked this variant as not fully implemented.
    The raw mask is returned as is; no hysteresis/merge post-processing is
    applied to it.

    Steps:
        1. Read the wav file, remove its mean and divide by 2**15.
        2. Cut the signal into chunks and compute peak-to-peak values.
        3. Run ``run_world_by_reaper`` and load f0 and the spectral
           envelopes from the files it reports.
        4. Compute normalised f0 extreme areas, a spectral-change curve in
           the configured band, and a voiced mask (f0 > 0).
        5. Interpolate everything onto the sample time axis and combine.

    Args:
        wavfile: path of the wav file; passed to ``wav.read`` and to
            ``run_world_by_reaper``.
        config: mapping with the keys 'chunk_size_samples',
            'overlap_samples', 'wrk_path', 'reaper_path', 'world_path',
            'f0_time_step', 'f0_extr_thr', 'f0_extr_len',
            'spec_change_band_st', 'spec_change_band_end' and
            'peak2change_thr'.
        silent: currently unused; kept for interface compatibility.

    Returns:
        Tuple ``(result_mask, signal_time, dbg_stuff)`` where ``result_mask``
        is the product of the three boolean detections, ``signal_time`` is
        the per-sample time axis and ``dbg_stuff`` is a dict of the
        intermediate curves.
    """
    (samplerate, signal) = wav.read(wavfile)
    # Remove any DC offset first; some inputs are badly biased.
    signal = signal - numpy.mean(signal)
    sampleperiod = 1.0 / samplerate
    signal_time = numpy.arange(len(signal)) * sampleperiod
    signal = signal.reshape((-1, 1))
    # Scale by 2**15 (presumably 16-bit PCM input - TODO confirm).
    signal = signal / (2.0**15.0)

    # Standard deviation of the silence-stripped signal; only reported in
    # the debug dict as 'threshold_base'.
    signal_no_sil = utils_td.remove_silence(signal, 0.0001)
    std_no_sil = numpy.std(signal_no_sil)

    chunk_nsamples = int(config['chunk_size_samples'])
    olap_nsamples = int(config['overlap_samples'])
    # FFT size is the chunk size rounded up to a power of two; env_size is
    # the first dimension used to reshape the spectral-envelope file.
    fft_size = int(2**numpy.ceil(numpy.log2(chunk_nsamples)))
    env_size = int(fft_size / 2 + 1)

    sig_chunks = utils_sig.cut_sig_into_chunks(signal.T,
                                               chunk_nsamples,
                                               overlap_step=olap_nsamples,
                                               pad_zeros=True)
    sig_chunks_tstep = olap_nsamples / samplerate
    sig_chunks_time = numpy.arange(sig_chunks.shape[0]) * sig_chunks_tstep

    wrld_res = run_world_by_reaper(wavfile, config['wrk_path'],
                                   config['reaper_path'],
                                   config['world_path'])
    # numpy.fromfile is used with its default dtype here.
    sig_f0 = numpy.fromfile(wrld_res[0]).reshape((-1, 1))
    sig_sp = numpy.fromfile(wrld_res[1]).reshape((env_size, -1))
    sig_f0_time = numpy.arange(sig_f0.shape[0]) * config['f0_time_step']

    ### =================== PITCH EXTREMUMS
    f0_extr = utils_pitch.get_f0_extreme_areas(
        sig_f0, config['f0_extr_thr'],
        config['f0_extr_len'] / config['f0_time_step'])
    # Keep only the positive parts of both returned curves, then merge.
    f0_low = f0_extr[0] * (f0_extr[0] > 0).astype('int')
    f0_high = f0_extr[1] * (f0_extr[1] > 0).astype('int')
    f0_extr = utils_td.perform_mvn_norm((f0_low + f0_high), skip_zeros=True)

    ### =================== SPECTRAL CHANGE
    freq_step = samplerate / fft_size
    # Band limits come from the ``config`` argument (the same keys the
    # type-1 detector reads), not from a module-level global.
    band_idx = [
        int(numpy.round(config['spec_change_band_st'] / freq_step)),
        int(numpy.round(config['spec_change_band_end'] / freq_step))
    ]
    (sc_dfb, sc_time) = estimate_sc_from_envelopes(sig_sp.T,
                                                   samplerate,
                                                   0.005 * samplerate,
                                                   band=band_idx)

    ### =================== PEAK-to_PEAK
    p2p = utils_td.get_peak_to_peak_from_chunks(sig_chunks)

    ### =================== VOICED MASK
    voiced = (sig_f0.squeeze() > 0.0).astype('int')

    ### =================== PEAK-TO-CHANGE
    sample_axis = signal_time.squeeze()
    p2p_int = numpy.interp(sample_axis, sig_chunks_time.squeeze(),
                           p2p.squeeze())
    sc_dfb_int = numpy.interp(sample_axis, sc_time.squeeze(),
                              sc_dfb.squeeze())
    # Log-ratio of peak-to-peak to spectral change; the result is passed
    # through utils_sig.clean_undef_floats before thresholding.
    p2sc = numpy.log(p2p_int) - numpy.log(sc_dfb_int)
    p2sc = utils_sig.clean_undef_floats(p2sc)

    ### =================== FINALIZING RESULTS
    detect_vo = numpy.interp(sample_axis, sig_f0_time.squeeze(),
                             voiced.squeeze())
    detect_ex = numpy.interp(sample_axis, sig_f0_time.squeeze(),
                             f0_extr.squeeze())
    detect_p2sc = p2sc > config['peak2change_thr']

    result_mask = (detect_p2sc > 0) * (detect_vo > 0) * (detect_ex > 0)

    dbg_stuff = {
        'peak2schange_detect': detect_p2sc,
        'voiced_detect': detect_vo,
        'f0-extreme_detect': detect_ex,
        'threshold_base': std_no_sil,
        'spec_change': sc_dfb_int,
        'peak2peak': p2p_int,
        'peak2schange': p2sc
    }

    return (result_mask, signal_time, dbg_stuff)
def run_emp_detect_type1(wavfile, config, silent=True, reuse_data=False):
    """Build a per-sample detection mask and scan regions for one wav file.

    Four detections are interpolated onto the sample time axis and
    multiplied together:
        1. spectral change below a threshold (scaled by the signal std),
        2. peak-to-peak value above a threshold (scaled by the signal std),
        3. voiced frames (f0 > 0),
        4. normalised f0 extreme areas.
    The product is post-processed by ``update_detection_results``, used to
    place scan regions with ``position_scan_regions``, and finally masked
    with the voiced detection once more.

    Args:
        wavfile: path of the wav file to analyse.
        config: mapping with the keys 'chunk_size_samples',
            'overlap_samples', 'wrk_path', 'reaper_path', 'world_path',
            'f0_time_step', 'f0_extr_thr', 'f0_extr_len',
            'spec_change_band_st', 'spec_change_band_end',
            'spec_change_threshold', 'peak_to_peak_thr_std',
            'detect_hysteresis', 'detect_merge_threshold', 'detect_min_len',
            'detect_max_len', 'scan_region_len' and 'detect_scan_min_olap'.
        silent: currently unused; kept for interface compatibility.
        reuse_data: when true, look up earlier results with
            ``utils_world.try_search_previous_world_results`` instead of
            running ``utils_world.run_world_by_reaper``.

    Returns:
        Tuple ``(result_mask, signal_time, scan_segs, dbg_stuff)``.

    Raises:
        Exception: if any of the first three entries of the WORLD result is
            None.
    """
    (samplerate, signal) = wav.read(wavfile)
    # Remove any DC offset first; some inputs are badly biased.
    signal = signal - numpy.mean(signal)
    sampleperiod = 1.0 / samplerate
    signal_time = numpy.arange(len(signal)) * sampleperiod
    signal = signal.reshape((-1, 1))
    # Scale by 2**15 (presumably 16-bit PCM input - TODO confirm).
    signal = signal / (2.0**15.0)

    # Standard deviation of the silence-stripped signal; the spectral-change
    # and peak-to-peak thresholds below are multiples of it.
    signal_no_sil = utils_td.remove_silence(signal, 0.0001)
    std_no_sil = numpy.std(signal_no_sil)

    chunk_nsamples = int(config['chunk_size_samples'])
    olap_nsamples = int(config['overlap_samples'])
    # FFT size is the chunk size rounded up to a power of two; env_size is
    # the first dimension used to reshape the spectral-envelope file.
    fft_size = int(2**numpy.ceil(numpy.log2(chunk_nsamples)))
    env_size = int(fft_size / 2 + 1)

    sig_chunks = utils_sig.cut_sig_into_chunks(signal.T,
                                               chunk_nsamples,
                                               overlap_step=olap_nsamples,
                                               pad_zeros=True)
    sig_chunks_tstep = olap_nsamples / samplerate
    sig_chunks_time = numpy.arange(sig_chunks.shape[0]) * sig_chunks_tstep

    if reuse_data:
        print(
            "Trying to reuse previously-calculated data for file {0} in dir {1} ..."
            .format(wavfile, config['wrk_path']))
        wrld_res = utils_world.try_search_previous_world_results(
            wavfile, config['wrk_path'])
    else:
        wrld_res = utils_world.run_world_by_reaper(wavfile,
                                                   config['wrk_path'],
                                                   config['reaper_path'],
                                                   config['world_path'])

    if (wrld_res[0] is None or wrld_res[1] is None or wrld_res[2] is None):
        raise Exception(
            "WORLD analysis results are not available for file {0} "
            "(work dir: {1})".format(wavfile, config['wrk_path']))

    # numpy.fromfile is used with its default dtype here.
    sig_f0 = numpy.fromfile(wrld_res[0]).reshape((-1, 1))
    sig_sp = numpy.fromfile(wrld_res[1]).reshape((env_size, -1))
    sig_f0_time = numpy.arange(sig_f0.shape[0]) * config['f0_time_step']

    ### =================== PITCH EXTREMUMS
    f0_extr = utils_pitch.get_f0_extreme_areas(
        sig_f0, config['f0_extr_thr'],
        config['f0_extr_len'] / config['f0_time_step'])
    # Keep only the positive parts of both returned curves, then merge.
    f0_low = f0_extr[0] * (f0_extr[0] > 0).astype('int')
    f0_high = f0_extr[1] * (f0_extr[1] > 0).astype('int')
    f0_extr = utils_td.perform_mvn_norm((f0_low + f0_high), skip_zeros=True)

    ### =================== SPECTRAL CHANGE
    freq_step = samplerate / fft_size
    band_idx = [
        int(numpy.round(config['spec_change_band_st'] / freq_step)),
        int(numpy.round(config['spec_change_band_end'] / freq_step))
    ]
    (sc_dfb, sc_time) = utils_spec.estimate_sc_from_envelopes(
        sig_sp.T, samplerate, 0.005 * samplerate, band=band_idx)
    # Keep the spectral-change value only where its magnitude stays below
    # the threshold; everything else becomes zero.
    sc_keep = (numpy.abs(sc_dfb) <
               config['spec_change_threshold'] * std_no_sil).astype('float')
    sc_res = sc_dfb * sc_keep

    ### =================== PEAK-to_PEAK
    p2p = utils_td.get_peak_to_peak_from_chunks(sig_chunks)
    # Keep the peak-to-peak value only where it exceeds the threshold.
    p2p_det = (p2p > config['peak_to_peak_thr_std'] *
               std_no_sil).astype('float') * p2p

    ### =================== VOICED MASK
    voiced = (sig_f0.squeeze() > 0.0).astype('int')

    ### =================== FINALIZING RESULTS
    sample_axis = signal_time.squeeze()
    f0_axis = sig_f0_time.squeeze()
    chunk_axis = sig_chunks_time.squeeze()
    sc_axis = sc_time.squeeze()

    detect_sc = numpy.interp(sample_axis, sc_axis, sc_res.squeeze())
    detect_vo = numpy.interp(sample_axis, f0_axis, voiced.squeeze())
    detect_pp = numpy.interp(sample_axis, chunk_axis, p2p_det.squeeze())
    detect_ex = numpy.interp(sample_axis, f0_axis, f0_extr.squeeze())

    result_mask = ((detect_sc > 0) * (detect_vo > 0) * (detect_pp > 0) *
                   (detect_ex > 0))

    result_mask = update_detection_results(result_mask, samplerate,
                                           config['detect_hysteresis'],
                                           config['detect_merge_threshold'],
                                           config['detect_min_len'],
                                           config['detect_max_len'])

    # The scan-region length is forced to an odd number of samples.
    scan_seg_len = int(config['scan_region_len'] * samplerate)
    if scan_seg_len % 2 == 0:
        scan_seg_len += 1

    scan_segs = position_scan_regions(signal.squeeze(), result_mask,
                                      scan_seg_len,
                                      config['detect_scan_min_olap'])

    # One more time make sure unvoiced segments are not detected. This is
    # applied after the scan regions were positioned on the unmasked result.
    result_mask = result_mask * (detect_vo > 0)

    dbg_stuff = {
        'spec_change_detect': detect_sc,
        'voiced_detect': detect_vo,
        'peak2peak_detect': detect_pp,
        'f0-extreme_detect': detect_ex,
        'threshold_base': std_no_sil,
        'spec_change': numpy.interp(sample_axis, sc_axis, sc_dfb.squeeze()),
        'peak2peak': numpy.interp(sample_axis, chunk_axis, p2p.squeeze())
    }

    return (result_mask, signal_time, scan_segs, dbg_stuff)
def run_main():
    """Write a per-sample float32 mask derived from crest values and spectral flatness.

    Arguments come from ``parse_input_args()``; the attributes read are
    ``i`` (input wav), ``o`` (output file), ``crest_size`` and ``sflat_size``
    (frame sizes in milliseconds, converted to samples with the sample
    rate), ``crest_thr``, ``sflat_thr_down`` and ``sflat_thr_up``.

    Crest values (``utils_td.get_crest_from_chunks``) and flatness
    (``utils_sp.calc_spec_gram_flatness`` on a boxcar, zero-overlap
    spectrogram) are interpolated onto the sample time axis. A sample is set
    in the mask when its crest value is above ``crest_thr`` and its flatness
    lies within ``[sflat_thr_down, sflat_thr_up]``. The mask is shown with
    ``utils_plot.simple_plot`` and written to the output file as float32 via
    ``.tofile``.

    Raises:
        Exception: if the input wav file does not exist.
    """
    args = parse_input_args()

    wavname = args.i
    outname = args.o

    if not os.path.exists(wavname):
        raise Exception(
            "Specified wavfile {0} does not seem to exist!".format(wavname))

    print("Will process file    : {0}".format(wavname))
    print("Will write result to : {0}".format(outname))

    (samplerate, signal) = wav.read(wavname)
    sampleperiod = 1.0 / samplerate
    signal = signal.reshape((-1, 1))

    # Frame sizes are given in milliseconds on the command line.
    crest_frame_size = int(samplerate * float(args.crest_size) / 1000.0)
    sflat_frame_size = int(samplerate * float(args.sflat_size) / 1000.0)
    # Smallest power of two that is >= the flatness frame size.
    sflat_fft_size = int(2**numpy.ceil(numpy.log2(sflat_frame_size)))

    signal_for_crest = utils_sig.pad_to_multiple_of(signal, crest_frame_size,
                                                    0.0)
    crestchunks = utils_sig.cut_sig_into_chunks(signal_for_crest.T,
                                                crest_frame_size)
    crestfactor_vals = utils_td.get_crest_from_chunks(crestchunks)

    boxcar = sig.get_window('boxcar', sflat_frame_size)
    _, _, sgram = sig.spectrogram(signal.squeeze(),
                                  fs=samplerate,
                                  window=boxcar,
                                  nperseg=sflat_frame_size,
                                  noverlap=0,
                                  nfft=sflat_fft_size,
                                  scaling='spectrum',
                                  mode='magnitude')
    flatness = utils_sp.calc_spec_gram_flatness(sgram.T)

    crestperiod = crest_frame_size * sampleperiod
    sflatperiod = sflat_frame_size * sampleperiod

    # Time axes are built from integer indices so that each has exactly as
    # many points as the data it belongs to. numpy.arange with a float step
    # can return one element too many, and numpy.interp rejects xp/fp of
    # different lengths.
    sig_x = numpy.arange(len(signal)) * sampleperiod
    crest_x = numpy.arange(len(crestfactor_vals)) * crestperiod
    sflat_x = numpy.arange(len(flatness)) * sflatperiod

    crest_y = numpy.interp(sig_x, crest_x, crestfactor_vals.squeeze())
    sflat_y = numpy.interp(sig_x, sflat_x, flatness.squeeze())

    result_mask = (crest_y > args.crest_thr) * (
        sflat_y >= args.sflat_thr_down) * (sflat_y <= args.sflat_thr_up)

    # DBG
    utils_plot.simple_plot(result_mask, sig_x)

    result_mask = result_mask.astype('float32')
    result_mask.tofile(outname)