def wav2mgcf0(x, order=34, frame_window=512, zerofill_width=1024,
              shift_window=64, pass_const=0.4, min_pitch=20, max_pitch=500,
              mgcep_gamma=2):
    """Extract MGC coefficients, pitch and band voicing strengths from a waveform.

    The pitch track comes from ``pysptk.rapt``; the MGC coefficients come from
    the SPTK command-line pipeline ``frame | window | mgcep``, which must be
    available on ``PATH``.  The sampling rate is fixed at 16000 Hz.

    Args:
        x: 1-D array of audio samples (converted to float32, values unscaled).
        order: ``mgcep -m`` value; each MGC frame has ``order + 1`` values.
        frame_window: frame length passed to ``frame -l`` / ``window -l``.
        zerofill_width: output length of ``window -L`` and ``mgcep -l``.
        shift_window: frame shift in samples (``frame -p`` and RAPT hop size).
        pass_const: value passed to ``mgcep -a``.
        min_pitch: lower pitch bound handed to RAPT.
        max_pitch: upper pitch bound handed to RAPT.
        mgcep_gamma: value passed to ``mgcep -c``.

    Returns:
        Tuple ``(mgc, f0, voicing_str)``, all truncated at the first frame
        whose summed absolute voicing strength exceeds 10 (or untruncated when
        no such frame exists).
    """
    # Convert from int to float32, but keep numbers as integers
    x = x.astype(np.float32)

    # Compute pitch and voicing strength
    f0 = pysptk.rapt(x, 16000, shift_window, otype="pitch",
                     min=min_pitch, max=max_pitch)
    voicing_str = np.empty((f0.shape[0], 5), dtype=np.float32)
    for band in range(5):
        xf = lfilter(h_filters[band], 1, x)
        # NOTE(review): otype="mixed" with a 2-D result does not look like
        # upstream pysptk -- presumably a patched build; confirm.
        # The pitch bounds now follow min_pitch/max_pitch instead of the
        # hard-coded 20/500 (identical for the default arguments).
        voicing_str[:, band] = pysptk.rapt(
            xf.astype(np.float32), 16000, shift_window, otype="mixed",
            min=min_pitch, max=max_pitch)[:, 3]

    # Compute MGC coefficients
    mgc_cmd = ('frame -l {} -p {} | window -l {} -L {} | '
               'mgcep -m {} -a {} -c {} -l {} -e 0.0012').format(
                   frame_window, shift_window, frame_window, zerofill_width,
                   order, pass_const, mgcep_gamma, zerofill_width)
    # shell=True is required because the command is a shell pipeline.
    p = subprocess.Popen(mgc_cmd, stdout=subprocess.PIPE,
                         stdin=subprocess.PIPE, shell=True)
    stdout, _ = p.communicate(x.tobytes())
    # np.fromstring's binary mode is deprecated; frombuffer returns a
    # read-only view, so copy to keep the result writable as before.
    mgc = np.frombuffer(stdout, dtype=np.float32).reshape(
        (len(f0), order + 1)).copy()

    # Cut everything from the first frame with an implausibly large summed
    # voicing strength onwards.
    outliers = np.flatnonzero(np.abs(voicing_str).sum(axis=1) > 10)
    idx = outliers[0] if outliers.size else len(f0)

    return mgc[:idx], f0[:idx], voicing_str[:idx]
def test_rapt_regression():
    """Compare ``pysptk.rapt`` against reference output of the SPTK ``pitch`` tool."""
    # Ground truth data is generated by:
    #
    # $ wav2raw pysptk/example_audio_data/arctic_a0007.wav
    #
    # $ x2x +sf ./pysptk/example_audio_data/arctic_a0007.raw | \
    #   pitch -a 0 -s 16 -p 80 -L 60 -H 240 -o 0 > \
    #   arctic_a007_p16_L60_H240_o0_rapt.pitch
    #
    # $ dmp +f arctic_a007_p16_L60_H240_o0_rapt.pitch | awk '{print $2}' >\
    #   arctic_a007_p16_L60_H240_o0_rapt.txt
    #
    # $ pitch -h
    # ...
    #
    # SPTK: version 3.8
    # CVS Info: $Id: pitch.c,v 1.46 2014/12/11 08:30:43 uratec Exp $
    reference_file = join(dirname(__file__), "data",
                          "arctic_a007_p16_L60_H240_o0_rapt.txt")
    with open(reference_file) as fh:
        # One float per line.
        expected = np.asarray([float(row) for row in fh.readlines()])
    expected = expected.astype(np.float32)

    fs, x = wavfile.read(pysptk.util.example_audio_file())
    assert fs == 16000

    # Since SPTK might have memory corruption bug and the result might be
    # non-deterministic, run the comparison several times.
    for _ in range(5):
        estimated = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=80,
                                min=60, max=240, voice_bias=0.0, otype=0)
        assert np.allclose(expected, estimated)
def pysptk_featurize(audiofile):
    """Build a flat feature vector (and matching labels) from a wav file.

    Statistics (via ``stats``) are taken over the SWIPE F0 track, the RAPT F0
    track and the mel-generalized cepstrum; ``stats_labels`` extends the
    label list for each feature group in the same order.

    Returns:
        Tuple ``(features, labels)``.
    """
    fs, x = wavfile.read(audiofile)
    features = []
    labels = []

    swipe_track = pysptk.swipe(x.astype(np.float64), fs=fs, hopsize=80,
                               min=60, max=200, otype="f0")
    features = features + stats(swipe_track)
    labels = stats_labels('f0_swipe', labels)

    rapt_track = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=80,
                             min=60, max=200, otype="f0")
    features = features + stats(rapt_track)
    labels = stats_labels('f0_rapt', labels)

    # NOTE(review): `xw` is not assigned anywhere in this function; presumably
    # it is a module-level windowed frame of the signal -- confirm, otherwise
    # this line raises NameError.
    cepstrum = pysptk.mgcep(xw, 20, 0.0, 0.0)
    features = features + stats(cepstrum)
    labels = stats_labels('mel-spectrum envelope', labels)

    return features, labels
def test_rapt_regression():
    """Regression test: RAPT output must match the stored SPTK 3.8 reference."""
    # Ground truth data is generated by:
    #
    # $ wav2raw pysptk/example_audio_data/arctic_a0007.wav
    #
    # $ x2x +sf ./pysptk/example_audio_data/arctic_a0007.raw | \
    #   pitch -a 0 -s 16 -p 80 -L 60 -H 240 -o 0 > \
    #   arctic_a007_p16_L60_H240_o0_rapt.pitch
    #
    # $ dmp +f arctic_a007_p16_L60_H240_o0_rapt.pitch | awk '{print $2}' >\
    #   arctic_a007_p16_L60_H240_o0_rapt.txt
    #
    # $ pitch -h
    # ...
    #
    # SPTK: version 3.8
    # CVS Info: $Id: pitch.c,v 1.46 2014/12/11 08:30:43 uratec Exp $
    data_dir = join(dirname(__file__), "data")
    ground_truth_path = join(data_dir, "arctic_a007_p16_L60_H240_o0_rapt.txt")
    with open(ground_truth_path) as f:
        parsed = list(map(float, f.readlines()))
    ground_truth = np.asarray(parsed).astype(np.float32)

    fs, x = wavfile.read(pysptk.util.example_audio_file())
    assert fs == 16000

    rapt_kwargs = dict(fs=fs, hopsize=80, min=60, max=240,
                       voice_bias=0.0, otype=0)
    # Since SPTK might have memory corruption bug and the result might be
    # non-deterministic, repeat the check a few times on a fresh buffer.
    repeats = 5
    while repeats > 0:
        f0 = pysptk.rapt(x.astype(np.float32), **rapt_kwargs)
        assert np.allclose(ground_truth, f0)
        repeats -= 1
def f0gram(filename, hparams, index):
    """Return the RAPT F0 track of a slice of a resampled wav file.

    Args:
        filename: path to the wav file.
        hparams: object providing ``sample_rate`` (target sampling rate).
        index: pair ``(start, stop)`` of sample positions, measured in the
            resampled signal, delimiting the segment to analyse.

    Returns:
        1-D array of F0 values, one per 256-sample hop.
    """
    fs, x = wavfile.read(filename)
    if x.ndim > 1:
        # Down-mix multi-channel audio by averaging the channels.
        x = np.mean(x, axis=1)
    # orig_sr / target_sr / res_type are keyword-only in librosa >= 0.10;
    # passing them by keyword works on older releases as well.
    x = librosa.core.resample(x.astype(np.float32), orig_sr=fs,
                              target_sr=hparams.sample_rate,
                              res_type='kaiser_best')
    x = x[index[0]:index[1]]
    f0_rapt = pysptk.rapt(x, fs=hparams.sample_rate, hopsize=256,
                          min=10, max=7600, otype="f0")
    return f0_rapt
def wav2f0(self, wav_file, f0_dir):
    """Estimate F0 with RAPT and write it as raw float32 into ``f0_dir``.

    The output file is named after the part of the wav file's base name that
    precedes the first dot, with a ``.f0`` extension.  Hop size and pitch
    range come from ``self.hop_length``, ``self.pitch_floor`` and
    ``self.pitch_ceiling``.

    Returns:
        Path of the written ``.f0`` file.
    """
    sr, x = io.wavfile.read(wav_file)

    stem = os.path.basename(wav_file).split('.')[0]
    f0_file_path = os.path.join(f0_dir, stem + ".f0")

    contour = pysptk.rapt(
        x.astype(np.float32),
        fs=sr,
        hopsize=self.hop_length,
        min=self.pitch_floor,
        max=self.pitch_ceiling,
        otype="f0",
    )
    contour.astype(np.float32).tofile(f0_file_path)
    return f0_file_path
def process(filename):
    '''
    Decompose a wav file into log-F0, mel-cepstral coefficients and band
    aperiodicity, and write each as a float32 binary file.

    :param filename: path to wav file
    :return: None; writes ``<id>.lf0``, ``<id>.mgc`` and ``<id>.bap`` into
             ``lf0_dir``, ``mgc_dir`` and ``bap_dir`` respectively
    '''
    file_id = os.path.basename(filename).split(".")[0]
    print('\n' + file_id)

    ### WORLD ANALYSIS -- extract vocoder parameters ###
    fs, x = wavfile.read(filename)
    # Warning: this parameter is important.
    alpha = pysptk.util.mcepalpha(fs)
    hop = int(0.005 * fs)  # 5 ms hop, matching frame_period=5 below

    f0 = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=hop, min=60,
                     max=600, voice_bias=0.0, otype=1).astype(np.float64)

    # Scale to [-1, 1) assuming 16-bit PCM.
    x = x.astype(np.float64) / (2**15)
    # Only the time axis from harvest is used; F0 itself comes from RAPT.
    # NOTE(review): assumes RAPT and harvest yield the same number of
    # frames -- confirm.
    _, timeaxis = pyworld.harvest(x, fs, frame_period=5, f0_floor=60.0,
                                  f0_ceil=600)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Log-F0 as a column vector; frames with F0 == 0 get the -1e10 marker.
    f0 = f0[:, None]
    voiced = f0 != 0
    lf0 = np.full(f0.shape, -1.0E+10, dtype=np.float32)
    lf0[voiced] = np.log(f0[voiced])
    write_binfile(lf0, os.path.join(lf0_dir, file_id + '.lf0'),
                  dtype=np.float32)

    mc = pysptk.sp2mc(spectrogram, mcsize, alpha=alpha).astype(np.float32)
    write_binfile(mc, os.path.join(mgc_dir, file_id + '.mgc'),
                  dtype=np.float32)

    bap = pyworld.code_aperiodicity(aperiodicity, fs).astype(np.float32)
    write_binfile(bap, os.path.join(bap_dir, file_id + '.bap'),
                  dtype=np.float32)
def get_f0(wav, sr, fmin=60, fmax=400, spec_len=None):
    """Compute a normalized, frame-reduced F0 track with RAPT.

    Args:
        wav: 1-D waveform.  Floating-point input is treated as normalized and
            scaled by 32768 to the 16-bit range; integer input is used as is.
        sr: sampling rate in Hz.
        fmin: lower F0 bound passed to RAPT.
        fmax: upper F0 bound passed to RAPT; also the normalization maximum.
        spec_len: optional target number of frames.  The normalized track is
            zero-padded at the end, or truncated, to this length.

    Returns:
        Result of ``padding_reduction`` applied to the normalized F0 track
        (reduction factor ``args.r``).
    """
    wav = np.asarray(wav)
    if np.issubdtype(wav.dtype, np.floating):
        wav = wav * 32768.0
    # pysptk.rapt only accepts a C-contiguous float32 buffer.
    wav = np.ascontiguousarray(wav, dtype=np.float32)

    f0 = pysptk.rapt(wav, fs=sr, hopsize=args.hop_length, min=fmin, max=fmax,
                     otype='f0')
    f0norm = normalize(f0, xmin=0, xmax=fmax)

    if spec_len is not None and spec_len != f0.shape[0]:
        n_pad = spec_len - f0.shape[0]
        if n_pad > 0:
            f0norm = np.pad(f0norm, [0, n_pad])  # pad into spec length
        else:
            # np.pad rejects negative widths; trim instead.
            f0norm = f0norm[:spec_len]

    f0norm = padding_reduction(f0norm, r=args.r)
    return f0norm
def process_file(data, fs, window_len, log):
    """Compute a per-window pitch dispersion quotient (std / mean of voiced F0).

    Args:
        data: 1-D sample array.
        fs: sampling rate in Hz.
        window_len: analysis window length in milliseconds.
        log: callable receiving a single message string.

    Returns:
        Dict with the per-window quotients (``'values'``), the window length
        (``'frame_length'``) and their average (``'avg'``).
    """
    chunk_size = int(window_len / 1000 * fs)

    pdqs = []
    for start in range(0, len(data), chunk_size):
        window = data[start:start + chunk_size]
        f0 = pysptk.rapt(window.astype(np.float32),
                         fs=fs,
                         hopsize=HOP_SIZE,
                         min=60,
                         max=600,
                         otype="f0")
        voiced = f0[f0 > 0]

        # Windows without voiced frames (or with a zero mean) count as 0.
        pdq = 0
        if voiced.size != 0:
            spread = np.std(voiced)
            centre = np.average(voiced)
            if centre != 0:
                pdq = spread / centre
        pdqs.append(float(pdq))

    avg_pdq = np.average(pdqs)
    log(f"Avg pitch quotient: {avg_pdq}")

    result = {}
    result['values'] = pdqs
    result['frame_length'] = window_len  # ms
    result['avg'] = float(avg_pdq)
    return result
def extract_emphasis(self, index_chunk):
    """Return the median pitch and a weighted loudness score for one chunk.

    The chunk is read from ``self.chunks_path`` as ``chunk-NN.wav`` (the index
    is zero-padded to two digits) and must be sampled at 16 kHz.

    Args:
        index_chunk: integer index of the chunk.

    Returns:
        Tuple ``(pitch, dbs)``: ``pitch`` is the median of the non-zero RAPT
        values (0 when there are none); ``dbs`` is the log-RMS energy scaled
        by ``self.n_chunks - index_chunk`` (0 when it is NaN).
    """
    prefix = "chunk-0" if index_chunk <= 9 else "chunk-"
    wav_path = self.chunks_path + prefix + str(index_chunk) + ".wav"

    fs, x = wavfile.read(wav_path)
    assert fs == 16000

    # NOTE(review): otype="pitch" presumably yields pitch periods rather than
    # Hz -- confirm this is the intended unit.
    track = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=20, min=60,
                        otype="pitch")
    voiced = track[track != 0]
    pitch = float(np.median(voiced)) if voiced.size else 0
    if np.isnan(pitch):
        pitch = 0

    # Square in float64: squaring int16 samples directly overflows and
    # corrupts the mean energy.
    samples = x.astype(np.float64)
    rms = np.sqrt(np.mean(samples ** 2))
    # dbs = 20*np.log10(rms), then weighted towards earlier chunks
    dbs = 2000 * np.log10(rms) / 5 * (self.n_chunks - index_chunk)
    if np.isnan(dbs):
        dbs = 0

    return pitch, dbs
def __test(x, fs, hopsize, min, max, otype):
    """Assert that RAPT output is finite and, for ``otype == 1``, non-negative."""
    contour = pysptk.rapt(x, fs, hopsize, min=min, max=max, otype=otype)
    assert np.all(np.isfinite(contour))
    if otype != 1:
        return
    assert np.all(contour >= 0)
def feature_extraction(x, fs, feats_df, lp_ord, ID, label):
    """Extract frame-level features from signal ``x`` and append them to ``feats_df``.

    Features: MFCCs (z-scored) with delta and delta-delta, zero-crossing rate,
    formant frequencies, log-energy, F0 (RAPT), kurtosis, skewness and entropy.

    Features' reference (see Appendix):
    [1] https://link.springer.com/article/10.1007/s10439-013-0741-6
    [2] https://espace.library.uq.edu.au/data/UQ_344963/s41943203_phd_submission.pdf

    Args:
        x: 1-D signal.
        fs: sampling rate in Hz.
        feats_df: DataFrame to which the new rows are appended.
        lp_ord: accepted for interface compatibility; the LP order actually
            used is ``config.lp_ord``.
        ID: identifier stored in the ``Id`` column of every new row.
        label: value stored in the ``label`` column of every new row.

    Returns:
        A new DataFrame: ``feats_df`` followed by one row per frame of ``x``.
    """
    # Do features on a frame basis.
    # DOUBT: should I use window or not? At least for formant estimation I should.
    x_frames = spe_feats.sigproc.framesig(
        x, config.frame_len, config.frame_step, config.win_func)
    nr_frames = x_frames.shape[0]

    # 0) Wavelets
    # TODO
    # DOUBT: if log-energy feature is included, should I also include the
    # first mfcc coefficient (c0)?

    # 1) MFCC, z-score normalized, plus deltas and delta-deltas
    mfcc_feat = spe_feats.mfcc(x, fs,
                               winlen=config.frame_len_s,
                               winstep=config.frame_step_s,
                               numcep=config.cep_num,
                               winfunc=config.win_func)
    mfcc_feat = zscore(mfcc_feat, axis=1, ddof=1)
    mfcc_delta_feat = spe_feats.delta(mfcc_feat, 1)
    mfcc_deltadelta_feat = spe_feats.delta(mfcc_delta_feat, 1)

    # 2) Zero-crossing rate
    zcr_feat = np.apply_along_axis(get_zcr, 1, x_frames)

    # 3) Formant frequencies (LP-coefficient-based method).
    # Some frames are ill-conditioned for LP computing; those are skipped and
    # left as NaN.  The buffer width follows config.nr_formants so that it
    # always matches the column labels built below.
    formants_feat = np.full((nr_frames, config.nr_formants), np.nan)
    for i_frame in range(nr_frames):
        try:
            formants_feat[i_frame] = get_formants(
                x_frames[i_frame], config.lp_ord, config.nr_formants)
        except Exception:
            # Deliberate best effort: keep NaN for this frame, but do not
            # swallow KeyboardInterrupt/SystemExit.
            pass

    # 4) Log-energy
    logEnergy_feat = np.apply_along_axis(get_logEnergy, 1, x_frames)

    # 5) Pitch (F0): RAPT over the whole signal; a per-frame get_F0 estimate
    # is not used.
    # https://github.com/r9y9/pysptk/blob/master
    # Alternative for comparison:
    # pysptk.swipe(x.astype(np.float64), fs=fs, hopsize=config.frame_step,
    #              min=50, max=500, otype="f0")
    # NOTE(review): right frame size??? The RAPT track must have nr_frames
    # entries for the DataFrame below to be built -- confirm.
    F0_feat = pysptk.rapt(x.astype(np.float32), fs=fs,
                          hopsize=config.frame_step, min=50, max=500,
                          otype="f0")

    # 6) Kurtosis
    kurt_feat = np.apply_along_axis(kurtosis, 1, x_frames)

    # 7) Bispectrum Score (BGS)
    # TODO: see PhD thesis for more info on this feature
    # 8) Non-Gaussianity Score (NGS)
    # TODO: see PhD thesis for more info on this feature

    # 9) Skewness as a measure of non-gaussianity (not in paper)
    skew_feat = np.apply_along_axis(skew, 1, x_frames)

    # DOUBT: 10) Shannon entropy is -inf in all cases, WHY???
    # TODO: add small value in all entries, this may fix the problem
    entropy_feat = np.apply_along_axis(get_entropy, 1, x_frames)

    mfcc_cols = ['mfcc_%s' % s for s in range(0, config.cep_num)]
    mfcc_delta_cols = ['mfcc_d%s' % s for s in range(0, config.cep_num)]
    mfcc_deltadelta_cols = ['mfcc_dd%s' % s for s in range(0, config.cep_num)]
    formants_cols = ['F%s' % s for s in range(1, config.nr_formants + 1)]

    feats_segment = pd.concat([
        pd.DataFrame({
            'Id': ID,
            'kurt': kurt_feat,
            'logEnergy': logEnergy_feat,
            'zcr': zcr_feat,
            'F0': F0_feat,
            'skewness': skew_feat,
            'label': label,
            'entropy': entropy_feat
        }),
        pd.DataFrame(mfcc_feat, columns=mfcc_cols),
        pd.DataFrame(mfcc_delta_feat, columns=mfcc_delta_cols),
        pd.DataFrame(mfcc_deltadelta_feat, columns=mfcc_deltadelta_cols),
        pd.DataFrame(formants_feat, columns=formants_cols)
    ], axis=1)

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    feats_df = pd.concat([feats_df, feats_segment],
                         ignore_index=True, sort=False)
    return feats_df
order = 34 frame_window = 512 zerofill_width = 1024 shift_window = 64 pass_const = 0.4 min_pitch = 20 max_pitch = 500 mgcep_gamma = 2 e = 0.0012 #-a 0 -s 16 SPTK.rapt(raw.astype(np.float32), fs=16000, hopsize=shift_window, min=min_pitch, max=max_pitch, otype="pitch") f0[1200:1300] import pipes, os, subprocess, tempfile import numpy as np frame_cmd = 'frame -l {} -p {}'.format(frame_window, shift_window) raw2 = raw.astype('float32') #raw2 = np.arange(5000).astype('float32') p = subprocess.Popen(frame_cmd,
# Scratch walkthrough: run wav2mgcf0 on `raw`, then reproduce its SPTK
# `frame` step by hand through a subprocess.
# NOTE(review): wav2mgcf0 as defined in this file returns three values, so
# this two-name unpacking would raise ValueError -- confirm which version of
# the function is meant.
mgc2, f02 = wav2mgcf0(raw)

# Analysis parameters (same values as the wav2mgcf0 defaults).
order=34
frame_window=512
zerofill_width=1024
shift_window=64
pass_const=0.4
min_pitch=20
max_pitch=500
mgcep_gamma=2
e = 0.0012
#-a 0 -s 16
# The result of this call is not assigned; `f0` below must already exist.
SPTK.rapt(raw.astype(np.float32), fs=16000, hopsize=shift_window, min=min_pitch, max=max_pitch, otype="pitch")
f0[1200:1300]
import pipes, os, subprocess, tempfile
import numpy as np
# Pipe the float32 samples through SPTK's `frame` command and capture its
# raw output bytes in `stdout` (stderr is not piped).
frame_cmd = 'frame -l {} -p {}'.format(frame_window, shift_window)
raw2 = raw.astype('float32')
#raw2 = np.arange(5000).astype('float32')
p = subprocess.Popen(frame_cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate(raw2.tobytes())
def get_f0(waveform, sample_rate, hop_length_seconds=0.01, method='swipe', f0_min=60, f0_max=300):
    """Compute the F0 contour using PYSPTK: https://github.com/r9y9/pysptk/.

    Args:
        waveform (np.array, [T, ]): waveform over which to compute f0
        sample_rate (int > 0): number of samples per second in waveform
        hop_length_seconds (float): hop between analysis frames in seconds;
            converted to samples with ``numseconds_to_numsamples``.
        method (str): one of 'swipe' or 'rapt'; selects the pysptk estimator.
        f0_min: lower F0 bound passed to the estimator.
        f0_max: upper F0 bound passed to the estimator.

    Returns:
        dict: Dictionary containing keys:
            "contour" (np.array, [1, t1]): f0 contour of waveform. Contains
                unvoiced (zero) frames.
            "values" (np.array, [1, t2]): nonzero f0 values of the contour,
                i.e. with all unvoiced frames discarded.
            "mean" (float): mean of the nonzero f0 values.
            "std" (float): standard deviation of the nonzero f0 values.
    """
    assert method in (
        'swipe', 'rapt'), "The method argument should be one of 'swipe' or 'rapt'."

    hop_length = numseconds_to_numsamples(hop_length_seconds, sample_rate)
    estimator_kwargs = dict(
        fs=sample_rate,
        hopsize=hop_length,
        min=f0_min,
        max=f0_max,
        otype="f0",
    )

    if method == 'swipe':
        track = swipe(waveform.astype(np.float64), **estimator_kwargs)
        f0_contour = track[np.newaxis, :]
    elif method == 'rapt':
        # For this estimation, waveform needs to be in the int PCM format.
        pcm = np.round(waveform * 32767).astype(np.float32)
        track = rapt(pcm, **estimator_kwargs)
        f0_contour = track[np.newaxis, :]

    # Remove unvoiced frames, keeping the leading axis of length 1.
    voiced = f0_contour[0, :] != 0
    f0_values = f0_contour[:, voiced]

    voiced_values = f0_values[0]
    return {
        "contour": f0_contour,
        "values": f0_values,
        "mean": np.mean(voiced_values),
        "std": np.std(voiced_values),
    }