def _encode_data(args):
    """Load one audio file and encode it as the feature selected by ``mode``.

    Args:
        args: ``(filename, mode, sr)`` tuple: the audio path, a ``DataMode``
            member selecting the encoding, and the sample rate handed to
            ``librosa.core.load``.

    Returns:
        ``(data, duration)`` where ``data`` is the feature array with a
        trailing singleton axis appended and ``duration`` is the value
        reported by ``librosa.get_duration`` for the trimmed signal.

    Raises:
        AssertionError: if ``mode`` does not match any supported encoding.
    """
    filename, mode, sr = args
    signal, sr = librosa.core.load(filename, sr=sr)
    signal, _ = librosa.effects.trim(signal, top_db=15)
    duration = librosa.get_duration(y=signal, sr=sr)

    if mode.name.startswith('spec'):
        # Log power spectrogram from a 2048-point STFT.
        data = np.log(abs(librosa.core.stft(y=signal, n_fft=2**11))**2)
        data = data[..., np.newaxis]
        glog.debug('spec:: %s, %s', data.shape, data)
    elif mode.name.startswith('ssc'):
        data = ssc(signal, sr, **SSC_CONFIG)
        data = data[..., np.newaxis]
        glog.debug('ssc:: %s, %s', data.shape, data)
    elif mode.name.startswith('mfcc'):
        data = mfcc(signal, sr, **MFCC_CONFIG)
        if mode == DataMode.mfcc_delta:
            data = np.append(data, delta(data, 1), axis=-1)
        elif mode == DataMode.mfcc_ssc:
            # The centroids must come from the audio signal, not from the
            # MFCC matrix, and it is the centroids that get appended.
            # NOTE(review): this needs MFCC_CONFIG and SSC_CONFIG to yield the
            # same number of frames - confirm against the configs.
            data_ssc = ssc(signal, sr, **SSC_CONFIG)
            data = np.append(data, data_ssc, axis=-1)
        data = data[..., np.newaxis]
        glog.debug('mfcc:: %s, %s', data.shape, data)
    elif mode == DataMode.fbank:
        data = logfbank(signal, sr, **FBANK_CONFIG)
        data = data[..., np.newaxis]
        glog.debug('fbank:: %s, %s', data.shape, data)
    else:
        # Raised explicitly so the check survives ``python -O``.
        raise AssertionError('Invald option:: %s' % mode)
    return data, duration
def mfcc_loop(n_first, n_last, grade):
    """Compute SSC and log-filterbank features for a range of sample files.

    Files are read from
    ``C:\\Work\\speech_recognition\\<grade>\\sample_<i>.wav`` for every ``i``
    in ``range(n_first, n_last)``.

    Args:
        n_first: first sample index (inclusive).
        n_last: last sample index (exclusive).
        grade: sub-directory name substituted into the path.

    Returns:
        ``(ssc_features, logfbank_features)``: two lists with one entry per
        file, each entry being the list of per-frame rows returned by ``ssc``
        / ``logfbank``. Both lists are empty when the range is empty.
    """
    # Raw string: the path contains backslashes that are not valid escapes.
    path_template = r"C:\Work\speech_recognition\{}\sample_{}.wav"
    ssc_features = []
    logfbank_features = []
    for index in range(n_first, n_last):
        rate, sig = wav.read(path_template.format(grade, index))
        ssc_features.append(list(ssc(sig, rate, nfft=1103)))
        logfbank_features.append(list(logfbank(sig, rate, nfft=1103)))
    return ssc_features, logfbank_features
def extract_from_signal(fs, signal, nfft):
    """Summarise a signal by per-coefficient mean and std of four features.

    MFCC, filterbank energies, log filterbank energies and spectral subband
    centroids are computed with ``psf`` and each column is reduced to its
    mean and standard deviation.

    Args:
        fs: sample rate passed to the ``psf`` feature functions.
        signal: audio samples.
        nfft: FFT size passed to every ``psf`` call.

    Returns:
        A flat list ordered as mfcc means, mfcc stds, fbank means, fbank
        stds, logfbank means, logfbank stds, ssc means, ssc stds.
    """
    mfcc_feat = psf.mfcc(signal, fs, nfft=nfft)
    # psf.fbank returns a pair; only the first element is summarised.
    fbank_feat = psf.fbank(signal, fs, nfft=nfft)[0]
    logfbank_feat = psf.logfbank(signal, fs, nfft=nfft)
    ssc_feat = psf.ssc(signal, fs, nfft=nfft)

    summary = []
    for feat in (mfcc_feat, fbank_feat, logfbank_feat, ssc_feat):
        columns = range(feat.shape[1])
        summary += [feat[:, i].mean() for i in columns]
        summary += [feat[:, i].std() for i in columns]
    return summary
def extract_features(audio, rate):
    """Build a per-frame feature matrix from four ``mfcc`` module features.

    The result stacks, column-wise and in this order: the ``mfcc.mfcc``
    output, its ``mfcc.delta`` (computed with N=26), the ``mfcc.logfbank``
    output and the ``mfcc.ssc`` output. Every feature call receives the same
    three positional settings ``0.025, 0.01, 26`` and ``nfft=1200``.

    Args:
        audio: audio samples.
        rate: sample rate of ``audio``.

    Returns:
        The horizontally stacked feature array.
    """
    # Positional settings shared by all three feature calls.
    framing = (0.025, 0.01, 26)

    cepstra = mfcc.mfcc(audio, rate, *framing, nfft=1200, preemph=0.97,
                        appendEnergy=True)
    log_energies = mfcc.logfbank(audio, rate, *framing, nfft=1200)
    centroids = mfcc.ssc(audio, rate, *framing, nfft=1200)
    cepstra_delta = mfcc.delta(cepstra, 26)

    return np.hstack((cepstra, cepstra_delta, log_energies, centroids))
def get_feature_from_python_speech_features(wave_name):
    """Extract stacked python_speech_features features from a WAV file.

    The stacked features are, in order: MFCC, delta of MFCC, delta of that
    delta, log filterbank energies and spectral subband centroids, all with
    the library's default settings.

    Args:
        wave_name: path of the WAV file to read.

    Returns:
        The transpose of the horizontally stacked feature matrix.
    """
    from python_speech_features import logfbank
    from python_speech_features import mfcc
    from python_speech_features import delta
    from python_speech_features import ssc
    import scipy.io.wavfile as wav
    import numpy

    rate, sig = wav.read(wave_name)
    mfcc_feat = mfcc(sig, rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    d_d_mfcc_feat = delta(d_mfcc_feat, 2)
    logfbank_feat = logfbank(sig, rate)
    centroids = ssc(sig, rate)
    feat = numpy.hstack(
        (mfcc_feat, d_mfcc_feat, d_d_mfcc_feat, logfbank_feat, centroids))
    # The stacked matrix has one row per frame, so after the transpose each
    # column holds the features of one frame.
    return feat.T
def makeMFCC(name, sampleSize):
    """Average the SSC features of numbered WAV files and plot the result.

    Reads ``<name><i>.wav`` for ``i`` in ``range(sampleSize)``, computes the
    spectral subband centroids of each file and reduces every frame to the
    average of its centroids. Despite the function name, the returned values
    are SSC averages, not MFCCs.

    Args:
        name: path prefix of the WAV files (index and ``.wav`` are appended).
        sampleSize: number of files to read.

    Returns:
        A list with one entry per file; each entry is the list of per-frame
        centroid averages. The same data is plotted and saved to
        ``Result/MFCCaverage.png``.
    """
    frame_averages = [[] for _ in range(sampleSize)]
    for i in range(sampleSize):
        rate, sig = wav.read(name + str(i) + ".wav")
        for frame in ssc(sig, rate, nfft=1103):
            frame_averages[i].append(np.average(frame))
    plt.figure()
    plt.plot(frame_averages)
    plt.savefig("Result/MFCCaverage.png", dpi=300)
    return frame_averages
def get_ssc_feat(file_path,
                 samplerate=16000,
                 winlen=0.025,
                 winstep=0.01,
                 nfilt=26,
                 nfft=2048,
                 lowfreq=0,
                 highfreq=None,
                 preemph=0.97):
    """Compute Spectral Subband Centroid features for a WAV file.

    @param: file_path - path of the WAV file to read.
    @param: samplerate - accepted for interface compatibility but not used;
            the sample rate stored in the file is passed to ``ssc`` instead.
    @param: winlen - third positional argument of ``ssc``. Default is 0.025.
    @param: winstep - fourth positional argument of ``ssc``. Default is 0.01.
    @param: nfilt - fifth positional argument of ``ssc``. Default is 26.
    @param: nfft - sixth positional argument of ``ssc``. Default is 2048.
    @param: lowfreq - seventh positional argument of ``ssc``. Default is 0.
    @param: highfreq - eighth positional argument of ``ssc``. Default is None.
    @param: preemph - ninth positional argument of ``ssc``. Default is 0.97.
    @return: the array returned by ``ssc``.
    """
    file_rate, samples = wf.read(file_path)
    return ssc(samples, file_rate, winlen, winstep, nfilt, nfft, lowfreq,
               highfreq, preemph)
def featuresExtraction_temp(request):
    """Extract per-frame voice features from ``file.wav`` and store them.

    Short-term features from ``audioFeatureExtraction.stFeatureExtraction``
    are extended, frame by frame, with the ``logfbank`` and ``ssc`` rows of
    the same index. One ``voiceFeatures_temp`` row is created per frame for
    the requesting user, and the table is then exported through
    ``convert_modeltocsv_voice``.

    Args:
        request: request object; ``request.user.username`` selects the user.
    """
    rate, data = wavfile.read("file.wav")
    F_vectors, f_names = audioFeatureExtraction.stFeatureExtraction(
        data, rate, 0.050 * rate, 0.025 * rate)
    f_vectors1 = logfbank(data, rate)
    f_vectors3 = ssc(data, rate)

    # After the transpose each row of the short-term matrix is one frame.
    short_term = np.array(np.transpose(F_vectors))
    length = short_term.shape[0]
    F_vectors = []
    for i in range(length):
        row = list(short_term[i])
        row.extend(f_vectors1[i])
        row.extend(f_vectors3[i])
        F_vectors.append(row)
    print(len(F_vectors))
    print(len(F_vectors[0]))
    print(length)

    username = request.user.username
    name = user_list.objects.get(user_name=str(username))
    userid = name.id
    for j in range(length):
        print("j>>", j)
        # create() already returns the saved instance, so no re-fetch needed.
        features = voiceFeatures_temp.objects.create(user_name=str(username),
                                                     user_id=userid,
                                                     frame_index=j)
        # The first 86 values are written to attributes f0..f85
        # (presumably the model's feature columns - confirm against the model).
        for i in range(86):
            setattr(features, "f%d" % i, F_vectors[j][i])
        features.save()

    file = 'test_v.csv'
    table = 'mainpage_voiceFeatures_temp'
    convert_modeltocsv_voice(userid, userid, file, table)
def extract_features():
    """Extract MFCC, log-filterbank and SSC features for a song collection.

    Every immediate sub-directory of the current working directory is treated
    as a genre, and files ``<genre>/<genre>.000NN.wav`` for ``NN`` in 00..99
    are read from it.

    Returns:
        ``(df, y_df)``: ``df`` has columns ``mfcc_feat``, ``fbank_feat`` and
        ``ssc`` whose cells hold the first 2985 frames of each feature matrix;
        ``y_df`` has a ``classification`` column holding the genre name. Both
        frames have 1000 rows; rows beyond the processed songs stay empty.
    """
    df = pd.DataFrame(columns=['mfcc_feat', 'fbank_feat', 'ssc'],
                      index=range(0, 1000))
    y_df = pd.DataFrame(columns=['classification'], index=range(0, 1000))

    # Only the top-level directory names are needed, so take the first entry
    # that os.walk yields and stop there.
    genres = next(os.walk(os.getcwd()))[1]

    song_no = 0
    for genre in genres:
        for x in range(100):
            (rate, sig) = wav.read(genre + "/" + genre + ".000" + "%02d" % x +
                                   '.wav')
            mfcc_feat = mfcc(sig, rate, nfft=551)
            fbank_feat = logfbank(sig, rate, nfft=551)
            sig_temp = np.reshape(sig, (sig.shape[0], 1))
            ssc_var = ssc(sig_temp, rate, nfft=551)

            # Single-step positional assignment; chained indexing
            # (df.iloc[i][j] = ...) may write to a temporary copy.
            df.iat[song_no, 0] = mfcc_feat[0:2985, :]
            df.iat[song_no, 1] = fbank_feat[0:2985, :]
            df.iat[song_no, 2] = ssc_var[0:2985, :]
            y_df.iat[song_no, 0] = genre

            song_no += 1
    return df, y_df
def featuresExtraction(username1):
    """Extract per-frame voice features from ``file.wav`` and store them.

    Short-term features from ``audioFeatureExtraction.stFeatureExtraction``
    are extended, frame by frame, with the ``logfbank`` and ``ssc`` rows of
    the same index. One ``voiceFeatures`` row is created per frame for the
    given user.

    Args:
        username1: mapping whose ``'username1'`` entry is the user name.
    """
    print("sucessssssssssssssss")
    rate, data = wavfile.read("file.wav")
    F_vectors, f_names = audioFeatureExtraction.stFeatureExtraction(
        data, rate, 0.050 * rate, 0.025 * rate)
    f_vectors1 = logfbank(data, rate)
    f_vectors3 = ssc(data, rate)

    # After the transpose each row of the short-term matrix is one frame.
    short_term = np.array(np.transpose(F_vectors))
    length = short_term.shape[0]
    F_vectors = []
    for i in range(length):
        row = list(short_term[i])
        row.extend(f_vectors1[i])
        row.extend(f_vectors3[i])
        F_vectors.append(row)
    print(len(F_vectors))
    print(len(F_vectors[0]))
    print(length)

    username = username1['username1']
    name = user_list.objects.get(user_name=str(username))
    userid = name.id
    for j in range(length):
        print("j>>", j)
        # create() already returns the saved instance, so no re-fetch needed.
        features = voiceFeatures.objects.create(user_name=str(username),
                                                user_id=userid,
                                                frame_index=j)
        # The first 86 values are written to attributes f0..f85
        # (presumably the model's feature columns - confirm against the model).
        for i in range(86):
            setattr(features, "f%d" % i, F_vectors[j][i])
        features.save()
def extract_ssc(self, y, sr, cmn=False):
    """Compute ``ssc`` features with a Hamming window as float32.

    Args:
        y: audio samples.
        sr: sample rate of ``y``.
        cmn: when true, subtract the per-column mean (taken over axis 0)
            from the features before returning them.

    Returns:
        The feature array cast to ``float32``.
    """
    centroids = ssc(y, sr, winfunc=np.hamming, **self.ssc_kwargs)
    if not cmn:
        return centroids.astype('float32')
    column_means = np.mean(centroids, axis=0, keepdims=True)
    centroids -= column_means
    return centroids.astype('float32')
def get_ssc(signal, rate):
    """Return the ``ssc`` features of ``signal`` sampled at ``rate``.

    Thin wrapper that calls ``ssc`` with no further options.
    """
    centroids = ssc(signal, rate)
    return centroids
def _column_stats(features):
    """Return seven summary statistics per column of ``features``, flattened."""
    per_column = []
    for col in range(features.shape[1]):
        values = features[:, col]
        per_column.append(
            np.array([
                np.mean(values),
                np.var(values),
                np.amax(values),
                np.amin(values),
                scipy.stats.kurtosis(values),
                scipy.stats.skew(values),
                scipy.stats.iqr(values)
            ]))
    return np.array(per_column).flatten()


def featurex(filepath):
    """Summarise a WAV file as a flat vector of per-coefficient statistics.

    Four feature matrices are computed: MFCC, delta of MFCC, spectral
    subband centroids, and delta of the MFCC delta. Every column of every
    matrix is reduced to mean, variance, max, min, kurtosis, skewness and
    inter-quartile range.

    Args:
        filepath: path of the WAV file to read.

    Returns:
        A 1-D array: the statistics of the MFCC columns, then the delta
        columns, then the SSC columns, then the delta-delta columns.
    """
    (rate, X) = wav.read(filepath)
    ceps = mfcc(X, rate)
    delt = delta(ceps, 2)
    sscz = ssc(X, rate)
    filt = delta(delt, 2)
    return np.concatenate(
        [_column_stats(matrix) for matrix in (ceps, delt, sscz, filt)])
def generateStage2FFT(frameSize,
                      modFrameSize,
                      modWindowSize,
                      nModFrames,
                      p,
                      fft1matrix,
                      modWin,
                      form="magnitude",
                      nFilts=30,
                      nMFCCs=15):
    """
    This function takes an input spectrogram and returns a tensor modulation spectrum.
    Note form can be "magnitude", "complex", "real", "fbank" or "mfcc", which describes
    what is done to each windowed column of the spectrogram.

    Inputs:
        - frameSize: the step size of the acoustic windows in seconds, form "0.001"
        - modFrameSize: the step size of the modulation windows in seconds, form "0.1"
        - modWindowSize: the modulation window size in seconds, form "1"
        - nModFrames: the number of modulation frames that the speech signal is broken up into
        - p: the number of columns of fft1matrix that are analysed
        - fft1matrix: a 2D matrix size [nFrames x p] that contains the FFT of each acoustic window
        - modWin: the modulation window as a numpy array
        - form: a choice of "magnitude", "complex", "real", "fbank" or "mfcc" which
          determines the content of fft2matrix
        - nFilts: number of filters used by the "fbank" and "mfcc" forms
        - nMFCCs: number of cepstral coefficients used by the "mfcc" form

    Outputs:
        - q: the number of values kept per acoustic column and modulation frame
        - fft2matrix: a 3D array size [nModFrames x p x q] (complex for form "complex")
        - logfft2matrix: a 3D array of the same shape holding 20*log10 of fft2matrix
          (10*log10 of the squared magnitude for form "complex")
        - freqs2: only returned for form "fbank", taken from ``ssc`` on the first
          column of fft1matrix

    python_speech_features is only imported for the "fbank" and "mfcc" forms.
    An unrecognised form prints a message for every processed column and leaves
    fft2matrix at zero.
    """
    import math
    import warnings

    import numpy as np
    from scipy.fftpack import fft

    if form in ("fbank", "mfcc"):
        # Optional dependency: only these two forms need it.
        from python_speech_features import fbank, mfcc, ssc
        nfft = 2**math.ceil(np.log2(round(modWindowSize / frameSize)))

    if form == "fbank":
        q = nFilts
    elif form == "mfcc":
        q = nMFCCs
    else:
        q = round(modWindowSize / (2 * frameSize) + 1)  # After applying Nyquist

    # np.complex128 is what the alias np.complex_ referred to; the alias was
    # removed in NumPy 2.0.
    out_dtype = np.complex128 if form == "complex" else np.float64
    fft2matrix = np.zeros((nModFrames, p, q), dtype=out_dtype)
    logfft2matrix = np.zeros((nModFrames, p, q))
    scale2 = modWin.sum()

    # Loop invariants: step and length of a modulation window in frames.
    step = int(modFrameSize / frameSize)
    window_len = round(modWindowSize / frameSize)

    for i in range(nModFrames):
        if i % 100 == 0:
            print("{:,}".format(i), end="\r")
        start = i * step
        for j in range(p):
            sigExtract2 = fft1matrix[start:start + window_len, j]
            sigWin2 = fft(sigExtract2 * modWin) / scale2
            if form == "magnitude":
                fft2matrix[i, j, :] = np.absolute(sigWin2)[:q]
            elif form == "complex":
                fft2matrix[i, j, :] = sigWin2[:q]
            elif form == "real":
                fft2matrix[i, j, :] = np.real(sigWin2)[:q]
            elif form == "fbank":
                fft2matrix[i, j, :] = fbank(
                    sigExtract2 * modWin,
                    samplerate=round(1 / frameSize),
                    winlen=modWindowSize,
                    winstep=modFrameSize,
                    nfilt=nFilts,
                    nfft=nfft,
                    lowfreq=0,
                    highfreq=round(1 / (2 * frameSize)),
                    preemph=0)[0][:q]
            elif form == "mfcc":
                fft2matrix[i, j, :] = mfcc(
                    sigExtract2 * modWin,
                    samplerate=round(1 / frameSize),
                    winlen=modWindowSize,
                    winstep=modFrameSize,
                    numcep=nMFCCs,
                    nfilt=nFilts,
                    nfft=nfft,
                    lowfreq=0,
                    highfreq=round(1 / (2 * frameSize)),
                    preemph=0,
                    ceplifter=22,
                    appendEnergy=False)[0][:q]
            else:
                print(
                    "Form must be \"magnitude\", \"complex\", \"real\", \"fbank\" or \"mfcc\"."
                )

            if form != "complex":
                logfft2matrix[i, j, :] = 20 * np.log10(fft2matrix[i, j, :])[:q]
            else:
                # The product with the conjugate is complex with zero imaginary
                # part; the cast to float on assignment warns, hence the filter.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    logfft2matrix[i, j, :] = 10 * np.log10(
                        fft2matrix[i, j, :] * np.conj(fft2matrix[i, j, :]))[:q]

    print("fft2matrix shape is {}".format(fft2matrix.shape))

    if form == "fbank":
        freqs2 = ssc(np.array(fft1matrix[0, :round(modWindowSize / frameSize)]),
                     samplerate=(1 / frameSize),
                     winlen=modWindowSize,
                     winstep=modFrameSize,
                     nfilt=nFilts,
                     nfft=2048,
                     lowfreq=0,
                     highfreq=round(1 / (2 * frameSize)),
                     preemph=0)[0][:q]
        return q, fft2matrix, logfft2matrix, freqs2
    else:
        return q, fft2matrix, logfft2matrix
# Compare the psf and csf implementations feature by feature. print() is
# called with a single argument so the output is the same on Python 2 and 3.
error2d(psf_feat, csf_feat)
print('Energy')
error1d(psf_energy, csf_energy)
print('')

print('logfbank')
print('========')
psf_feat = psf.logfbank(audio)
csf_feat = csf.logfbank(audio)
assert (np.shape(psf_feat) == np.shape(csf_feat))
error2d(psf_feat, csf_feat)
print('')

print('ssc')
print('===')
psf_ssc = psf.ssc(audio)
csf_ssc = csf.ssc(audio)
assert (np.shape(psf_ssc) == np.shape(csf_ssc))
error2d(psf_ssc, csf_ssc)
print('')

print('hz2mel')
print('======')
assert (get_error(psf.hz2mel(8000), csf.hz2mel(8000)) <= acceptable_error)
assert (get_error(psf.hz2mel(16000), csf.hz2mel(16000)) <= acceptable_error)
# Round trip: converting 8000 to mel and back must give 8000 again.
assert (get_error(csf.mel2hz(csf.hz2mel(8000)), 8000) <= acceptable_error)
print(' ✓')
print('')

print('mel2hz')
print('======')
# Number of whole windows of t seconds contained in data1.
points_data1 = floor(data1.shape[0] / fs / t)
# Truncate both signals so that they hold a whole number of windows.
data0 = data0[:points_data0 * fs * t]
data1 = data1[:points_data1 * fs * t]
# MFCCs over non-overlapping windows: window length and step are both t,
# and the FFT size equals the window length in samples.
mfcc_0 = mfcc(data0, fs, winlen=t, nfft=t * fs, winstep=t)
mfcc_1 = mfcc(data1, fs, winlen=t, nfft=t * fs, winstep=t)
# Rows of data0 come first, followed by the rows of data1.
mfcc_feat = np.concatenate((mfcc_0, mfcc_1))
# =============================================================================
# fbank_0 = logfbank(data0,fs,winlen=t,nfft=t*fs,winstep=t)
# fbank_1 = logfbank(data1,fs,winlen=t,nfft=t*fs,winstep=t)
# fbank_feat = np.concatenate((fbank_0,fbank_1))
# =============================================================================
# With hop = 0.5 the FFT size passed to ssc is twice the window length in
# samples.
hop = 0.5
sc_feat_0 = ssc(data0, fs, winlen=t, nfft=int((t * fs) / hop), winstep=t)
sc_feat_1 = ssc(data1, fs, winlen=t, nfft=int((t * fs) / hop), winstep=t)
sc_feat = np.concatenate((sc_feat_0, sc_feat_1))
# =============================================================================
# rms_feat = np.array([])
# points = fs*t
# data_ampl = np.abs(np.fft.fft(data0))
# data_ampl = data_ampl[1:]
# data_energy = data_ampl ** 2
# energy = np.append(data_energy,data_energy[-1])
# energy = energy.reshape((floor(points),-1))
# rms = librosa.feature.rmse(S=energy)
# rms = rms.T
# rms_feat = np.append(rms_feat,rms)
# data_ampl = np.abs(np.fft.fft(data1))
def create_dataset_csv(csv_dir, test_audio_name='test_audio.wav'):
    """Turn one test WAV file into the CSV files of a 'test' dataset.

    The file ``lpw_dir + test_audio_name`` is read, MFCC (13), log filterbank
    (26) and SSC (26) features are concatenated per frame, normalised with
    the mean/std stored in ``lpw_dir + 'saved_param/wav_mean_std.csv'`` and
    written below ``csv_dir + <audio name without extension> + '/'``:

    - ``err.txt``: log of missing files and the FPS used
    - ``wav_mean_std.csv``: the mean and std that were applied
    - ``wav_raw.csv``: the normalised feature rows
    - ``test/wav.csv``: for every row, a window of ``win_size`` row indices
      centred on it, clamped at both ends
    - ``test/clip_len.csv`` and ``test/file_dir.csv``: per-clip bookkeeping

    Args:
        csv_dir: output root directory; the audio name is appended to it.
        test_audio_name: file name of the WAV file inside ``lpw_dir``.
    """
    loaded_data = dict()
    loaded_data['wav'] = []
    loaded_data['phoneme'] = []
    loaded_data['landmark'] = []
    loaded_data['maya_pos'] = []
    loaded_data['maya_param'] = []
    loaded_data['face_close'] = []
    loaded_data['face_open'] = []
    loaded_data['pose'] = []
    loaded_data['file_len'] = {'train': 0, 'test': 0}
    loaded_data['clip_len'] = {'train': [], 'test': []}
    loaded_data['file_dir'] = {'train': [], 'test': []}

    dataset_type_order = ['test']

    csv_dir += test_audio_name[:-4] + '/'
    try_mkdir(csv_dir)
    try_mkdir(csv_dir + 'test/')

    # The log file is only written while clips are processed; the context
    # manager guarantees it is flushed and closed afterwards.
    with open(csv_dir + 'err.txt', 'w') as errf:
        for dataset_type_i in range(0, 1):  # all from train file list
            dataset_type = dataset_type_order[dataset_type_i]
            file_list = {'n': 1, 'wav': [lpw_dir + test_audio_name]}

            for nClip in range(0, file_list['n']):
                print(
                    '\n==================== Processing file {:} ===================='
                    .format(file_list["wav"][nClip]))

                if (not os.path.isfile(file_list["wav"][nClip])):
                    print('# ' + str(nClip) + ' None existing file: ' +
                          file_list["wav"][nClip])
                    errf.write('# ' + str(nClip) + ' None existing file: ' +
                               file_list["wav"][nClip] + '\n')
                    continue

                # WAV
                (rate, sig) = wav.read(file_list["wav"][nClip])
                if (sig.ndim > 1):
                    sig = sig[:, 0]  # pick mono-acoustic track
                else:
                    print('Notice: ' + file_list["wav"][nClip] +
                          ' is mono-track')

                # fps = (nLandmark + 1) / (sig.shape[0] / rate)
                fps = 25
                errf.write(file_list["wav"][nClip] +
                           'FPS: {:} \n'.format(fps))
                print('FPS: {:}'.format(fps))

                winstep = 1.0 / fps / mfcc_win_step_per_frame / up_sample_rate
                mfcc_feat = mfcc(sig,
                                 samplerate=rate,
                                 winlen=0.025,
                                 winstep=winstep,
                                 numcep=13)
                logfbank_feat = logfbank(sig,
                                         samplerate=rate,
                                         winlen=0.025,
                                         winstep=winstep,
                                         nfilt=26)
                ssc_feat = ssc(sig,
                               samplerate=rate,
                               winlen=0.025,
                               winstep=winstep,
                               nfilt=26)
                full_feat = np.concatenate(
                    [mfcc_feat, logfbank_feat, ssc_feat], axis=1)
                # full_feat = logfbank_feat

                # Drop the tail that does not fill a whole output frame, then
                # fold the feature windows of one frame into a single row.
                nFrames_represented_by_wav = math.floor(
                    full_feat.shape[0] / mfcc_win_step_per_frame /
                    up_sample_rate)
                mfcc_lines = full_feat[0:nFrames_represented_by_wav *
                                       mfcc_win_step_per_frame *
                                       up_sample_rate, :].reshape(
                                           int(nFrames_represented_by_wav *
                                               up_sample_rate),
                                           int(full_feat.shape[1] *
                                               mfcc_win_step_per_frame))

                # ==================== cut the tail of lpw to make sure they are in same length ==================== #
                # print("Original length of lpw + maya_param/pos: " + str(nFrames_represented_by_wav))
                aligned_length_wav = mfcc_lines

                # ==================== process each lpw file ==================== #
                npWav = np.array(aligned_length_wav)
                print("Load #Clip {:d}/{:}, wav {:}".format(
                    nClip, file_list['n'], npWav.shape))

                loaded_data['wav'].append(npWav)

                # length of each dataset_type
                loaded_data['file_len'][dataset_type] += npWav.shape[0]
                loaded_data['clip_len'][dataset_type].append(npWav.shape[0])
                # NOTE(review): [28:-4] strips a fixed-length prefix and the
                # extension from the path - confirm that 28 matches lpw_dir.
                loaded_data['file_dir'][dataset_type].append(
                    file_list["wav"][nClip][28:-4] + ' ' +
                    str(loaded_data['file_len'][dataset_type] -
                        npWav.shape[0]) + ' ' + str(npWav.shape[0]))
            # end for nClip loop
        # end for dataset_type loop

    # ==================== save file ==================== #
    key_order = ['wav']
    for key_i in range(0, 1):
        key = key_order[key_i]
        # print(key)

        # ==================== wav normalize file ==================== #
        npKey = loaded_data[key][0]
        for i in range(1, len(loaded_data[key])):
            npKey = np.concatenate((npKey, loaded_data[key][i]), axis=0)

        # Use saved std & mean: the first 65 values are the means and the
        # next 65 the stds (13 + 26 + 26 = 65 feature columns).
        mean_std = np.loadtxt(lpw_dir + 'saved_param/wav_mean_std.csv')
        npKey_mean = mean_std[0:65]
        npKey_std = mean_std[65:130]

        def normal_data(loaded_data, mean, std):
            normed = (loaded_data - mean) / std
            return normed

        npKey = normal_data(npKey, npKey_mean, npKey_std)

        np.savetxt(csv_dir + key + '_mean_std.csv',
                   np.append(npKey_mean, npKey_std),
                   fmt='%.5f',
                   delimiter=' ')
        np.savetxt(csv_dir + key + '_raw.csv',
                   npKey,
                   fmt='%.5f',
                   delimiter=' ')
        del npKey

        def reshape_based_on_win_size(loaded_data, i, win_size, start_idx):
            # Builds, for every row of clip i, a window of win_size row
            # indices; the padding repeats the first and last index.
            npWav = (loaded_data[i] - npKey_mean) / npKey_std
            listWav = list(range(start_idx, start_idx + npWav.shape[0]))
            half_win_size = int(win_size / 2)
            pad_head = [start_idx for _ in range(half_win_size)]
            pad_tail = [listWav[-1] for _ in range(half_win_size)]
            pad_npWav = np.array(pad_head + listWav + pad_tail)
            npKey = np.zeros(shape=(npWav.shape[0], win_size))
            for np_i in range(0, npWav.shape[0]):
                npKey[np_i] = pad_npWav[np_i:np_i + win_size].reshape(
                    1, win_size)
            return npKey

        npKey = reshape_based_on_win_size(loaded_data['wav'], 0, win_size, 0)
        for i in range(1, len(loaded_data[key])):
            npKeytmp = reshape_based_on_win_size(loaded_data['wav'], i,
                                                 win_size, npKey.shape[0])
            npKey = np.concatenate((npKey, npKeytmp), axis=0)

        idx = 0
        for dataset_type_i in range(0, 1):
            dataset_type = dataset_type_order[dataset_type_i]
            dataset_type_data_len = loaded_data['file_len'][dataset_type]
            cur_npKey = npKey[idx:idx + dataset_type_data_len]
            print('Save {:} - {:} file as shape of {:}'.format(
                dataset_type, key, cur_npKey.shape))
            np.savetxt(csv_dir + dataset_type + '/' + key + '.csv',
                       cur_npKey,
                       fmt='%d',
                       delimiter=' ')
            idx += dataset_type_data_len

    for dataset_type in {'test'}:
        npLen = np.array(loaded_data['clip_len'][dataset_type])
        np.savetxt(csv_dir + dataset_type + '/clip_len.csv',
                   npLen,
                   fmt='%d',
                   delimiter=' ')
        # print("Saved clip length file to " + dataset_type + '/clip_len.csv')

        npLen = np.array(loaded_data['file_dir'][dataset_type])
        np.savetxt(csv_dir + dataset_type + '/file_dir.csv',
                   npLen,
                   fmt='%s',
                   delimiter=' ')
# Each line of the list file holds two whitespace-separated fields: the WAV
# file to read and the feature file name.
with open(sFList, 'r') as fList:
    lWavFiles = fList.read().splitlines()
for sLine in lWavFiles:
    sWavFile, sFeatureFile = sLine.split()
    print(sWavFile)
    iRate, lSamples = wav.read(sWavFile)
    print(sWavFile, end='\r')
    # Creating features
    if sFeatureType == 'mfcc':
        aFeatures = mfcc(lSamples, iRate)
    elif sFeatureType == 'fbank':
        # NOTE(review): python_speech_features' fbank returns a
        # (features, energy) pair, so aFeatures is a tuple here and the
        # delta() calls below would not receive an array - confirm and unpack.
        aFeatures = fbank(lSamples, iRate)
    elif sFeatureType == 'lfbank':
        aFeatures = logfbank(lSamples, iRate, nfilt=iSize)
    elif sFeatureType == 'ssc':
        aFeatures = ssc(lSamples, iRate)
    else:
        # NOTE(review): the message prints the literal text "sFeatureType"
        # instead of the offending value.
        print('Error: Unknown Feature Type sFeatureType')
        sys.exit(1)
    # Computing time derivatives: 'D' appends the delta, 'A' appends the
    # delta and the delta of the delta.
    if sDrivatives == 'D':
        aDFeatures = delta(aFeatures, iDeltaWindow)
        aFeatures = np.c_[aFeatures, aDFeatures]
    elif sDrivatives == 'A':
        aDFeatures = delta(aFeatures, iDeltaWindow)
        aAFeatures = delta(aDFeatures, iAccWindow)
        aFeatures = np.c_[aFeatures, aDFeatures, aAFeatures]
    elif sDrivatives == 'T':
        # This branch appears to continue beyond the visible excerpt.
        aDFeatures = delta(aFeatures, iDeltaWindow)
        aAFeatures = delta(aDFeatures, iAccWindow)
def pspeech_featurize(file):
    """Summarise an audio file by statistics of three feature matrices.

    MFCC, log filterbank and SSC features are computed and each is reduced,
    column by column, to mean, standard deviation, maximum, minimum and
    median. An ``.mp3`` input is first converted with ffmpeg to a ``.wav``
    file next to it, which is removed again afterwards.

    Args:
        file: path of the ``.wav`` or ``.mp3`` file.

    Returns:
        ``(features, labels)``: a 1-D array ordered as the five statistics of
        the MFCCs, then of the filterbank, then of the centroids, and the
        matching labels produced by ``get_labels``.
    """
    import subprocess

    # convert if .mp3 to .wav or it will fail
    convert = False
    if file[-4:] == '.mp3':
        convert = True
        wav_file = file[0:-4] + '.wav'
        # Argument list instead of a shell string, so paths with spaces or
        # shell metacharacters are handed to ffmpeg verbatim.
        subprocess.run(['ffmpeg', '-i', file, wav_file])
        file = wav_file

    (rate, sig) = wav.read(file)
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)
    ssc_feat = ssc(sig, rate)

    statistics = (
        ('means', np.mean),
        ('stds', np.std),
        ('max', np.amax),
        ('min', np.amin),
        ('medians', np.median),
    )
    feature_sets = (
        ('mfcc_', mfcc_feat),
        ('fbank_', fbank_feat),
        ('spectral_centroid_', ssc_feat),
    )

    label_groups = []
    value_groups = []
    for prefix, feat in feature_sets:
        # get_labels receives the mean vector of the feature set for every
        # statistic, matching the previous call pattern.
        reference = np.mean(feat, axis=0)
        for suffix, reduce_columns in statistics:
            # Each statistic is computed with its own reduction; the fbank
            # and centroid groups previously all held the column means.
            value_groups.append(reduce_columns(feat, axis=0))
            label_groups.append(get_labels(reference, prefix, suffix))

    labels = label_groups[0]
    for group in label_groups[1:]:
        labels = labels + group
    features = np.concatenate(value_groups)

    if convert:
        os.remove(file)

    print(features.shape)
    print(len(labels))
    return features, labels
def predict(self, rate, sig, group_list):
    """Predict the phoneme group for an audio signal.

    MFCC (13), log filterbank (26) and SSC (26) features are concatenated per
    frame, normalised with the mean/std loaded from ``utl/mean_std.txt``, and
    fed to the session as the first item of a batch padded with copies of
    itself.

    Args:
        rate: sample rate of ``sig``.
        sig: audio samples.
        group_list: sequence indexed by the predicted class id.

    Returns:
        ``group_list`` entry at the arg-max of the first batch prediction.
    """
    fps = 25
    winstep = 1.0 / fps / up_sample_rate
    framing = dict(samplerate=rate, winlen=0.025, winstep=winstep)

    mfcc_feat = mfcc(sig, numcep=13, **framing)
    logfbank_feat = logfbank(sig, nfilt=26, **framing)
    ssc_feat = ssc(sig, nfilt=26, **framing)
    full_feat = np.concatenate([mfcc_feat, logfbank_feat, ssc_feat], axis=1)

    # normalize wav-raw
    mean, std = np.loadtxt('utl/mean_std.txt')
    wav_raw = (np.array(full_feat) - mean) / std

    # grouping: fill the batch up with references to the same sample
    n_batch = 1
    batch = [wav_raw]
    batch += [wav_raw] * (n_batch * batch_size - len(batch))
    batch_x = np.array(batch)

    state_test = self.sess.run(self.initial_state)
    # Only the first batch entry gets a non-zero sequence length.
    seq_len = np.array([n_steps] + [0] * (batch_size - 1))
    feed = {
        self.x: batch_x,
        self.phase: False,
        self.dropout: 0,
        self.initial_state: state_test,
        self.seq_len: seq_len
    }
    batch_y, _ = self.sess.run([self.pred, self.final_state], feed_dict=feed)

    scores = np.array(batch_y[0])
    best_index = np.argmax(scores)
    return group_list[best_index]
def _column_stats(features):
    """Return seven summary statistics per column of ``features``, flattened."""
    per_column = []
    for col in range(features.shape[1]):
        values = features[:, col]
        per_column.append(
            np.array([
                np.mean(values),
                np.var(values),
                np.amax(values),
                np.amin(values),
                scipy.stats.kurtosis(values),
                scipy.stats.skew(values),
                scipy.stats.iqr(values)
            ]))
    return np.array(per_column).flatten()


def featurex(filepath):
    """Summarise an audio file as a flat vector of per-coefficient statistics.

    The file is loaded with librosa at 48 kHz. Four feature matrices are
    computed with a 2048-point FFT where applicable: MFCC, delta of MFCC,
    spectral subband centroids, and delta of the MFCC delta. Every column of
    every matrix is reduced to mean, variance, max, min, kurtosis, skewness
    and inter-quartile range.

    Args:
        filepath: path of the audio file to load.

    Returns:
        A 1-D array: the statistics of the MFCC columns, then the delta
        columns, then the SSC columns, then the delta-delta columns.
    """
    (X, rate) = librosa.load(filepath, sr=48000)
    ceps = mfcc(X, rate, nfft=2048)
    delt = delta(ceps, 2)
    sscz = ssc(X, rate, nfft=2048)
    filt = delta(delt, 2)
    return np.concatenate(
        [_column_stats(matrix) for matrix in (ceps, delt, sscz, filt)])
def generateStage1FFT(fs,
                      sig,
                      frameSize,
                      windowSize,
                      nFrames,
                      acWin,
                      form="magnitude",
                      phases=False,
                      nFilts=40,
                      nMFCCs=19):
    """
    This function takes an input speech signal and returns the STFT.
    Specify form="magnitude" to generate Hilbert transform later - the Hilbert transform only works on real signals.
    Note the spectrum is divided by the sum of the window, as that is in SciPy STFT
    and seems necessary for signal reconstruction.

    Inputs:
        - fs: the sampling frequency of the speech files, form "16000"
        - sig: the speech signal to analyse
        - frameSize: the step size of the acoustic windows in seconds, form "0.001"
        - windowSize: the acoustic window size in seconds, form "0.03"
        - nFrames: the number of acoustic frames that the speech signal is broken up into
        - acWin: the acoustic window as a numpy array
        - form: a choice of "magnitude", "complex", "real", "fbank" or "mfcc" which
          determines the content of fft1matrix
        - phases: whether to return the phase information as a separate matrix, form "True"
        - nFilts: number of filters used by the "fbank" and "mfcc" forms
        - nMFCCs: number of cepstral coefficients used by the "mfcc" form

    Outputs:
        - p: the number of values kept per acoustic window
        - fft1matrix: a 2D matrix size [nFrames x p] that contains the FFT of each acoustic window
        - fft1matrixphases: the phase of each element, only returned if phases is true
        - freqs1: only returned for form "fbank" when phases is false, taken from
          ``ssc`` on the first window of the signal

    Frames that are shorter than a full window (at the end of the signal) are
    skipped and their rows stay zero. python_speech_features is only imported
    for the "fbank" and "mfcc" forms.
    """
    import math

    import numpy as np
    from scipy.fftpack import fft

    if form in ("fbank", "mfcc"):
        # Optional dependency: only these two forms need it.
        from python_speech_features import fbank, mfcc, ssc
        nfft = 2**math.ceil(np.log2(round(windowSize * fs)))

    if form == "fbank":
        p = nFilts
    elif form == "mfcc":
        p = nMFCCs
    else:
        # p is the number of acoustic frequency bins after limiting with Nyquist
        p = round(windowSize * fs / 2 + 1)

    # np.complex128 is what the alias np.complex_ referred to; the alias was
    # removed in NumPy 2.0.
    out_dtype = np.complex128 if form == "complex" else np.float64
    fft1matrix = np.zeros((nFrames, p), dtype=out_dtype)
    scale1 = acWin.sum()
    if phases:
        fft1matrixphases = np.zeros((nFrames, p))

    # Loop invariants: step and length of an acoustic window in samples.
    step = round(frameSize * fs)
    window_len = round(windowSize * fs)

    for i in range(nFrames):
        # Ignoring pre-emphasis filter here
        sigExtract = np.array(sig[i * step:i * step + window_len])
        if len(sigExtract) != window_len:
            # Incomplete frame: windowing it would fail on the length
            # mismatch with acWin, so it is skipped before any computation.
            continue

        spectrum = fft(sigExtract * acWin)
        sigWin = spectrum / scale1
        if form == "magnitude":
            fft1matrix[i, :] = np.absolute(sigWin)[:p]
        elif form == "complex":
            fft1matrix[i, :] = sigWin[:p]
        elif form == "real":
            fft1matrix[i, :] = np.real(sigWin)[:p]
        elif form == "fbank":
            fft1matrix[i, :] = fbank(sigExtract * acWin,
                                     samplerate=fs,
                                     winlen=windowSize,
                                     winstep=frameSize,
                                     nfilt=nFilts,
                                     nfft=nfft,
                                     lowfreq=0,
                                     highfreq=int(fs / 2),
                                     preemph=0)[0][:p]
        elif form == "mfcc":
            fft1matrix[i, :] = mfcc(sigExtract * acWin,
                                    samplerate=fs,
                                    winlen=windowSize,
                                    winstep=frameSize,
                                    numcep=nMFCCs,
                                    nfilt=nFilts,
                                    nfft=nfft,
                                    lowfreq=0,
                                    highfreq=int(fs / 2),
                                    preemph=0,
                                    ceplifter=22,
                                    appendEnergy=False)[0][:p]
        else:
            print(
                "Form must be \"magnitude\", \"complex\", \"real\", \"fbank\" or \"mfcc\"."
            )
        if phases:
            fft1matrixphases[i, :] = np.angle(spectrum)[:p]

    print("fft1matrix shape is {}".format(fft1matrix.shape))

    if phases:
        return p, fft1matrix, fft1matrixphases
    elif form == "fbank":
        # NOTE(review): nfilt and nfft are fixed at 40 and 512 here rather
        # than following nFilts / the window length - confirm this is intended.
        freqs1 = ssc(np.array(sig[:round(windowSize * fs)]),
                     samplerate=fs,
                     winlen=windowSize,
                     winstep=frameSize,
                     nfilt=40,
                     nfft=512,
                     lowfreq=0,
                     highfreq=round(fs / 2),
                     preemph=0)[0][:p]
        return p, fft1matrix, freqs1
    else:
        return p, fft1matrix