def test_cmvn(self):
    feature_vector = np.random.rand(50, 100)
    normalized_feature = processing.cmvn(feature_vector, variance_normalization=True)

    # Shape match
    assert normalized_feature.shape == feature_vector.shape

    # Check the mean and std of the output vector
    assert np.allclose(np.mean(normalized_feature, axis=0),
                       np.zeros((1, normalized_feature.shape[1])))
    assert np.allclose(np.std(normalized_feature, axis=0),
                       np.ones((1, normalized_feature.shape[1])))
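# For reference, a minimal NumPy sketch of the normalization this test expects,
# assuming processing.cmvn (speechpy) subtracts the per-dimension mean over the
# time axis and optionally divides by the per-dimension standard deviation.
# manual_cmvn is an illustrative helper, not part of the library.
import numpy as np

def manual_cmvn(vec, variance_normalization=True):
    # vec: (num_frames, num_features); statistics are taken over axis 0 (time).
    centered = vec - np.mean(vec, axis=0)
    if variance_normalization:
        centered = centered / np.std(vec, axis=0)
    return centered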
def compute_fbank_feat(filename, nfilt=c.FILTER_BANK, use_logscale=c.USE_LOGSCALE,
                       use_energy=True, add_energy=True, normalize=c.CMVN, vad=c.VAD):
    """
    Compute filter-bank features, making them more like Kaldi's.

    :param filename: path to the wav file.
    :param nfilt: number of mel filter banks.
    :param use_logscale: apply a log scale to the filter banks.
    :param use_energy: include the frame energies in the features.
    :param add_energy: prepend the energy as an extra dimension; otherwise replace the first one.
    :param normalize: normalization method, 'cmvn' or 'cmvnw'.
    :param vad: apply energy-based voice activity detection.
    :return: normalized filter banks (plus the VAD decisions when vad is enabled).
    """
    if not os.path.exists(filename):
        raise ValueError('Wav file does not exist.')

    sample_rate, audio = wavfile.read(filename)

    # Pad the signal so the last frame is complete (25 ms window, 10 ms stride).
    pad_size = (np.ceil((len(audio) - 0.025 * sample_rate) / (0.01 * sample_rate))
                * 0.01 * sample_rate - len(audio) + 0.025 * sample_rate)
    audio = np.lib.pad(audio, (0, int(pad_size)), 'symmetric')

    filter_banks, energies = mfe(audio, sample_rate, frame_length=0.025, frame_stride=0.01,
                                 num_filters=nfilt, fft_length=512,
                                 low_frequency=0, high_frequency=None)

    if use_energy:
        energies = energies.reshape(energies.shape[0], 1)
        if add_energy:
            # Prepend the energy as an extra feature dimension.
            filter_banks = np.concatenate((energies, filter_banks), axis=1)
        else:
            # Replace the first dimension with the energy.
            filter_banks[:, 0] = energies[:, 0]

    if use_logscale:
        filter_banks = np.log(np.maximum(filter_banks, 1e-5))

    if normalize == 'cmvn':
        # vec (array): input feature matrix of size (num_observation, num_features).
        norm_fbank = cmvn(vec=filter_banks, variance_normalization=True)
    elif normalize == 'cmvnw':
        norm_fbank = cmvnw(vec=filter_banks, win_size=301, variance_normalization=True)
    else:
        # No normalization requested; pass the filter banks through unchanged.
        norm_fbank = filter_banks

    if use_energy and vad:
        voiced = []
        ComputeVadEnergy(filter_banks, voiced)
        voiced = np.array(voiced)
        voiced_index = np.argwhere(voiced == 1).squeeze()
        norm_fbank = norm_fbank[voiced_index]
        return norm_fbank, voiced

    return norm_fbank
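# A hedged usage sketch of the function above ('example.wav' is a hypothetical
# path; the keyword values simply override the config defaults for illustration):
if __name__ == '__main__':
    feats, voiced = compute_fbank_feat('example.wav', nfilt=40, use_logscale=True,
                                       normalize='cmvn', vad=True)
    print('normalized fbank shape:', feats.shape)  # (num_voiced_frames, nfilt + 1), energy prepended
    print('VAD decisions shape:', voiced.shape)    # (num_frames,)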
def compute_fbank(file, debug=True):
    sr, signal = wav.read(file)
    if debug:
        print('signal shape: ', signal.shape)

    # Pre-emphasizing.
    signal_preemphasized = processing.preemphasis(signal, cof=data_config.preemphasis)

    # Stacking frames
    frames = processing.stack_frames(signal_preemphasized, sampling_frequency=sr,
                                     frame_length=data_config.window_size,
                                     frame_stride=data_config.hop_size,
                                     zero_padding=True)

    # Extracting power spectrum
    power_spectrum = processing.power_spectrum(frames, fft_points=512)  # num_frames x fft_length
    if debug:
        print('power spectrum shape=', power_spectrum.shape)

    ############# Extract fbank features #############
    log_fbank = feature.lmfe(signal_preemphasized, sampling_frequency=sr,
                             frame_length=data_config.window_size,
                             frame_stride=data_config.hop_size,
                             num_filters=data_config.num_mels, fft_length=512,
                             low_frequency=0, high_frequency=None)  # num_frames x num_filters

    if data_config.apply_cmvn:
        # Cepstral mean and variance normalization.
        log_fbank_cmvn = processing.cmvn(log_fbank, variance_normalization=True)
        if debug:
            print('fbank (mean + variance normalized) feature shape=', log_fbank_cmvn.shape)
        log_fbank = log_fbank_cmvn  # num_frames x num_filters

    # Extracting derivative features
    log_fbank = feature.extract_derivative_feature(log_fbank)  # num_frames x num_filters x 3

    # Frame slicing and downsampling (currently disabled)
    # concat_mat = concat_frame(log_fbank)
    # log_fbank = subsampling(concat_mat)
    # log_fbank = build_LFR_features(log_fbank, data_config.LFR_m, data_config.LFR_n)

    if debug:
        print('concat & subsample shape=', log_fbank.shape)

    return log_fbank
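# A minimal usage sketch (assuming data_config supplies preemphasis, window_size,
# hop_size, num_mels and apply_cmvn as used above; 'example.wav' is a hypothetical path):
feature_cube = compute_fbank('example.wav', debug=False)
# Expected shape, assuming extract_derivative_feature stacks the static features
# with their first and second derivatives along a third axis:
# (num_frames, data_config.num_mels, 3)
print(feature_cube.shape)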
# Extracting power spectrum
power_spectrum = processing.power_spectrum(frames, fft_points=512)
print('power spectrum shape=', power_spectrum.shape)

############# Extract MFCC features #############
mfcc = feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
                    num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)

# Cepstral mean and variance normalization.
mfcc_cmvn = processing.cmvn(mfcc, variance_normalization=True)
print('mfcc (mean + variance normalized) feature shape=', mfcc_cmvn.shape)

# Extracting derivative features
mfcc_feature_cube = feature.extract_derivative_feature(mfcc)
print('mfcc feature cube shape=', mfcc_feature_cube.shape)

############# Extract log filter-bank energy features #############
logenergy = feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
                         num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
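# The same normalization can be applied to the filter-bank energies; a minimal
# sketch mirroring the MFCC step above (logenergy_cmvn is an illustrative name):
logenergy_cmvn = processing.cmvn(logenergy, variance_normalization=True)
print('lmfe (mean + variance normalized) feature shape=', logenergy_cmvn.shape)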
                         frame_stride=0.01, num_filters=40, fft_length=512,
                         low_frequency=0, high_frequency=None)
mfcc_feat2 = feature.mfcc(signal2, sampling_frequency=fs, frame_length=0.020,
                          frame_stride=0.01, num_filters=40, fft_length=512,
                          low_frequency=0, high_frequency=None)

# Cepstral mean and variance normalization.
mfcc_cmvn = processing.cmvn(mfcc_feat, variance_normalization=True)
mfcc_cmvn2 = processing.cmvn(mfcc_feat2, variance_normalization=True)

# Import the StandardScaler
from sklearn.preprocessing import StandardScaler

# Scale the features and assign the results to new variables
scaler = StandardScaler()
scaled_train_features = scaler.fit_transform(mfcc_feat)
scaled_train_features2 = scaler.fit_transform(mfcc_feat2)

# Get the explained variance ratios from PCA using all features
# pca = PCA()
# pca.fit(scaled_train_features)

n_components = 6
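# A hedged sketch of the PCA step the commented-out code points toward, using the
# n_components defined above (pca and reduced_features are illustrative names):
from sklearn.decomposition import PCA

pca = PCA(n_components=n_components)
reduced_features = pca.fit_transform(scaled_train_features)
print('reduced feature shape:', reduced_features.shape)            # (num_frames, 6)
print('explained variance ratios:', pca.explained_variance_ratio_)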