Python cmvn Examples, speechpy.processing.cmvn Python Examples

Example #1

0

Show file

 def test_cmvn(self):
     """Check that ``processing.cmvn`` keeps the shape and standardizes every column."""
     raw = np.random.rand(50, 100)
     normalized = processing.cmvn(raw, variance_normalization=True)

     # The output must have exactly the shape of the input.
     assert normalized.shape == raw.shape

     # With variance normalization on, each column is expected to have
     # mean 0 and standard deviation 1 (up to floating-point tolerance).
     num_columns = normalized.shape[1]
     column_means = np.mean(normalized, axis=0)
     column_stds = np.std(normalized, axis=0)
     assert np.allclose(column_means, np.zeros((1, num_columns)))
     assert np.allclose(column_stds, np.ones((1, num_columns)))

Example #2

0

Show file

File: audio_processing.py Project: Wenhao-Yang/SpeakerVerifiaction-pytorch

def compute_fbank_feat(filename, nfilt=c.FILTER_BANK, use_logscale=c.USE_LOGSCALE, use_energy=True, add_energy=True, normalize=c.CMVN, vad=c.VAD):
    """
    Compute filter-bank features for a wav file, making feats more like in kaldi.

    :param filename: path of the wav file to read.
    :param nfilt: number of filters passed to ``mfe``.
    :param use_logscale: if true, take the natural log of the features
        (values are floored at 1e-5 first).
    :param use_energy: if true, put the frame energies returned by ``mfe``
        into the feature matrix.
    :param add_energy: only used with ``use_energy``. When true the energies
        are prepended as an extra first column; otherwise they overwrite the
        existing first column.
    :param normalize: ``'cmvn'`` applies ``cmvn``, ``'cmvnw'`` applies
        ``cmvnw`` with ``win_size=301``; any other value returns the
        features un-normalized.
    :param vad: only used with ``use_energy``. When true, only the frames
        that ``ComputeVadEnergy`` flags with 1 are kept.
    :return: the feature matrix, or the tuple ``(features, voiced)`` when
        both ``use_energy`` and ``vad`` are true.
    :raises ValueError: if ``filename`` does not exist.
    """

    if not os.path.exists(filename):
        raise ValueError('Wav file does not exist.')

    sample_rate, audio = wavfile.read(filename)

    # Pad the tail so that (length - 25 ms window) is a whole number of
    # 10 ms strides, matching the frame_length/frame_stride given to mfe.
    pad_size = np.ceil((len(audio) - 0.025 * sample_rate) / (0.01 * sample_rate)) * 0.01 * sample_rate - len(audio) + 0.025 * sample_rate

    # np.pad is the public name of the former np.lib.pad alias.
    audio = np.pad(audio, (0, int(pad_size)), 'symmetric')

    filter_banks, energies = mfe(audio, sample_rate, frame_length=0.025, frame_stride=0.01, num_filters=nfilt, fft_length=512, low_frequency=0, high_frequency=None)

    if use_energy:
        # Turn the energies into a single column so they line up with the
        # rows of the filter-bank matrix.
        energies = energies.reshape(energies.shape[0], 1)
        if add_energy:
            # Add an extra dimension to features
            filter_banks = np.concatenate((energies, filter_banks), axis=1)
        else:
            # replace the 1st dim as energy
            filter_banks[:, 0] = energies[:, 0]

    if use_logscale:
        # Floor before the log to avoid -inf on empty bins.
        filter_banks = np.log(np.maximum(filter_banks, 1e-5))

    if normalize == 'cmvn':
        # vec(array): input_feature_matrix (size:(num_observation, num_features))
        norm_fbank = cmvn(vec=filter_banks, variance_normalization=True)
    elif normalize == 'cmvnw':
        norm_fbank = cmvnw(vec=filter_banks, win_size=301, variance_normalization=True)
    else:
        # No recognised normalization requested: fall back to the raw
        # features instead of leaving norm_fbank unbound.
        norm_fbank = filter_banks

    if use_energy and vad:
        voiced = []
        # ComputeVadEnergy fills `voiced` in place.
        ComputeVadEnergy(filter_banks, voiced)
        voiced = np.array(voiced)
        # atleast_1d keeps the frame axis when exactly one frame is voiced
        # (squeeze() alone would collapse the index to a 0-d scalar).
        voiced_index = np.atleast_1d(np.argwhere(voiced == 1).squeeze())
        norm_fbank = norm_fbank[voiced_index]

        return norm_fbank, voiced

    return norm_fbank

Example #3

0

Show file

File: datafeeder.py Project: zqs01/Speech-Transformer-tf2.0

def compute_fbank(file, debug=True):
    """Read a wav file and return its log filter-bank features with derivatives.

    The signal is pre-emphasized, log filter-bank energies are extracted
    with ``feature.lmfe``, optionally normalized with ``processing.cmvn``
    (when ``data_config.apply_cmvn`` is set), and finally passed through
    ``feature.extract_derivative_feature``.

    :param file: wav file handed to ``wav.read``.
    :param debug: when true, print the shapes of the intermediate arrays.
    :return: the output of ``feature.extract_derivative_feature``.
    """
    sr, signal = wav.read(file)
    if debug:
        print('signal shape: ', signal.shape)

    emphasized = processing.preemphasis(signal, cof=data_config.preemphasis)

    # Framing settings shared by the frame stacking and the feature extraction.
    framing = dict(
        sampling_frequency=sr,
        frame_length=data_config.window_size,
        frame_stride=data_config.hop_size,
    )

    # The stacked frames and their power spectrum only feed the debug
    # print below; the features themselves come from feature.lmfe.
    stacked = processing.stack_frames(emphasized, zero_padding=True, **framing)
    spectrum = processing.power_spectrum(stacked, fft_points=512)
    if debug:
        print('power spectrum shape=', spectrum.shape)

    # Log filter-bank energies (presumably num_frames x num_filters).
    fbank = feature.lmfe(
        emphasized,
        num_filters=data_config.num_mels,
        fft_length=512,
        low_frequency=0,
        high_frequency=None,
        **framing
    )

    if data_config.apply_cmvn:
        # Mean and variance normalization of the features.
        fbank = processing.cmvn(fbank, variance_normalization=True)
        if debug:
            print('fbank(mean + variance normalized) feature shape=',
                  fbank.shape)

    # Append derivative features.
    fbank = feature.extract_derivative_feature(fbank)

    # NOTE: frame concatenation / subsampling is not applied here, although
    # the debug label below still mentions it.
    if debug:
        print('concat & subsample shape=', fbank.shape)

    return fbank

Example #4

0

Show file

# Compute the power spectrum of the stacked frames and report its shape.
# `frames` is produced by code outside this excerpt.
power_spectrum = processing.power_spectrum(frames, fft_points=512)
print('power spectrum shape=', power_spectrum.shape)

############# Extract MFCC features #############
# 20 ms frames with a 10 ms stride, 40 filters and a 512-point FFT.
# `signal` and `fs` are defined by code outside this excerpt.
mfcc = feature.mfcc(signal,
                    sampling_frequency=fs,
                    frame_length=0.020,
                    frame_stride=0.01,
                    num_filters=40,
                    fft_length=512,
                    low_frequency=0,
                    high_frequency=None)

# Cepstral mean variance normalization.
mfcc_cmvn = processing.cmvn(mfcc, variance_normalization=True)
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)

# Extracting derivative features.
# NOTE(review): this uses the un-normalized `mfcc`, not `mfcc_cmvn` --
# confirm that this is intended.
mfcc_feature_cube = feature.extract_derivative_feature(mfcc)
print('mfcc feature cube shape=', mfcc_feature_cube.shape)

############# Extract logenergy features #############
# Same framing and filter-bank settings as the MFCC extraction above.
logenergy = feature.lmfe(signal,
                         sampling_frequency=fs,
                         frame_length=0.020,
                         frame_stride=0.01,
                         num_filters=40,
                         fft_length=512,
                         low_frequency=0,
                         high_frequency=None)

Example #5

0

Show file

File: voice_classifier.py Project: TranTony/voicerecognition

                         frame_stride=0.01,
                         num_filters=40,
                         fft_length=512,
                         low_frequency=0,
                         high_frequency=None)
# MFCCs of the second signal: 20 ms frames, 10 ms stride, 40 filters,
# 512-point FFT. `signal2` and `fs` are defined outside this excerpt.
mfcc_feat2 = feature.mfcc(signal2,
                          sampling_frequency=fs,
                          frame_length=0.020,
                          frame_stride=0.01,
                          num_filters=40,
                          fft_length=512,
                          low_frequency=0,
                          high_frequency=None)

# Cepstral mean variance normalization.
# NOTE(review): mfcc_cmvn / mfcc_cmvn2 are not used in the lines visible
# below; the scaler is fed the raw MFCCs instead -- confirm intent.
mfcc_cmvn = processing.cmvn(mfcc_feat, variance_normalization=True)

mfcc_cmvn2 = processing.cmvn(mfcc_feat2, variance_normalization=True)

# Import the StandardScaler
from sklearn.preprocessing import StandardScaler

# Scale the features and set the values to a new variable.
# NOTE(review): fit_transform is called on each matrix in turn, so the
# scaler is re-fit on mfcc_feat2 and each set is standardized with its own
# statistics -- confirm this is intended rather than fit once / transform.
scaler = StandardScaler()
scaled_train_features = scaler.fit_transform(mfcc_feat)
scaled_train_features2 = scaler.fit_transform(mfcc_feat2)

# Get our explained variance ratios from PCA using all features
#pca = PCA()
#pca.fit(scaled_train_features)
# Presumably the number of PCA components to keep; its use is not visible here.
n_components = 6