def centerlized_mfcc(lowFreq=0, middleFreq=4800, highFreq=8000):
    """Build a 26-row filterbank matrix from two separately generated banks.

    Rows 0-14 come from a 15-filter bank spanning 0..middleFreq Hz: the first
    150 bins of every filter are reversed and the row is zero-padded on the
    right back to its original length.  Rows 15-25 come from an 11-filter
    bank spanning 0..(highFreq - middleFreq) Hz: the first 111 bins of every
    filter are kept and shifted up by 146 zero bins.

    Note that ``lowFreq`` is accepted but never read; both banks are built
    with ``lowfreq=0``.

    Returns the stacked filters as a 2-D array with 257 columns.
    """
    # TODO: derive the bin counts (150 / 146 / 111 / 257) from the arguments
    # instead of hard-coding them.
    lower_bank = get_filterbanks(nfilt=15,
                                 nfft=512,
                                 samplerate=16000,
                                 lowfreq=0,
                                 highfreq=middleFreq)
    upper_bank = get_filterbanks(nfilt=11,
                                 nfft=512,
                                 samplerate=16000,
                                 lowfreq=0,
                                 highfreq=(highFreq - middleFreq))

    # A dummy all-zero row fixes the row width; it is sliced away on return.
    rows = [numpy.zeros(257)]

    for response in lower_bank:
        mirrored = response[0:150][::-1]
        right_pad = numpy.zeros(len(response) - 150)
        rows.append(numpy.concatenate((mirrored, right_pad), axis=0))

    for response in upper_bank:
        left_pad = numpy.zeros(146)
        rows.append(numpy.concatenate((left_pad, response[0:111]), axis=0))

    # Drop the placeholder row and keep the 26 real filters.
    return numpy.vstack(rows)[1:27]
Exemple #2
0
def mel_filterbank_callback(testing, map, iteration, context):
    """Check a mel filterbank map against python_speech_features.

    The reference result is the dot product of a ramp signal (0..size-1,
    truncated to the filterbank width) with the transposed filterbank.  Both
    the interpreted map and its compiled form are compared to that reference
    and the outcome is reported through ``testing.ProcessTest``.

    Args:
        testing: object exposing ``ProcessTest(name, passed)``.
        map: object exposing ``Compute(input)`` and ``Compile(...)``.
        iteration: value interpolated into the reported test names.
        context: ``(size, num_filters, sample_rate)`` tuple.

    Returns:
        The output of the compiled map, or ``None`` when
        python_speech_features is not installed and the test is skipped.
    """
    size, num_filters, sample_rate = context
    try:
        from python_speech_features import get_filterbanks
    except ImportError:
        # Only a missing optional dependency should skip the test; any other
        # failure is a real error and must propagate.
        print("### skiping test_mel_filterbank because 'python_speech_features' module is not available")
        return None

    fbanks = get_filterbanks(num_filters, size, sample_rate)
    # np.float was a deprecated alias of the builtin float and was removed in
    # NumPy 1.24; float64 is the dtype it resolved to.
    samples = np.arange(size, dtype=np.float64)

    # The filterbank only covers the first fbanks.shape[1] bins.
    chopped = samples[0:fbanks.shape[1]]
    expected = np.dot(chopped, fbanks.T)

    output = map.Compute(samples)
    testing.ProcessTest("test_mel_filterbank compute iteration {}".format(iteration), np.allclose(output, expected))

    compiler_settings = ell.model.MapCompilerOptions()
    compiler_settings.useBlas = False  # not resolvable on our Linux test machines...
    optimizer_options = ell.model.ModelOptimizerOptions()
    compiled_map = map.Compile("host", "hammingtest", "predict", compiler_settings, optimizer_options)

    compiled_output = compiled_map.Compute(samples)
    testing.ProcessTest("test_mel_filterbank compiled iteration {}".format(iteration),
                        np.allclose(compiled_output, expected))
    return compiled_output
Exemple #3
0
def mel_bankm(fs, nfft, mel_num, fmin=0.0, fmax=None):
    """Return the filterbank produced by ``get_filterbanks``.

    Args:
        fs: sample rate, forwarded as ``samplerate``.
        nfft: FFT size, forwarded as ``nfft``.
        mel_num: number of filters, forwarded as ``nfilt``.
        fmin: lower band edge, forwarded as ``lowfreq``.
        fmax: upper band edge, forwarded as ``highfreq``.
    """
    # An earlier variant built the bank with filters.mel(..., norm=None).
    return get_filterbanks(nfilt=mel_num,
                           nfft=nfft,
                           samplerate=fs,
                           lowfreq=fmin,
                           highfreq=fmax)
Exemple #4
0
def filterbanks(sample_rate, nfilt, nfft, fft_bins_2_freq):
    """Plot every filter of a filterbank against the given frequency axis.

    Args:
        sample_rate: sample rate passed to ``get_filterbanks``.
        nfilt: number of filters to generate.
        nfft: FFT size passed to ``get_filterbanks``.
        fft_bins_2_freq: x-axis values, one per filterbank column.
    """
    bank = get_filterbanks(nfilt, nfft, sample_rate)

    # One curve per filter, all drawn on the same axes.
    for response in bank:
        plt.plot(fft_bins_2_freq, response)

    plt.xlabel('Taajuus (Hz)')
    plt.ylabel('Skaalausarvo')
    plt.show()
def fbank_from_complex_spec(complex_spec,
                            nfilt=64,
                            nfft=512,
                            sample_rate=16000):
    """Compute filterbank energies from a complex spectrum.

    Args:
        complex_spec: complex spectrum whose last axis has as many entries as
            the filterbank has columns.
        nfilt: number of filters in the bank.
        nfft: FFT size used to build the bank and to scale the power.
        sample_rate: sample rate used to build the bank.

    Returns:
        float32 array of filterbank energies; exact zeros are replaced by the
        float epsilon so that a later logarithm stays finite.
    """
    import python_speech_features
    # Power is the squared magnitude |X|^2.  Squaring the complex value and
    # taking the real part would give re^2 - im^2, which can be negative.
    # The float literal keeps the scale factor from truncating to 0 under
    # integer division.
    power = 1.0 / nfft * np.square(np.abs(complex_spec))
    fb = python_speech_features.get_filterbanks(nfilt, nfft, sample_rate)
    feat = np.dot(power, fb.T)
    feat = np.where(feat == 0, np.finfo(float).eps, feat)
    return feat.astype('float32')
Exemple #6
0
def mfcc(frames,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
    """Compute cepstral features from frames that are already windowed.

    The power spectrum of ``frames`` is projected onto a filterbank, logged,
    transformed with an orthonormal type-2 DCT, truncated to ``numcep``
    coefficients and liftered with ``ceplifter``.  When ``appendEnergy`` is
    true, coefficient 0 of every frame is replaced by the log of the frame's
    total energy.

    ``winlen``, ``winstep`` and ``preemph`` are accepted but not used here.
    """
    tiny = numpy.finfo(float).eps  # stand-in for exact zeros before log()

    power_spectrum = sigproc.powspec(frames, nfft)

    # Total energy of each frame.
    frame_energy = numpy.sum(power_spectrum, 1)
    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)

    # Energy collected by each filter of the bank.
    bank = python_speech_features.get_filterbanks(nfilt, nfft, samplerate,
                                                  lowfreq, highfreq)
    band_energy = numpy.dot(power_spectrum, bank.T)
    band_energy = numpy.where(band_energy == 0, tiny, band_energy)

    cepstra = dct(numpy.log(band_energy), type=2, axis=1,
                  norm='ortho')[:, :numcep]
    cepstra = python_speech_features.lifter(cepstra, ceplifter)

    if appendEnergy:
        cepstra[:, 0] = numpy.log(frame_energy)
    return cepstra
Exemple #7
0
    def _transform(self, spectrogram):
        '''Transform STFT features into log mel-frequency filterbank features.

        The input is laid out with time on axis 0 and frequency on axis 1;
        any further axes are carried through unchanged.  The result has the
        same layout with the frequency axis replaced by filterbank channels.
        When ``self.diff_features`` is set, first- and second-order
        differences along time (zero-padded at the start) are appended along
        axis 1.
        '''
        # handle multiple spectrograms at once
        spec_dim = len(spectrogram.shape)
        # Move time and frequency to the end of dim list,
        # if there are other dimensions
        if spec_dim > 2:
            to_end = [*range(2, spec_dim), 0, 1]
            spectrogram = np.transpose(spectrogram, to_end)
        # freely adapted from python_speech_features logfbank
        fb = psf.get_filterbanks(self.num_filters, self.num_fft,
                                 self.sample_rate, 0, self.sample_rate / 2)
        mag_spec = np.absolute(spectrogram)
        pow_spec = 1.0 / self.num_fft * np.square(mag_spec)
        energies = np.dot(pow_spec, fb.T)  # compute the filterbank energies
        # exact zeros would turn into -inf under the log
        energies = np.where(energies == 0, np.finfo(float).eps, energies)
        log_energies = np.log(energies)
        # Move time and frequency back to the start of the dim list
        if spec_dim > 2:
            to_front = [spec_dim - 2, spec_dim - 1, *range(spec_dim - 2)]
            log_energies = np.transpose(log_energies, to_front)

        # get diff features
        if self.diff_features:
            num_frames = log_energies.shape[0]
            pieces = [log_energies]
            for order in (1, 2):
                delta = np.diff(log_energies, order, axis=0)
                # Zero-pad the beginning back up to num_frames rows.  Using
                # the actual shortfall (rather than assuming `order` rows)
                # keeps the shapes aligned when there are fewer frames than
                # the difference order needs.
                pad = np.zeros(
                    (num_frames - delta.shape[0], *delta.shape[1:]),
                    dtype=log_energies.dtype)
                pieces.append(np.concatenate((pad, delta), axis=0))
            # Concatenate in frequency
            # This is a weird thing to do, not sure if would be better to
            # have on its own dimension
            log_energies = np.concatenate(pieces, axis=1)
        return log_energies
Exemple #8
0
from audio import get_features
from audio import pitch_conversion
from audio import mfe2sp
from constants import MODEL
from constants import SAMPLE_RATE
from constants import EMB_FRAMES
from constants import NUM_FBANKS
from constants import embedder_model
from constants import PREEMPH
from constants import FRAME_PERIOD
from constants import NFFT
import embedder.embedding_model as embedding_model
import embedder.embedder_utils as ut
from audio import quantize

# Filterbank shared by the whole module.
fb = get_filterbanks(NUM_FBANKS, NFFT, SAMPLE_RATE, lowfreq=0, highfreq=None)
# Column index at which each filter (row of fb) reaches its maximum.
filter_centers = fb.argmax(axis=-1)
# Squared magnitude of the NFFT-point FFT of the two-tap sequence
# [1, -PREEMPH], kept for the first NFFT // 2 + 1 bins.
preemph_transform = np.abs(
    fft([1., -PREEMPH] + [0] * (NFFT - 2)))[:NFFT // 2 + 1]**2


def convert_voice(model, wav_s, wav_t, emb_s, emb_t):
    """Arguments:
    cvae - ACVAE model
    embedder - DeepSpeakerModel
    wav_s - source voice
    wav_t - target voice
    Returns: 
    wav file with words from source voice, voice from target voice
    """
    pic_dir = "../figure/"
Exemple #9
0
# Power spectrum reference: only the first frame is recorded.
powspec = psf.sigproc.powspec(frames, 2048)
results['powspec'] = powspec[0].tolist()


# Filterbank reference.
def hz2mel(hz):
    """Convert a frequency in Hz to the mel scale."""
    return 2595 * np.log10(1 + hz / 700.)


def mel2hz(mel):
    """Convert a mel value back to a frequency in Hz."""
    return 700 * (10**(mel / 2595.0) - 1)


highMel = hz2mel(sampleRate / 2)
# 26 filters need 26 + 2 equally spaced mel points (both band edges included).
melpoints = np.linspace(0, highMel, 26 + 2)
results['bins'] = np.floor(
    (2048 + 1) * mel2hz(melpoints) / sampleRate).tolist()

filterbank = psf.get_filterbanks(nfilt=26,
                                 nfft=2048,
                                 samplerate=sampleRate,
                                 lowfreq=0,
                                 highfreq=None)
results['filters'] = filterbank.tolist()

feat, energy = psf.fbank(pcm, sampleRate, mfccWinlen, mfccStepT, 26, 2048, 0,
                         None, 0.97, winfunc)
results['feat'] = feat.tolist()
results['energy'] = energy.tolist()

# DCT reference: orthonormal type-2 DCT of the log filterbank features,
# truncated to the first mfccNceps coefficients.
log_feat = np.log(feat)
results['dct'] = dct(log_feat, type=2, axis=1,
                     norm='ortho')[:, :mfccNceps].tolist()

# test mfcc
ceps = psf.mfcc(pcm,
                samplerate=sampleRate,
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from python_speech_features import get_filterbanks, hz2mel
# Render all figure text through LaTeX with a serif font.
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Parameters of the filterbank being illustrated.
nfilt, nfft, samplerate, lowfreq, highfreq = 7, 512, 16000, 0, 8000
fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))
colors = sns.cubehelix_palette(7, start=2, rot=0, dark=0.1, light=.7)

# Hz -> mel curve sampled at every integer frequency from 0 to 8000 Hz.
x = np.arange(0, 8001, 1)
y = [hz2mel(i) for i in x]

# Mark the point (1000, 1000) and drop dashed guide lines to both axes.
guide_style = dict(alpha=0.8, color='red', linestyle='--', linewidth=1)
ax1.scatter(1000, 1000, s=30, color='red', alpha=0.9)
ax1.vlines(1000, ymin=0, ymax=1000, **guide_style)
ax1.hlines(1000, xmin=0, xmax=1000, **guide_style)
Exemple #11
0
# NOTE: every print below uses the parenthesised single-argument form, which
# behaves the same under Python 2 and Python 3.
assert (get_error(psf.hz2mel(16000), csf.hz2mel(16000)) <= acceptable_error)
# Round trip: Hz -> mel -> Hz must give back the input.
assert (get_error(csf.mel2hz(csf.hz2mel(8000)), 8000) <= acceptable_error)
print(' ✓')

print('')
print('mel2hz')
print('======')
assert (get_error(psf.mel2hz(2595), csf.mel2hz(2595)) <= acceptable_error)
# Compare the reference (psf) against the implementation under test (csf);
# comparing csf with itself could never fail.
assert (get_error(psf.mel2hz(5190), csf.mel2hz(5190)) <= acceptable_error)
# Round trip: mel -> Hz -> mel must give back the input.
assert (get_error(csf.hz2mel(csf.mel2hz(2595)), 2595) <= acceptable_error)
print(' ✓')

print('')
print('get_filterbanks')
print('===============')
psf_filterbanks = psf.get_filterbanks()
csf_filterbanks = csf.get_filterbanks()
assert (np.shape(psf_filterbanks) == np.shape(csf_filterbanks))
error2d(psf_filterbanks, csf_filterbanks)

print('')
print('lifter')
print('======')
psf_lifter = psf.lifter(psf_feat)
# csf is fed a float32 copy of the same features.
csf_lifter = csf.lifter(np.array(psf_feat, dtype=np.float32))
assert (np.shape(psf_lifter) == np.shape(csf_lifter))
error2d(psf_lifter, csf_lifter)

print('')
print('delta')
print('=====')
Exemple #12
0
import argparse
import librosa
import numpy as np
from tqdm import tqdm
from os.path import join, isfile
from joblib import Parallel, delayed
from python_speech_features import get_filterbanks, sigproc

# Analysis settings shared by every job.
samplerate = 16000
nfft = 512
# Window length and hop expressed in samples (0.025 s and 0.01 s).
winlen = samplerate * 0.025
winstep = samplerate * 0.01
# 40-filter bank, transposed so a power spectrum can be right-multiplied
# by it.
banks = get_filterbanks(40, nfft, samplerate).T


def job(input_name, output_name):
    """Extract log filterbank features from one audio file and save them.

    The audio is loaded as mono at ``samplerate``, pre-emphasised with
    coefficient 0.97, framed with a Hanning window, converted to a power
    spectrum and projected onto ``banks``.  The log of the result is stored
    as float32 with ``np.save``.

    Returns:
        True when the features were written; False when the audio or the
        framed signal is empty, or when the features sum to NaN.
    """
    waveform, _ = librosa.load(input_name, mono=True, sr=samplerate)
    if len(waveform) == 0:
        return False

    emphasized = sigproc.preemphasis(waveform, 0.97)
    frames = sigproc.framesig(emphasized, winlen, winstep, np.hanning)
    if len(frames) == 0:
        return False

    band_energy = np.dot(sigproc.powspec(frames, nfft), banks)
    # Replace exact zeros so the logarithm stays finite.
    band_energy = np.where(band_energy == 0, np.finfo(float).eps, band_energy)
    log_fbank = np.log(band_energy).astype(dtype=np.float32)

    # A NaN sum means the features are unusable; nothing is written.
    if np.isnan(np.sum(log_fbank)):
        return False

    np.save(output_name, log_fbank)
    return True