def get_mel_scale(nfilt=20, samplerate=16000, lowfreq=20, highfreq=8000):
    highfreq = highfreq or samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"

    # compute points evenly spaced in mels
    lowmel = hz2mel(lowfreq)
    highmel = hz2mel(highfreq)
    melpoints = np.linspace(lowmel, highmel, nfilt + 2)

    return melpoints
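# get_mel_scale relies on hz2mel / mel2hz, which are not shown in this snippet.
# A minimal sketch, assuming the common HTK-style mel formulas (an assumption
# about this codebase, not taken from it):
import numpy as np

def hz2mel(hz):
    """Convert a value in Hertz to mels (HTK-style: 2595 * log10(1 + hz / 700))."""
    return 2595 * np.log10(1 + hz / 700.0)

def mel2hz(mel):
    """Convert a value in mels back to Hertz (inverse of hz2mel)."""
    return 700 * (10 ** (mel / 2595.0) - 1)

# Example: mel2hz(get_mel_scale(nfilt=20)) gives 22 band edges that are evenly
# spaced on the mel scale but increasingly wide in Hz.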
def __init__(self, input_dim, sr, num_filter, exp=False, filter_fix=False):
    super(fBPLayer, self).__init__()
    self.input_dim = input_dim
    self.num_filter = num_filter
    self.sr = sr
    self.exp = exp
    self.filter_fix = filter_fix
    requires_grad = not filter_fix

    input_freq = np.linspace(0, self.sr / 2, input_dim)
    self.input_freq = nn.Parameter(torch.from_numpy(input_freq).expand(num_filter, input_dim).float(),
                                   requires_grad=False)

    # band edges evenly spaced on the mel scale, converted back to Hz
    borders = np.linspace(0, hz2mel(sr / 2), num_filter + 2)
    borders = mel2hz(borders)

    self.bandwidth_low = nn.Parameter(torch.from_numpy(borders[:-2]).float().reshape(num_filter, 1),
                                      requires_grad=requires_grad)
    self.bandwidth = nn.Parameter(torch.from_numpy(borders[2:] - borders[:-2]).float().reshape(num_filter, 1),
                                  requires_grad=requires_grad)
def __init__(self, input_dim, sr, num_filter, exp=False, filter_fix=False):
    super(fBLayer, self).__init__()
    self.input_dim = input_dim
    self.num_filter = num_filter
    self.sr = sr
    self.exp = exp
    self.filter_fix = filter_fix
    requires_grad = not filter_fix

    input_freq = np.linspace(0, self.sr / 2, input_dim)
    self.input_freq = nn.Parameter(torch.from_numpy(input_freq).expand(num_filter, input_dim).float(),
                                   requires_grad=False)

    # filter centers evenly spaced on the mel scale, converted back to Hz;
    # left/right bandwidths are the distances to the neighbouring centers
    centers = np.linspace(0, hz2mel(sr / 2), num_filter + 2)
    centers = mel2hz(centers)
    bandwidth = np.diff(centers)

    self.frequency_center = nn.Parameter(torch.from_numpy(centers[1:-1]).float().reshape(num_filter, 1),
                                         requires_grad=requires_grad)
    self.bandwidth_left = nn.Parameter(torch.from_numpy(bandwidth[:-1]).float().reshape(num_filter, 1),
                                       requires_grad=requires_grad)
    self.bandwidth_right = nn.Parameter(torch.from_numpy(bandwidth[1:]).float().reshape(num_filter, 1),
                                        requires_grad=requires_grad)
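# For illustration only: one way the fBLayer parameters could be turned into a
# bank of triangular responses over input_freq. This is a sketch of the idea,
# not the layer's actual forward() (which is not shown here).
import torch

def triangular_weights(input_freq, center, bw_left, bw_right):
    """Rise linearly over [center - bw_left, center], fall over [center, center + bw_right]."""
    rising = (input_freq - (center - bw_left)) / bw_left
    falling = ((center + bw_right) - input_freq) / bw_right
    return torch.clamp(torch.min(rising, falling), min=0.0)

# With the shapes above (center/bandwidths: [num_filter, 1], input_freq:
# [num_filter, input_dim]), broadcasting yields a [num_filter, input_dim] weight
# matrix that can be matrix-multiplied with a power spectrogram.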
def read_wav(wav_path, feature_type='logmelfbank', batch_size=1):
    """Read a wav file & convert it to MFCC or log mel filterbank features.
    Args:
        wav_path: path to a wav file
        feature_type: 'logmelfbank' or 'mfcc'
        batch_size: number of copies of the utterance to stack
    Returns:
        inputs: `[batch_size, max_time, feature_dim]`
        inputs_seq_len: `[batch_size]`, the frame count of each utterance
    """
    # Load wav file
    fs, audio = scipy.io.wavfile.read(wav_path)

    if feature_type == 'mfcc':
        features = mfcc(audio, samplerate=fs)  # `[291, 13]`
    elif feature_type == 'logmelfbank':
        fbank_features, energy = fbank(audio, nfilt=40)
        logfbank = np.log(fbank_features)
        logenergy = np.log(energy)
        logmelfbank = hz2mel(logfbank)
        features = np.c_[logmelfbank, logenergy]  # `[291, 41]`

    delta1 = delta(features, N=2)
    delta2 = delta(delta1, N=2)
    input_data = np.c_[features, delta1, delta2]  # `[291, 39]` or `[291, 123]`

    # Transform to a 3D array: `[1, 291, 39]` or `[1, 291, 123]`
    inputs = np.zeros((batch_size, input_data.shape[0], input_data.shape[1]))
    for i in range(batch_size):
        inputs[i] = input_data
    inputs_seq_len = [inputs.shape[1]] * batch_size  # `[291]`

    # Normalization
    inputs = (inputs - np.mean(inputs)) / np.std(inputs)

    return inputs, inputs_seq_len
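# Hypothetical usage (the wav path is a placeholder, not from the original code):
# with deltas and double-deltas stacked, feature_dim is 3 * 13 = 39 for 'mfcc'
# and 3 * 41 = 123 for 'logmelfbank'.
#
#   inputs, inputs_seq_len = read_wav('/path/to/utterance.wav',
#                                     feature_type='logmelfbank')
#   print(inputs.shape)       # (1, frame_num, 123)
#   print(inputs_seq_len)     # [frame_num]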
def get_filterbanks(nfilt=20, nfft=512, samplerate=16000, lowfreq=0, highfreq=None,
                    filtertype='mel', multi_weight=False):
    """Compute a filterbank. The filters are stored in the rows, the columns correspond to fft bins.
    The filters are returned as an array of size nfilt * (nfft/2 + 1).

    :param nfilt: the number of filters in the filterbank, default 20.
    :param nfft: the FFT size. Default is 512.
    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
    :param lowfreq: lowest band edge of mel filters, default 0 Hz
    :param highfreq: highest band edge of mel filters, default samplerate/2
    :param filtertype: 'mel', 'amel', 'linear', or a 'dnn.*' variant that uses precomputed filter weights.
    :param multi_weight: if True, rescale the filterbank by the TIMIT variable filter weights.
    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing the filterbank. Each row holds 1 filter.
    """
    highfreq = highfreq or samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"

    if filtertype == 'mel':
        # compute points evenly spaced in mels
        lowmel = hz2mel(lowfreq)
        highmel = hz2mel(highfreq)
        melpoints = np.linspace(lowmel, highmel, nfilt + 2)
        # our points are in Hz, but we use fft bins, so we have to convert from Hz to fft bin number
        bin = np.floor((nfft + 1) * mel2hz(melpoints) / samplerate)
    elif filtertype == 'amel':
        # compute points evenly spaced in amels
        lowmel = hz2amel(lowfreq)
        highmel = hz2amel(highfreq)
        melpoints = np.linspace(lowmel, highmel, nfilt + 2)
        # our points are in Hz, but we use fft bins, so we have to convert from Hz to fft bin number
        bin = np.floor((nfft + 1) * amel2hz(melpoints) / samplerate)
    elif filtertype == 'linear':
        linearpoints = np.linspace(lowfreq, highfreq, nfilt + 2)
        # our points are in Hz, but we use fft bins, so we have to convert from Hz to fft bin number
        bin = np.floor((nfft + 1) * linearpoints / samplerate)
    elif filtertype.startswith('dnn'):
        x = np.arange(0, 161) * samplerate / 2 / 160
        if filtertype.endswith('timit.fix'):
            y = np.array(c.TIMIT_FIlTER_FIX)
        elif filtertype.endswith('timit.var'):
            y = np.array(c.TIMIT_FIlTER_VAR)
        elif filtertype.endswith('timit.mdv'):
            y = np.array(c.TIMIT_FIlTER_MDV)
        elif filtertype.endswith('libri.fix'):
            y = np.array(c.LIBRI_FILTER_FIX)
        elif filtertype.endswith('libri.var'):
            y = np.array(c.LIBRI_FILTER_VAR)
        elif filtertype.endswith('vox1.soft'):
            y = np.array(c.VOX_FILTER_SOFT)
        elif filtertype == 'dnn.vox1':
            y = np.array(c.VOX_FILTER)

        f = interpolate.interp1d(x, y)
        x_new = np.arange(nfft // 2 + 1) * samplerate / 2 / (nfft // 2)
        lowfreq_idx = np.where(x_new >= lowfreq)[0]
        highfreq_idx = np.where(x_new <= highfreq)[0]

        ynew = f(x_new)  # interpolate the precomputed weights onto the fft bin grid
        ynew[:int(lowfreq_idx[0])] = 0
        if highfreq_idx[-1] < len(x_new) - 1:
            ynew[int(highfreq_idx[-1] + 1):] = 0
        weight = ynew / np.sum(ynew)

        # place the nfilt + 2 band edges so that each band carries roughly equal total weight
        bin = []
        bin.append(lowfreq_idx[0])
        for j in range(nfilt):
            num_wei = 0.
            for i in range(nfft // 2 + 1):
                num_wei += weight[i]
                if num_wei > (j + 1) / (nfilt + 1):
                    bin.append(i - 1)
                    break
                else:
                    continue
        bin.append(highfreq_idx[-1])

    fbank = np.zeros([nfilt, nfft // 2 + 1])
    for j in range(0, nfilt):
        for i in range(int(bin[j]), int(bin[j + 1])):
            fbank[j, i] = (i - bin[j]) / (bin[j + 1] - bin[j])
        for i in range(int(bin[j + 1]), int(bin[j + 2])):
            fbank[j, i] = (bin[j + 2] - i) / (bin[j + 2] - bin[j + 1])

    if multi_weight:
        y = np.array(c.TIMIT_FIlTER_VAR)
        fbank = fbank * (y / y.max())

    return fbank
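# A hedged usage sketch: apply the default mel filterbank to a power spectrum.
# 'powspec' below is a placeholder name for a [num_frames, nfft/2 + 1] power
# spectrogram (e.g. from python_speech_features.sigproc.powspec); it is not
# defined in the original code.
#
#   fb = get_filterbanks(nfilt=26, nfft=512, samplerate=16000)   # [26, 257]
#   fbank_energies = np.dot(powspec, fb.T)                       # [num_frames, 26]
#   log_fbank = np.log(fbank_energies + 1e-8)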
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from python_speech_features import get_filterbanks, hz2mel

plt.rc('text', usetex=True)
plt.rc('font', family='serif')

nfilt, nfft, samplerate, lowfreq, highfreq = 7, 512, 16000, 0, 8000
fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))
colors = sns.cubehelix_palette(7, start=2, rot=0, dark=0.1, light=.7)

x = np.arange(0, 8001, 1)
y = [hz2mel(i) for i in x]

# mark the reference point 1000 Hz ~= 1000 mel on the hz-to-mel curve
ax1.scatter(1000, 1000, s=30, color='red', alpha=0.9)
ax1.vlines(1000, ymin=0, ymax=1000, alpha=0.8, color='red', linestyle='--', linewidth=1)
ax1.hlines(1000, xmin=0, xmax=1000, alpha=0.8, color='red', linestyle='--', linewidth=1)
csf_feat = csf.logfbank(audio)
assert (np.shape(psf_feat) == np.shape(csf_feat))
error2d(psf_feat, csf_feat)

print ''
print 'ssc'
print '==='
psf_ssc = psf.ssc(audio)
csf_ssc = csf.ssc(audio)
assert (np.shape(psf_ssc) == np.shape(csf_ssc))
error2d(psf_ssc, csf_ssc)

print ''
print 'hz2mel'
print '======'
assert (get_error(psf.hz2mel(8000), csf.hz2mel(8000)) <= acceptable_error)
assert (get_error(psf.hz2mel(16000), csf.hz2mel(16000)) <= acceptable_error)
assert (get_error(csf.mel2hz(csf.hz2mel(8000)), 8000) <= acceptable_error)
print ' ✓'

print ''
print 'mel2hz'
print '======'
assert (get_error(psf.mel2hz(2595), csf.mel2hz(2595)) <= acceptable_error)
assert (get_error(psf.mel2hz(5190), csf.mel2hz(5190)) <= acceptable_error)
assert (get_error(csf.hz2mel(csf.mel2hz(2595)), 2595) <= acceptable_error)
print ' ✓'

print ''
print 'get_filterbanks'
print '==============='
def getmelpoint(_n_filt=N_FILT):
    lowmel = hz2mel(0)
    highmel = hz2mel(SAMPLING_RATE / 2)
    melpoints = np.linspace(lowmel, highmel, _n_filt + 1)
    return mel2hz(melpoints)[1:_n_filt + 1]
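# A quick sanity check, assuming module-level constants such as
# SAMPLING_RATE = 16000 and N_FILT = 26 (placeholder values, not from the
# original code): getmelpoint() returns N_FILT frequencies in Hz, evenly
# spaced on the mel scale between 0 and SAMPLING_RATE / 2, with the 0 Hz
# point dropped.
#
#   points = getmelpoint()
#   assert len(points) == N_FILT
#   assert points[-1] <= SAMPLING_RATE / 2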
self.input_dim = input_dim
self.num_filter = num_filter
self.sr = sr
self.exp = exp
self.filter_fix = filter_fix
requires_grad = not filter_fix

input_freq = np.linspace(0, self.sr / 2, input_dim)
self.input_freq = nn.Parameter(torch.from_numpy(input_freq).expand(num_filter, input_dim).float(),
                               requires_grad=False)

centers = np.linspace(0, hz2mel(sr / 2), num_filter + 2)
centers = mel2hz(centers)
self.frequency_center = nn.Parameter(torch.from_numpy(centers[1:-1]).float().reshape(num_filter, 1),
                                     requires_grad=requires_grad)

bandwidth = []
for i in range(2, len(centers)):
    bandwidth.append(centers[i] - centers[i - 1])
self.bandwidth = nn.Parameter(torch.tensor(bandwidth).reshape(num_filter, 1).float())
self.gain = nn.Parameter(torch.ones(num_filter, dtype=torch.float32).reshape(num_filter, 1))