import argparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytest

# shennong imports needed by the snippets below, assuming the library's
# current module layout (older releases exposed the processors and
# post-processors under shennong.features.* instead)
from shennong.audio import Audio
from shennong.postprocessor.delta import DeltaPostProcessor
from shennong.processor.bottleneck import BottleneckProcessor
from shennong.processor.energy import EnergyProcessor
from shennong.processor.filterbank import FilterbankProcessor
from shennong.processor.mfcc import MfccProcessor
from shennong.processor.plp import PlpProcessor
from shennong.processor.rastaplp import RastaPlpProcessor
from shennong.processor.spectrogram import SpectrogramProcessor


def test_htk_compat(audio):
    # with htk_compat=True the energy (or C0) coefficient moves from the
    # first to the last column
    p1 = PlpProcessor(use_energy=True, htk_compat=False, dither=0).process(audio)
    p2 = PlpProcessor(use_energy=True, htk_compat=True, dither=0).process(audio)
    assert p1.data[:, 0] == pytest.approx(p2.data[:, -1])

    p1 = PlpProcessor(use_energy=False, htk_compat=False, dither=0).process(audio)
    p2 = PlpProcessor(use_energy=False, htk_compat=True, dither=0).process(audio)
    assert p1.data[:, 0] == pytest.approx(p2.data[:, -1])
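# A plausible definition of the 'audio' fixture used by these tests (a
# sketch: the fixture body and the wav path are assumptions, the real
# definition lives in the test suite's conftest.py):
@pytest.fixture
def audio():
    # a short mono 16 kHz signal, matching the shapes asserted in the tests
    return Audio.load('test/data/test.wav')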
def get_plp_dd(wav_fn, norm):
    """Return the PLPs with deltas and delta-deltas for an audio file."""
    audio = Audio.load(wav_fn)
    processor = PlpProcessor(
        sample_rate=audio.sample_rate, window_type="hamming",
        frame_length=0.025, frame_shift=0.01,
        low_freq=0, vtln_low=60, vtln_high=7200,
        high_freq=audio.sample_rate / 2)
    plp_static = processor.process(audio, vtln_warp=1.0)

    # append first and second order derivatives to the static coefficients
    d_processor = DeltaPostProcessor(order=2)
    plp_deltas = d_processor.process(plp_static)

    features = np.float64(plp_deltas._to_dict()["data"])
    if norm == "cmvn":
        # per-file cepstral mean and variance normalization
        features = (features - np.mean(features, axis=0)) / np.std(features, axis=0)
    return features
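# Example use of get_plp_dd (a sketch, the wav path is hypothetical); with
# the default num_ceps=13, order-2 deltas yield 39 coefficients per frame:
#
#     feats = get_plp_dd('utterance.wav', norm='cmvn')
#     assert feats.shape[1] == 39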
def test_output(audio):
    assert PlpProcessor(frame_shift=0.01).process(audio).shape == (140, 13)
    assert PlpProcessor(frame_shift=0.02).process(audio).shape == (70, 13)
    assert PlpProcessor(
        frame_shift=0.02, frame_length=0.05).process(audio).shape == (69, 13)

    # sample rate mismatch
    with pytest.raises(ValueError):
        PlpProcessor(sample_rate=8000).process(audio)

    # only mono signals are accepted
    with pytest.raises(ValueError):
        data = np.random.random((1000, 2))
        stereo = Audio(data, sample_rate=16000)
        PlpProcessor(sample_rate=stereo.sample_rate).process(stereo)
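# The frame counts asserted above follow the usual framing rule (a sketch,
# assuming Kaldi-style snip_edges framing and a fixture signal of 22720
# samples at 16 kHz -- both values are assumptions):
def expected_nframes(nsamples, sample_rate, frame_shift, frame_length):
    # number of complete frames fitting in the signal
    shift = int(frame_shift * sample_rate)
    length = int(frame_length * sample_rate)
    return (nsamples - length) // shift + 1

# expected_nframes(22720, 16000, 0.01, 0.025) == 140
# expected_nframes(22720, 16000, 0.02, 0.025) == 70
# expected_nframes(22720, 16000, 0.02, 0.05) == 69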
def test_params():
    assert len(PlpProcessor().get_params()) == 24

    params = {
        'num_bins': 0,
        'use_energy': True,
        'energy_floor': 10.0,
        'raw_energy': False,
        'htk_compat': True}
    p = PlpProcessor(**params)

    out_params = p.get_params()
    assert len(out_params) == 24
    assert PlpProcessor().set_params(**params).get_params() == out_params
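# get_params/set_params follow the scikit-learn convention, so a parameter
# dict round-trips through the processor (a short illustration):
#
#     proc = PlpProcessor()
#     proc.set_params(num_ceps=10)
#     assert proc.get_params()['num_ceps'] == 10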
def test_raw(audio, raw_energy):
    p = {'raw_energy': raw_energy, 'dither': 0}

    # with use_energy=True (the default) the first column of MFCC and PLP
    # features holds the frame energy
    mfcc = MfccProcessor(**p).process(audio).data[:, 0]
    plp = PlpProcessor(**p).process(audio).data[:, 0]
    energy = EnergyProcessor(**p).process(audio).data[:, 0]

    assert np.allclose(mfcc, energy)
    assert np.allclose(plp, energy)
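# A plausible parametrization of the 'raw_energy' fixture used above (a
# sketch, the actual conftest may differ):
@pytest.fixture(params=[True, False])
def raw_energy(request):
    return request.param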
def test_num_ceps(audio, num_ceps):
    if num_ceps >= 23:
        with pytest.raises(ValueError) as err:
            PlpProcessor(num_ceps=num_ceps)
        assert 'We must have num_ceps <= lpc_order+1' in str(err)
    else:
        proc = PlpProcessor(num_ceps=num_ceps)
        if 0 < proc.num_ceps:
            feat = proc.process(audio)
            assert proc.num_ceps == num_ceps == proc.ndims
            assert feat.shape == (140, num_ceps)

            proc.use_energy = False
            feat = proc.process(audio)
            assert feat.shape == (140, num_ceps)
        else:
            with pytest.raises(RuntimeError):
                proc.process(audio)
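# A plausible parametrization of the 'num_ceps' fixture, spanning the
# invalid (>= 23), empty (<= 0) and nominal cases exercised above (a
# sketch, the actual values are assumptions):
@pytest.fixture(params=[-1, 0, 1, 5, 13, 23, 30])
def num_ceps(request):
    return request.param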
def get_features(sound_file, chosen_processor):
    """Compute the feature coefficients of a sound file.

    :param sound_file: a sound file in .wav format
    :param chosen_processor: the features to compute, can be 'filterbank',
        'plp', 'rastaplp' or 'bottleneck'
    :returns: a pandas DataFrame with the feature coefficients computed per
        frame of 25 ms every 10 ms

    """
    audio = Audio.load(sound_file)
    processors = {
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')}

    # select the requested processor and compute the features
    features = processors[chosen_processor].process(audio)
    return pd.DataFrame(features.data)
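# Example use (a sketch, the wav path is hypothetical):
#
#     df = get_features('utterance.wav', 'plp')
#     print(df.shape)  # one row per frame, one column per coefficient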
def main():
    # parse the input wav file from the command line
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('wav', help='wav file to compute features on')

    # load the wav file
    wav_file = parser.parse_args().wav
    audio = Audio.load(wav_file)

    # initialize the features processors
    processors = {
        'spectrogram': SpectrogramProcessor(sample_rate=audio.sample_rate),
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'mfcc': MfccProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')}

    # compute the features for all processors
    features = {k: v.process(audio) for k, v in processors.items()}

    # plot the audio signal and the resulting features
    fig, axes = plt.subplots(
        nrows=len(processors) + 1,
        gridspec_kw={'top': 0.95, 'bottom': 0.05, 'hspace': 0},
        subplot_kw={'xticks': [], 'yticks': []})

    time = np.arange(0.0, audio.nsamples) / audio.sample_rate
    axes[0].plot(time, audio.astype(np.float32).data)
    axes[0].set_xlim(0.0, audio.duration)
    axes[0].text(
        0.02, 0.8, 'audio',
        bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
        transform=axes[0].transAxes)

    for n, (k, v) in enumerate(features.items(), start=1):
        axes[n].imshow(v.data.T, aspect='auto')
        axes[n].text(
            0.02, 0.8, k,
            bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
            transform=axes[n].transAxes)

    plt.show()


if __name__ == '__main__':
    main()
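# From the command line (the script name is hypothetical):
#
#     python plot_features.py utterance.wav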