def test_delta_delta(self):
  ''' test add delta deltas '''
  #pylint: disable=invalid-name
  p = tffeat.speech_params(
      sr=self.sr_true,
      bins=40,
      cmvn=False,
      audio_desired_samples=1000,
      add_delta_deltas=False)

  with self.session():
    wavfile = tf.constant(self.wavpath)
    audio, sample_rate = tffeat.read_wav(wavfile, self.hp)
    del sample_rate

    feature = tffeat.compute_mel_filterbank_features(
        audio,
        sample_rate=p.audio_sample_rate,
        preemphasis=p.audio_preemphasis,
        frame_length=p.audio_frame_length,
        frame_step=p.audio_frame_step,
        lower_edge_hertz=p.audio_lower_edge_hertz,
        upper_edge_hertz=p.audio_upper_edge_hertz,
        num_mel_bins=p.audio_num_mel_bins,
        apply_mask=False)

    feature = tffeat.delta_delta(feature, order=2)
    self.assertEqual(feature.eval().shape, (11, 40, 3))
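
# A rough NumPy illustration of why the channel dimension above becomes 3:
# tffeat.delta_delta(feature, order=2) stacks the static feature with its
# first- and second-order time derivatives. The helper below is hypothetical
# and uses the standard HTK/Kaldi-style regression delta as an assumption;
# the exact kernel inside tffeat.delta_delta may differ.
def _delta_sketch(feat, window=2):
  ''' hypothetical helper: feat [T, D] static feature -> [T, D] delta '''
  import numpy as np
  T = feat.shape[0]
  denom = 2 * sum(n * n for n in range(1, window + 1))
  delta = np.zeros(feat.shape, dtype=np.float32)
  for t in range(T):
    for n in range(1, window + 1):
      prev = feat[max(t - n, 0)]      # clamp at the edges
      nxt = feat[min(t + n, T - 1)]
      delta[t] += n * (nxt - prev)
  return delta / denom

# feat_static = ...                        # [T, D] log-fbank frames
# d1 = _delta_sketch(feat_static)          # first-order delta
# d2 = _delta_sketch(d1)                   # second-order (delta-delta)
# np.stack([feat_static, d1, d2], -1)      # [T, D, 3], cf. (11, 40, 3) above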
def load_wav(wavpath, sr=8000):
  '''
  audio:
    np.float32, shape [None], sample in [-1, 1], using librosa.load
    np.int16, shape [None], sample in [-32768, 32767], using scipy.io.wavfile
    np.float32, shape [None, audio_channel], sample in [-1, 1], using tf.DecodeWav

  return
    sr: sample rate
    audio: [-1, 1], same as tf.DecodeWav
  '''
  #from scipy.io import wavfile
  #sample_rate, audio = wavfile.read(wavpath)
  #samples, sample_rate = librosa.load(wavpath, sr=sr)

  with tf.Session() as sess:
    params = speech_ops.speech_params(sr=sr, audio_desired_samples=-1)
    t_wavpath = tf.placeholder(dtype=tf.string)
    t_audio, t_sample_rate = speech_ops.read_wav(t_wavpath, params)

    audio, sample_rate = sess.run([t_audio, t_sample_rate],
                                  feed_dict={t_wavpath: wavpath})
    audio = audio[:, 0]

  assert sample_rate == sr, 'sampling rate must be {}Hz, got {}Hz'.format(
      sr, sample_rate)
  return sample_rate, audio
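
# Minimal usage sketch for load_wav above. The wav path is hypothetical; any
# PCM wav recorded at the requested rate works, since the assert checks the
# file's native sample rate against sr.
def _load_wav_example():
  ''' hypothetical helper demonstrating load_wav '''
  sample_rate, audio = load_wav('english.wav', sr=8000)
  print(sample_rate)               # 8000
  print(audio.dtype, audio.shape)  # float32, (num_samples,), values in [-1, 1]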
def _freq_feat_graph(feat_name, **kwargs):
  winlen = kwargs.get('winlen')
  winstep = kwargs.get('winstep')
  feature_size = kwargs.get('feature_size')
  sr = kwargs.get('sr')  #pylint: disable=invalid-name
  nfft = kwargs.get('nfft')
  del nfft

  assert feat_name in ('fbank', 'spec')

  params = speech_ops.speech_params(
      sr=sr,
      bins=feature_size,
      add_delta_deltas=False,
      audio_frame_length=winlen,
      audio_frame_step=winstep)

  graph = None
  if feat_name == 'fbank':
    # get session
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        # fbank
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)
        del sample_rate
        fbank = speech_ops.extract_feature(waveforms, params)
        # shape must be [T, D, C]
        feat = tf.identity(fbank, name=feat_name)
  elif feat_name == 'spec':
    # magnitude spec
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)
        spec = py_x_ops.spectrum(
            waveforms[:, 0],
            tf.cast(sample_rate, tf.dtypes.float32),
            output_type=1)  #output_type: 1, power spec; 2, log power spec
        spec = tf.sqrt(spec)
        # shape must be [T, D, C]
        spec = tf.expand_dims(spec, -1)
        feat = tf.identity(spec, name=feat_name)
  else:
    raise ValueError(f"Unsupported freq feat: {feat_name}.")

  return graph, (_get_out_tensor_name('wavpath', 0),
                 _get_out_tensor_name(feat_name, 0))
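
# Sketch of how the (graph, tensor-name) pair returned above is meant to be
# consumed, following the same _get_session / sess.run pattern used by
# extract_filterbank below. The helper, wav path, and parameter values are
# assumptions for illustration only.
def _freq_feat_example(wavpath='english.wav'):
  ''' hypothetical helper demonstrating _freq_feat_graph '''
  graph, (input_name, output_name) = _freq_feat_graph(
      'fbank', winlen=0.025, winstep=0.010, feature_size=40, sr=8000, nfft=512)
  sess = _get_session(output_name, graph)
  feat = sess.run(output_name, feed_dict={input_name: wavpath})
  return feat  # shape [T, D, C]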
def setUp(self):
  ''' set up '''
  self.sr_true = 8000  #pylint: disable=invalid-name

  self.hp = tffeat.speech_params(
      sr=self.sr_true,
      bins=40,
      cmvn=False,
      audio_desired_samples=1000,
      add_delta_deltas=False)

  self.wavpath = str(
      Path(os.environ['MAIN_ROOT']).joinpath(
          'delta/data/feat/python_speech_features/english.wav'))

  _, self.audio_true = load_wav(self.wavpath, sr=self.sr_true)
def setUp(self):
  super().setUp()
  self.sr_true = 8000  #pylint: disable=invalid-name

  self.hp = tffeat.speech_params(
      sr=self.sr_true,
      bins=40,
      cmvn=False,
      audio_desired_samples=1000,
      add_delta_deltas=False)

  self.wavpath = str(
      Path(PACKAGE_ROOT_DIR).joinpath(
          'data/feat/python_speech_features/english.wav'))

  _, self.audio_true = load_wav(self.wavpath, sr=self.sr_true)
def test_extract_logfbank_with_delta(self):
  ''' test logfbank with delta op '''
  #pylint: disable=invalid-name
  hp = tffeat.speech_params(
      sr=self.sr_true,
      bins=40,
      cmvn=False,
      audio_desired_samples=1000,
      add_delta_deltas=False)

  with self.session():
    wavfile = tf.constant(self.wavpath)
    # read wav
    audio, sample_rate = tffeat.read_wav(wavfile, hp)
    del sample_rate
    # fbank with delta delta
    fbank = tffeat.extract_logfbank_with_delta(audio, hp)
    self.assertEqual(fbank.eval().shape, (11, 40, 1))
def extract_filterbank(*args, **kwargs):
  ''' tensorflow fbank feat '''
  winlen = kwargs.get('winlen')
  winstep = kwargs.get('winstep')
  feature_size = kwargs.get('feature_size')
  sr = kwargs.get('sr')  #pylint: disable=invalid-name
  nfft = kwargs.get('nfft')
  dry_run = kwargs.get('dry_run')
  del nfft

  feat_name = 'fbank'
  graph = None
  op = None

  # get session
  if feat_name not in _global_sess:
    graph = tf.Graph()
    #pylint: disable=not-context-manager
    with graph.as_default():
      # fbank
      params = speech_ops.speech_params(
          sr=sr,
          bins=feature_size,
          add_delta_deltas=False,
          audio_frame_length=winlen,
          audio_frame_step=winstep)

      filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
      waveforms, sample_rate = speech_ops.read_wav(filepath, params)
      del sample_rate
      fbank = speech_ops.extract_feature(waveforms, params)
      fbank = tf.identity(fbank, name=feat_name)

  sess = _get_session(_get_out_tensor_name(feat_name, 0), graph)

  for wavpath in args:
    savepath = os.path.splitext(wavpath)[0] + '.npy'
    logging.debug('input: {}, output: {}'.format(wavpath, savepath))

    feat = sess.run(feat_name + ':0', feed_dict={'wavpath:0': wavpath})

    # save feat
    if dry_run:
      logging.info('save feat: path {} shape:{} dtype:{}'.format(
          savepath, feat.shape, feat.dtype))
    else:
      np.save(savepath, feat)
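
# Minimal usage sketch for extract_filterbank above; the file names and
# parameter values are assumptions for illustration. Each input wav gets a
# sibling .npy file with its fbank feature, unless dry_run is set, in which
# case only the shape/dtype is logged.
def _extract_filterbank_example():
  ''' hypothetical helper demonstrating extract_filterbank '''
  extract_filterbank(
      '1.wav', '2.wav',
      winlen=0.025, winstep=0.010, feature_size=40, sr=8000,
      nfft=512, dry_run=False)
  # -> writes 1.npy and 2.npy next to the input files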
def test_extract_feature(self):
  ''' test logfbank with delta, and cmvn '''
  #pylint: disable=invalid-name
  hp = tffeat.speech_params(
      sr=self.sr_true,
      bins=40,
      cmvn=False,
      audio_desired_samples=1000,
      add_delta_deltas=True)

  with self.cached_session(use_gpu=False, force_gpu=False):
    wavfile = tf.constant(self.wavpath)
    # read wav
    audio, sample_rate = tffeat.read_wav(wavfile, hp)
    del sample_rate
    # fbank with delta delta and cmvn
    feature = tffeat.extract_feature(audio, hp)
    self.assertEqual(feature.eval().shape, (11, 40, 3))
def test_batch_extract_feature(self):
  ''' test batched feature extraction '''
  #pylint: disable=invalid-name
  hp = tffeat.speech_params(
      sr=self.sr_true,
      bins=40,
      cmvn=False,
      audio_desired_samples=1000,
      add_delta_deltas=True)

  batch_size = 2
  with self.session():
    wavfile = tf.constant(self.wavpath)
    # read wav
    audio, sample_rate = tffeat.read_wav(wavfile, hp)
    del sample_rate

    audio = tf.stack([audio] * batch_size)

    # fbank with delta delta and cmvn
    feature = tffeat.batch_extract_feature(audio, hp)
    self.assertEqual(feature.eval().shape, (batch_size, 11, 40, 3))
def load_wav(wavpath, sr=8000):
  '''
  audio:
    np.float32, shape [None], sample in [-1, 1], using librosa.load
    np.int16, shape [None], sample in [-32768, 32767], using scipy.io.wavfile
    np.float32, shape [None, audio_channel], sample in [-1, 1], using tf.DecodeWav

  return
    sr: sample rate
    audio: [-1, 1], same as tf.DecodeWav
  '''
  #from scipy.io import wavfile
  #sample_rate, audio = wavfile.read(wavpath)
  #samples, sample_rate = librosa.load(wavpath, sr=sr)

  feat_name = 'load_wav'
  graph = None
  # get session
  if feat_name not in _global_sess:
    graph = tf.Graph()
    with graph.as_default():
      params = speech_ops.speech_params(sr=sr, audio_desired_samples=-1)
      t_wavpath = tf.placeholder(dtype=tf.string, name="wavpath")
      t_audio, t_sample_rate = speech_ops.read_wav(t_wavpath, params)
      t_audio = tf.identity(t_audio, name="audio")
      t_sample_rate = tf.identity(t_sample_rate, name="sample_rate")

  sess = _get_session(feat_name, graph)

  audio, sample_rate = sess.run(
      [_get_out_tensor_name('audio', 0),
       _get_out_tensor_name('sample_rate', 0)],
      feed_dict={"wavpath:0": wavpath})
  audio = audio[:, 0]

  assert sample_rate == sr, 'sampling rate must be {}Hz, got {}Hz'.format(
      sr, sample_rate)
  return sample_rate, audio
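
# Design note: unlike the plain tf.Session variant of load_wav above, this
# version builds the read-wav graph only once and caches the session in
# _global_sess under the 'load_wav' key, so repeated calls avoid rebuilding
# the graph. A hypothetical loop illustrating that reuse:
def _load_many_wavs_example(wavpaths, sr=8000):
  ''' hypothetical helper: load several wav files through the cached session '''
  audios = []
  for path in wavpaths:
    _, audio = load_wav(path, sr=sr)  # later calls reuse the cached session
    audios.append(audio)
  return audios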