コード例 #1
0
ファイル: speech_feature.py プロジェクト: yanliu1/delta
def _freq_feat_graph(feat_name, **kwargs):
    """Build the TF graph for a frequency-domain feature, unless cached.

    Args:
        feat_name: 'fbank' or 'spec' (magnitude spectrum).
        **kwargs: reads 'winlen', 'winstep', 'feature_size' and 'sr'.
            'nfft' is accepted by callers but ignored here.

    Returns:
        A tuple ``(graph, (input_tensor_name, output_tensor_name))``.
        ``graph`` is None when ``feat_name`` is already a key of
        ``_global_sess``, i.e. nothing new was built.

    Raises:
        AssertionError: if ``feat_name`` is not supported.
        ValueError: same condition, when asserts are disabled.
    """
    supported = ('fbank', 'spec')
    assert feat_name in supported

    params = speech_ops.speech_params(
        sr=kwargs.get('sr'),
        bins=kwargs.get('feature_size'),
        add_delta_deltas=False,
        audio_frame_length=kwargs.get('winlen'),
        audio_frame_step=kwargs.get('winstep'))

    # Only reachable when the assert above has been stripped (python -O).
    if feat_name not in supported:
        raise ValueError(f"Not support freq feat: {feat_name}.")

    graph = None
    if feat_name not in _global_sess:
        graph = tf.Graph()
        #pylint: disable=not-context-manager
        with graph.as_default():
            wav_input = tf.placeholder(
                dtype=tf.string, shape=[], name='wavpath')
            waveforms, sample_rate = speech_ops.read_wav(wav_input, params)

            if feat_name == 'fbank':
                output = speech_ops.extract_feature(waveforms, params)
            else:
                # output_type: 1, power spec; 2 log power spec
                power = py_x_ops.spectrum(
                    waveforms[:, 0],
                    tf.cast(sample_rate, tf.dtypes.float32),
                    output_type=1)
                # magnitude spec, with a trailing axis added
                output = tf.expand_dims(tf.sqrt(power), -1)

            # shape must be [T, D, C]; the name is how callers fetch it
            tf.identity(output, name=feat_name)

    input_name = _get_out_tensor_name('wavpath', 0)
    output_name = _get_out_tensor_name(feat_name, 0)
    return graph, (input_name, output_name)
コード例 #2
0
    def test_delta_delta(self):
        ''' delta_delta with order=2 on mel filterbank features yields 3 channels '''
        params = tffeat.speech_params(
            sr=self.sr_true,
            bins=40,
            cmvn=False,
            audio_desired_samples=1000,
            add_delta_deltas=False)
        expected_shape = (11, 40, 3)

        with self.session():
            wav_tensor = tf.constant(self.wavpath)
            # the decoded sample rate is not needed by this test
            audio, _ = tffeat.read_wav(wav_tensor, self.hp)

            mel = tffeat.compute_mel_filterbank_features(
                audio,
                sample_rate=params.audio_sample_rate,
                preemphasis=params.audio_preemphasis,
                frame_length=params.audio_frame_length,
                frame_step=params.audio_frame_step,
                lower_edge_hertz=params.audio_lower_edge_hertz,
                upper_edge_hertz=params.audio_upper_edge_hertz,
                num_mel_bins=params.audio_num_mel_bins,
                apply_mask=False)

            with_deltas = tffeat.delta_delta(mel, order=2)
            self.assertEqual(with_deltas.eval().shape, expected_shape)
コード例 #3
0
def load_wav(wavpath, sr=8000):
    '''
    Read a wav file with ``speech_ops.read_wav`` and return its first channel.

    Audio conventions of the common loaders, for comparison:
      librosa.load:      np.float32, shape [None], samples in [-1, 1]
      scipy.io.wavfile:  np.int16, shape [None], samples in [-32768, 32767]
      tf.DecodeWav:      np.float32, shape [None, audio_channel],
                         samples in [-1, 1]

    Args:
      wavpath: path of the wav file, fed as a string tensor.
      sr: expected sample rate in Hz.

    Returns:
      (sample_rate, audio): the sample rate reported by the reader and the
      first audio channel, in the same value range as tf.DecodeWav.

    Raises:
      AssertionError: if the file's sample rate differs from ``sr``.
    '''
    # Build the ops in a private graph: building them in the default graph
    # would add a new placeholder and reader to it on every call.
    graph = tf.Graph()
    #pylint: disable=not-context-manager
    with graph.as_default():
        params = speech_ops.speech_params(sr=sr, audio_desired_samples=-1)
        t_wavpath = tf.placeholder(dtype=tf.string)
        t_audio, t_sample_rate = speech_ops.read_wav(t_wavpath, params)

    with tf.Session(graph=graph) as sess:
        audio, sample_rate = sess.run([t_audio, t_sample_rate],
                                      feed_dict={t_wavpath: wavpath})

    # keep only the first channel
    audio = audio[:, 0]

    assert sample_rate == sr, 'sampling rate must be {}Hz, get {}Hz'.format(
        sr, sample_rate)
    return sample_rate, audio
コード例 #4
0
    def test_read_wav(self):
        ''' read_wav returns the true sample rate and the first 1000 samples '''
        with self.session():
            wav_tensor = tf.constant(self.wavpath)
            audio, sample_rate = tffeat.read_wav(wav_tensor, self.hp)

            self.assertEqual(sample_rate.eval(), self.sr_true)

            # evaluate once and reuse the array for both checks
            samples = audio.eval()
            self.assertEqual(samples.shape, (1000, 1))
            self.assertAllEqual(samples[:, 0], self.audio_true[:1000])
コード例 #5
0
    def test_extract_logfbank_with_delta(self):
        ''' extract_logfbank_with_delta yields one channel when deltas are off '''
        params = tffeat.speech_params(
            sr=self.sr_true,
            bins=40,
            cmvn=False,
            audio_desired_samples=1000,
            add_delta_deltas=False)
        expected_shape = (11, 40, 1)

        with self.session():
            wav_tensor = tf.constant(self.wavpath)
            # the decoded sample rate is not needed by this test
            audio, _ = tffeat.read_wav(wav_tensor, params)

            logfbank = tffeat.extract_logfbank_with_delta(audio, params)
            self.assertEqual(logfbank.eval().shape, expected_shape)
コード例 #6
0
ファイル: speech_feature.py プロジェクト: xpsheng/delta
def extract_filterbank(*args, **kwargs):
  ''' Extract fbank features from wav files and save them as ``.npy`` files.

  Args:
    *args: wav file paths. Each feature is saved next to its wav file, with
      the extension replaced by ``.npy``.
    **kwargs: reads 'winlen', 'winstep', 'feature_size', 'sr' and 'dry_run'.
      'nfft' is accepted but ignored. When 'dry_run' is truthy, the feature
      is only logged, not written to disk.

  Returns:
    None.
  '''
  winlen = kwargs.get('winlen')
  winstep = kwargs.get('winstep')
  feature_size = kwargs.get('feature_size')
  sr = kwargs.get('sr')  #pylint: disable=invalid-name
  dry_run = kwargs.get('dry_run')

  feat_name = 'fbank'
  # Use the same key for the cache check and for `_get_session`, so the
  # graph is only rebuilt when no session exists for it yet.
  # NOTE(review): assumes `_get_session` stores sessions in `_global_sess`
  # under its first argument - confirm against its definition.
  sess_key = _get_out_tensor_name(feat_name, 0)

  graph = None
  if sess_key not in _global_sess:
    graph = tf.Graph()
    #pylint: disable=not-context-manager
    with graph.as_default():
      params = speech_ops.speech_params(
          sr=sr,
          bins=feature_size,
          add_delta_deltas=False,
          audio_frame_length=winlen,
          audio_frame_step=winstep)

      filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
      waveforms, _ = speech_ops.read_wav(filepath, params)
      fbank = speech_ops.extract_feature(waveforms, params)
      # named so it can be fetched below as '<feat_name>:0'
      tf.identity(fbank, name=feat_name)

  sess = _get_session(sess_key, graph)

  for wavpath in args:
    savepath = os.path.splitext(wavpath)[0] + '.npy'
    logging.debug('input: %s, output: %s', wavpath, savepath)

    feat = sess.run(feat_name + ":0", feed_dict={'wavpath:0': wavpath})

    if dry_run:
      logging.info('save feat: path %s shape:%s dtype:%s', savepath,
                   feat.shape, feat.dtype)
    else:
      np.save(savepath, feat)
コード例 #7
0
    def test_extract_feature(self):
        ''' extract_feature with delta-deltas enabled yields 3 channels '''
        params = tffeat.speech_params(
            sr=self.sr_true,
            bins=40,
            cmvn=False,
            audio_desired_samples=1000,
            add_delta_deltas=True)
        expected_shape = (11, 40, 3)

        with self.cached_session(use_gpu=False, force_gpu=False):
            wav_tensor = tf.constant(self.wavpath)
            # the decoded sample rate is not needed by this test
            audio, _ = tffeat.read_wav(wav_tensor, params)

            # fbank with delta-deltas; cmvn is disabled in params
            extracted = tffeat.extract_feature(audio, params)

            self.assertEqual(extracted.eval().shape, expected_shape)
コード例 #8
0
ファイル: speech_ops_test.py プロジェクト: zhankm/delta
  def test_powspec_feat(self):
    ''' powspec_feat output has nfft / 2 + 1 frequency bins per frame '''
    hparams = self.hp
    with self.cached_session(use_gpu=False, force_gpu=False):
      wav_tensor = tf.constant(self.wavpath)
      # the decoded sample rate is not needed by this test
      audio, _ = tffeat.read_wav(wav_tensor, hparams)

      # spectrogram
      power_spec = tffeat.powspec_feat(
          audio,
          sr=self.sr_true,
          nfft=None,
          winlen=hparams.audio_frame_length,
          winstep=hparams.audio_frame_step,
          lowfreq=hparams.audio_lower_edge_hertz,
          highfreq=hparams.audio_upper_edge_hertz,
          preemph=hparams.audio_preemphasis)

      # expected FFT size: 2 ** (floor(log2(winlen * sr)) + 1)
      exponent = int(np.log2(hparams.audio_frame_length * self.sr_true)) + 1
      fft_size = 1 << exponent
      expected_shape = (1, 11, fft_size // 2 + 1)
      self.assertEqual(power_spec.eval().shape, expected_shape)
コード例 #9
0
    def test_batch_extract_feature(self):
        ''' batch_extract_feature keeps the leading batch dimension '''
        params = tffeat.speech_params(
            sr=self.sr_true,
            bins=40,
            cmvn=False,
            audio_desired_samples=1000,
            add_delta_deltas=True)

        batch_size = 2
        expected_shape = (batch_size, 11, 40, 3)

        with self.session():
            wav_tensor = tf.constant(self.wavpath)
            # the decoded sample rate is not needed by this test
            single, _ = tffeat.read_wav(wav_tensor, params)

            # repeat the same clip to form a batch
            batch = tf.stack([single for _ in range(batch_size)])

            # fbank with delta-deltas; cmvn is disabled in params
            extracted = tffeat.batch_extract_feature(batch, params)

            self.assertEqual(extracted.eval().shape, expected_shape)
コード例 #10
0
def load_wav(wavpath, sr=8000):
    '''
    Read a wav file with ``speech_ops.read_wav``, using a cached session.

    Audio conventions of the common loaders, for comparison:
      librosa.load:      np.float32, shape [None], samples in [-1, 1]
      scipy.io.wavfile:  np.int16, shape [None], samples in [-32768, 32767]
      tf.DecodeWav:      np.float32, shape [None, audio_channel],
                         samples in [-1, 1]

    The graph is built only when 'load_wav' is not yet a key of
    ``_global_sess``, so the ``sr`` passed to ``speech_params`` is the one
    from the call that built it.

    Args:
      wavpath: path of the wav file, fed to the 'wavpath:0' tensor.
      sr: expected sample rate in Hz.

    Returns:
      (sample_rate, audio): the sample rate reported by the reader and the
      first audio channel, in the same value range as tf.DecodeWav.

    Raises:
      AssertionError: if the file's sample rate differs from ``sr``.
    '''
    session_key = 'load_wav'

    graph = None
    if session_key not in _global_sess:
        graph = tf.Graph()
        with graph.as_default():
            wav_params = speech_ops.speech_params(
                sr=sr, audio_desired_samples=-1)
            path_input = tf.placeholder(dtype=tf.string, name="wavpath")
            decoded, rate = speech_ops.read_wav(path_input, wav_params)
            # named outputs, so they can be fetched by name below
            tf.identity(decoded, name="audio")
            tf.identity(rate, name="sample_rate")

    sess = _get_session(session_key, graph)
    fetches = [
        _get_out_tensor_name('audio', 0),
        _get_out_tensor_name('sample_rate', 0),
    ]
    samples, sample_rate = sess.run(fetches, feed_dict={"wavpath:0": wavpath})

    # keep only the first channel
    first_channel = samples[:, 0]

    assert sample_rate == sr, 'sampling rate must be {}Hz, get {}Hz'.format(
        sr, sample_rate)
    return sample_rate, first_channel