Ejemplo n.º 1
0
    def call(self, audio_data, sample_rate=None):
        """
        Calculate the power spectrum or log power spectrum of audio data.

        :param audio_data: the audio signal from which to compute the spectrum.
            Should be a (1, N) tensor.
        :param sample_rate: [optional] the sample rate of the signal. When it
            is omitted, the ``sample_rate`` stored in ``self.config`` is used.
            A supplied value is checked (via a graph assertion) to be equal to
            the configured sample rate.
        :return: A float tensor of size (num_frames, num_frequencies) containing
            the power spectrum (output_type=1) or log power spectrum
            (output_type=2) of every frame in speech.
        """

        p = self.config
        with tf.name_scope('spectrum'):

            # Identity test on purpose: `== None` would be dispatched to the
            # overloaded equality of a tensor/array argument instead of simply
            # answering "was the argument omitted?".
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # The op is configured for one sample rate only; reject any other.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                # The spectrum op is fed the sample rate as a float value.
                sample_rate = tf.cast(sample_rate, dtype=float)
                spectrum = py_x_ops.spectrum(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length,
                    output_type=p.output_type,
                    snip_edges=p.snip_edges,
                    raw_energy=p.raw_energy,
                    preEph_coeff=p.preeph_coeff,
                    window_type=p.window_type,
                    remove_dc_offset=p.remove_dc_offset,
                    is_fbank=p.is_fbank)

                return spectrum
Ejemplo n.º 2
0
def _freq_feat_graph(feat_name, **kwargs):
    """Build the extraction graph for a frequency-domain feature.

    Args:
        feat_name: either 'fbank' or 'spec'.
        **kwargs: 'winlen', 'winstep', 'feature_size' and 'sr' are read and
            forwarded to `speech_ops.speech_params`; missing keys are passed
            as None. Any other key (e.g. 'nfft') is ignored.

    Returns:
        A pair `(graph, (input_name, output_name))`. `graph` is a freshly
        built `tf.Graph`, or None when `feat_name` is already a key of
        `_global_sess` (no new graph is built in that case). The two names
        come from `_get_out_tensor_name` for the 'wavpath' placeholder and
        for the identity op named `feat_name`.

    Raises:
        AssertionError: if `feat_name` is not supported.
        ValueError: same condition, when assertions are disabled.
    """
    supported = ('fbank', 'spec')
    assert feat_name in ('fbank', 'spec')

    params = speech_ops.speech_params(
        sr=kwargs.get('sr'),
        bins=kwargs.get('feature_size'),
        add_delta_deltas=False,
        audio_frame_length=kwargs.get('winlen'),
        audio_frame_step=kwargs.get('winstep'))

    # Only reachable when the assert above has been stripped.
    if feat_name not in supported:
        raise ValueError(f"Not support freq feat: {feat_name}.")

    graph = None
    if feat_name not in _global_sess:
        graph = tf.Graph()
        #pylint: disable=not-context-manager
        with graph.as_default():
            wav_path = tf.placeholder(dtype=tf.string,
                                      shape=[],
                                      name='wavpath')
            waveforms, sample_rate = speech_ops.read_wav(wav_path, params)

            if feat_name == 'fbank':
                feature = speech_ops.extract_feature(waveforms, params)
            else:
                # output_type: 1, power spec; 2 log power spec
                power_spec = py_x_ops.spectrum(
                    waveforms[:, 0],
                    tf.cast(sample_rate, tf.dtypes.float32),
                    output_type=1)
                # Magnitude spectrum, with a trailing axis appended.
                feature = tf.expand_dims(tf.sqrt(power_spec), -1)

            # shape must be [T, D, C]; the identity op only gives the
            # feature tensor a stable, well-known name.
            tf.identity(feature, name=feat_name)

    input_name = _get_out_tensor_name('wavpath', 0)
    output_name = _get_out_tensor_name(feat_name, 0)
    return graph, (input_name, output_name)
Ejemplo n.º 3
0
    def test_spectrum(self):
        """Compare a 5x5 patch of the spectrum op output with reference values."""
        # Reference values for rows 4..8 and columns 4..8 of the op output.
        expected_patch = np.array([
            [-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
            [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
            [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
            [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
            [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730],
        ])

        with self.session():
            rate, samples = feat_lib.load_wav(self.wavpath, sr=16000)
            spectrum = py_x_ops.spectrum(samples, rate)

            # The op is expected to produce a rank-2 result.
            rank = tf.rank(spectrum).eval()
            self.assertEqual(rank, 2)
            logging.info('Shape of spectrum: {}'.format(spectrum.shape))

            actual_patch = spectrum.eval()[4:9, 4:9]
            self.assertAllClose(actual_patch, expected_patch)
Ejemplo n.º 4
0
    def test_spectrum(self):
        """Compare the first 50 flattened spectrum values with reference values."""
        # Reference values for the first 50 elements of the flattened output.
        expected_head = np.array([
            -16.018925, -16.491777, -16.903442, -18.108875, -19.477205,
            -19.039738, -17.066263, -16.530647, -16.033670, -15.492795,
            -15.347169, -16.443783, -15.385968, -15.631793, -16.286760,
            -16.555447, -15.107640, -15.158586, -16.397518, -14.803325,
            -15.173873, -15.785010, -15.551179, -15.487743, -15.732930,
            -15.610220, -15.314099, -14.765355, -14.572725, -13.482535,
            -13.463938, -14.457010, -16.253452, -15.444997, -13.472414,
            -12.852523, -13.163157, -13.957175, -14.148843, -13.527264,
            -12.840333, -13.056757, -14.582790, -13.900843, -13.864534,
            -14.037180, -15.386706, -16.500109, -16.309618, -13.585808,
        ])

        with self.session():
            rate, samples = feat_lib.load_wav(self.wavpath, sr=16000)
            spectrum = py_x_ops.spectrum(samples, rate)

            # This variant of the test expects a rank-1 result.
            rank = tf.rank(spectrum).eval()
            self.assertEqual(rank, 1)

            actual_head = spectrum.eval().flatten()[:50]
            self.assertAllClose(actual_head, expected_head)
Ejemplo n.º 5
0
    def test_spectrum(self):
        """Compare a 5x5 patch of the spectrum op output with reference values.

        Runs in a cached session requested with ``use_gpu=False`` and
        ``force_gpu=False``.
        """
        # Reference values for rows 4..8 and columns 4..8 of the op output.
        #pylint: disable=bad-whitespace
        expected_patch = np.array([
            [-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
            [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
            [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
            [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
            [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730],
        ])
        #pylint: enable=bad-whitespace

        with self.cached_session(use_gpu=False, force_gpu=False):
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            load_summary = f"input shape: {input_data.shape}, sample rate dtype: {sample_rate.dtype}"
            logging.info(load_summary)
            # The wav file must have been loaded at the requested rate.
            self.assertEqual(sample_rate, 16000)

            spectrum = py_x_ops.spectrum(input_data, sample_rate)

            # The op is expected to produce a rank-2 result.
            rank = tf.rank(spectrum).eval()
            self.assertEqual(rank, 2)
            logging.info('Shape of spectrum: {}'.format(spectrum.shape))

            actual_patch = spectrum.eval()[4:9, 4:9]
            self.assertAllClose(actual_patch, expected_patch)