def call(self, audio_data, sample_rate=None):
    """
    Calculate power spectrum or log power spectrum of audio data.

    :param audio_data: the audio signal from which to compute the spectrum.
                       Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal. When it is
                        None, the `sample_rate` stored in the config is used.
                        A supplied value must equal the configured one; this
                        is enforced by an assertion op in the graph.
    :return: A float tensor of size (num_frames, num_frequencies) containing
             power spectrum (output_type=1) or log power spectrum
             (output_type=2) of every frame in speech.
    """
    p = self.config
    with tf.name_scope('spectrum'):
        # Identity check on purpose: `==` may be overloaded when the caller
        # passes a tensor or an ndarray, whereas `is None` only ever asks
        # "was the argument omitted?".
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Refuse audio whose sample rate differs from the configured one.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))

        # Everything created below only runs after the assertion has passed.
        with tf.control_dependencies([assert_op]):
            # The op is fed a floating point sample rate
            # (TensorFlow maps Python `float` to float32).
            sample_rate = tf.cast(sample_rate, dtype=tf.float32)
            spectrum = py_x_ops.spectrum(
                audio_data,
                sample_rate,
                window_length=p.window_length,
                frame_length=p.frame_length,
                output_type=p.output_type,
                snip_edges=p.snip_edges,
                raw_energy=p.raw_energy,
                preEph_coeff=p.preeph_coeff,
                window_type=p.window_type,
                remove_dc_offset=p.remove_dc_offset,
                is_fbank=p.is_fbank)
            return spectrum
def _freq_feat_graph(feat_name, **kwargs):
    """
    Build the graph that extracts a frequency-domain feature from a wav file.

    :param feat_name: 'fbank' or 'spec'.
    :param kwargs: reads `winlen`, `winstep`, `feature_size` and `sr`, which
                   are forwarded to `speech_ops.speech_params`. `nfft` is
                   accepted by callers but plays no part here.
    :return: a tuple `(graph, (input_name, output_name))`. `graph` is None
             when `feat_name` is already registered in `_global_sess`, in
             which case nothing new is built. The names are those produced
             by `_get_out_tensor_name` for 'wavpath' and `feat_name`.
    """
    assert feat_name in ('fbank', 'spec')

    params = speech_ops.speech_params(
        sr=kwargs.get('sr'),
        bins=kwargs.get('feature_size'),
        add_delta_deltas=False,
        audio_frame_length=kwargs.get('winlen'),
        audio_frame_step=kwargs.get('winstep'))

    # Only reachable when assertions are stripped (python -O).
    if feat_name != 'fbank' and feat_name != 'spec':
        raise ValueError(f"Not support freq feat: {feat_name}.")

    graph = None
    # A graph is only needed the first time a feature name is requested.
    if feat_name not in _global_sess:
        graph = tf.Graph()
        with graph.as_default():  #pylint: disable=not-context-manager
            wav_path = tf.placeholder(
                dtype=tf.string, shape=[], name='wavpath')
            waveforms, sample_rate = speech_ops.read_wav(wav_path, params)

            if feat_name == 'fbank':
                feature = speech_ops.extract_feature(waveforms, params)
            else:
                # Magnitude spectrum: square root of the power spectrum
                # (output_type: 1, power spec; 2 log power spec).
                power_spec = py_x_ops.spectrum(
                    waveforms[:, 0],
                    tf.cast(sample_rate, tf.dtypes.float32),
                    output_type=1)
                magnitude = tf.sqrt(power_spec)
                feature = tf.expand_dims(magnitude, -1)

            # shape must be [T, D, C]; the identity op gives the output a
            # stable name so that it can be looked up later.
            tf.identity(feature, name=feat_name)

    input_name = _get_out_tensor_name('wavpath', 0)
    output_name = _get_out_tensor_name(feat_name, 0)
    return graph, (input_name, output_name)
def test_spectrum(self):
    """Check a 5x5 patch of the spectrum op output against stored values."""
    expected_patch = np.array([
        [-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
        [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
        [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
        [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
        [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730],
    ])

    with self.session():
        sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
        spectrum = py_x_ops.spectrum(input_data, sample_rate)

        # The op is expected to produce a rank-2 tensor.
        rank = tf.rank(spectrum).eval()
        self.assertEqual(rank, 2)

        logging.info('Shape of spectrum: {}'.format(spectrum.shape))

        # Compare rows 4..8 and columns 4..8 with the reference values.
        actual_patch = spectrum.eval()[4:9, 4:9]
        self.assertAllClose(actual_patch, expected_patch)
def test_spectrum(self):
    """Check the first 50 spectrum op output values against stored values."""
    expected_head = np.array([
        -16.018925, -16.491777, -16.903442, -18.108875, -19.477205,
        -19.039738, -17.066263, -16.530647, -16.033670, -15.492795,
        -15.347169, -16.443783, -15.385968, -15.631793, -16.286760,
        -16.555447, -15.107640, -15.158586, -16.397518, -14.803325,
        -15.173873, -15.785010, -15.551179, -15.487743, -15.732930,
        -15.610220, -15.314099, -14.765355, -14.572725, -13.482535,
        -13.463938, -14.457010, -16.253452, -15.444997, -13.472414,
        -12.852523, -13.163157, -13.957175, -14.148843, -13.527264,
        -12.840333, -13.056757, -14.582790, -13.900843, -13.864534,
        -14.037180, -15.386706, -16.500109, -16.309618, -13.585808,
    ])

    with self.session():
        sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
        spectrum = py_x_ops.spectrum(input_data, sample_rate)

        # This variant of the op is expected to produce a rank-1 tensor.
        rank = tf.rank(spectrum).eval()
        self.assertEqual(rank, 1)

        # Only the leading 50 values have stored references.
        actual_head = spectrum.eval().flatten()[:50]
        self.assertAllClose(actual_head, expected_head)
def test_spectrum(self):
    """Check a 5x5 patch of the spectrum op output, running without GPU."""
    #pylint: disable=bad-whitespace
    expected_patch = np.array([
        [-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
        [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
        [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
        [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
        [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730],
    ])
    #pylint: enable=bad-whitespace

    with self.cached_session(use_gpu=False, force_gpu=False):
        sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
        logging.info(
            f"input shape: {input_data.shape}, sample rate dtype: {sample_rate.dtype}"
        )
        # The wav file must have been loaded at the requested rate.
        self.assertEqual(sample_rate, 16000)

        spectrum = py_x_ops.spectrum(input_data, sample_rate)

        # The op is expected to produce a rank-2 tensor.
        rank = tf.rank(spectrum).eval()
        self.assertEqual(rank, 2)

        logging.info('Shape of spectrum: {}'.format(spectrum.shape))

        # Compare rows 4..8 and columns 4..8 with the reference values.
        actual_patch = spectrum.eval()[4:9, 4:9]
        self.assertAllClose(actual_patch, expected_patch)