def call(self, audio_data, sample_rate=None):
    """
    Calculate MFCC features of audio data.

    :param audio_data: the audio signal from which to compute the features.
        Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: A float tensor of size (num_channels, num_frames,
        num_frequencies) containing MFCC features of every frame in speech.
    """
    p = self.config
    with tf.name_scope('mfcc'):
        # `is None` (identity) is the correct test; `== None` builds a
        # TF equality op when sample_rate is a tensor instead of a bool.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            # MFCC op consumes both the raw spectrum and the fbank features.
            spectrum_feats = self.spect(audio_data, sample_rate)
            spectrum_feats = tf.expand_dims(spectrum_feats, 0)
            fbank_feats = self.fbank(audio_data, sample_rate)
            mfcc = py_x_ops.mfcc(
                fbank_feats,
                spectrum_feats,
                sample_rate,
                use_energy=p.use_energy,
                cepstral_lifter=p.cepstral_lifter,
                coefficient_count=p.coefficient_count)
            return mfcc
def call(self, audio_data, sample_rate=None):
    """
    Calculate cepstrum of audio data.

    :param audio_data: the audio signal from which to compute the cepstrum.
        Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: A float tensor of size (num_frames, ceps_subband_num) containing
        normalized cepstrum (tag_ceps_mean_norm = True) or cepstrum
        (tag_ceps_mean_norm = False) of every frame in speech.
    """
    p = self.config
    with tf.name_scope('cepstrum'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=float)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
        with tf.control_dependencies([assert_op]):
            cepstrum = py_x_ops.cepstrum(
                audio_data,
                sample_rate,
                window_length=p.window_length,
                frame_length=p.frame_length,
                ceps_subband_num=p.ceps_subband_num,
                tag_ceps_mean_norm=p.tag_ceps_mean_norm)
            return cepstrum
def call(self, audio_data, sample_rate=None):
    """
    Calculate power spectrum and phase spectrum of audio data.

    :param audio_data: the audio signal from which to compute the spectra.
        Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: Two tensors:
        power spectrum -- a float tensor of size (num_frames,
            num_frequencies) containing the power spectrum of every frame;
        phase spectrum -- a float tensor of size (num_frames,
            num_frequencies) containing the phase spectrum of every frame.
    """
    p = self.config
    with tf.name_scope('analyfiltbank'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            # Kernel expects a float sample rate.
            sample_rate = tf.cast(sample_rate, dtype=float)
            power_spectrum, phase_spectrum = py_x_ops.analyfiltbank(
                audio_data,
                sample_rate,
                window_length=p.window_length,
                frame_length=p.frame_length)
            return power_spectrum, phase_spectrum
def call(self, audio_data, sample_rate=None):
    """
    Calculate power spectrum or log power spectrum of audio data.

    :param audio_data: the audio signal from which to compute the spectrum.
        Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: A float tensor of size (num_frames, num_frequencies) containing
        power spectrum (output_type=1) or log power spectrum (output_type=2)
        of every frame in speech.
    """
    p = self.config
    with tf.name_scope('spectrum'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            # Kernel expects a float sample rate.
            sample_rate = tf.cast(sample_rate, dtype=float)
            spectrum = py_x_ops.spectrum(
                audio_data,
                sample_rate,
                window_length=p.window_length,
                frame_length=p.frame_length,
                output_type=p.output_type,
                snip_edges=p.snip_edges,
                raw_energy=p.raw_energy,
                preEph_coeff=p.preeph_coeff,
                window_type=p.window_type,
                remove_dc_offset=p.remove_dc_offset,
                is_fbank=p.is_fbank)
            return spectrum
def call(self, audio_data, sample_rate=None):
    """
    Add RIR reverberation and/or noise to audio data.

    :param audio_data: the audio signal to augment. Should be a (1, N)
        tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: A float tensor of size N containing the augmented
        (noise/RIR-added) audio.
    """
    p = self.config
    with tf.name_scope('add_rir_noise_aecres'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            # Kernel expects a float sample rate.
            sample_rate = tf.cast(sample_rate, dtype=float)
            add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
                audio_data,
                sample_rate,
                if_add_rir=p.if_add_rir,
                rir_filelist=p.rir_filelist,
                if_add_noise=p.if_add_noise,
                snr_min=p.snr_min,
                snr_max=p.snr_max,
                noise_filelist=p.noise_filelist,
                if_add_aecres=p.if_add_aecres,
                aecres_filelist=p.aecres_filelist)
            # Drop the leading channel dimension: (1, N) -> (N,).
            return tf.squeeze(add_rir_noise_aecres_out)
def call(self, audio_data, sample_rate=None):
    """
    Calculate fbank and pitch features of a wav, concatenated per frame.

    :param audio_data: the audio signal from which to compute the features.
        Should be a (1, N) tensor.
    :param sample_rate: the sample rate of the signal we are working with;
        default is taken from config (16kHz).
    :return: A tensor with shape (num_frames, dim_features) containing the
        concatenated fbank and pitch features of every frame in speech.
    """
    p = self.config
    with tf.name_scope('fbank_pitch'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
            pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
            # Concatenate along the feature axis (per-frame).
            fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)
            return fbank_pitch_feats
def call(self, power_spectrum, phase_spectrum, sample_rate=None):
    """
    Implement frequency-domain to time-domain conversion.

    :param power_spectrum: a float tensor of size (num_frames,
        num_frequencies).
    :param phase_spectrum: a float tensor of size (num_frames,
        num_frequencies).
    :param sample_rate: [optional] a scalar tensor; default is taken from
        config (16kHz).
    :return: reconstructed audio data.
    """
    p = self.config
    with tf.name_scope('synthfiltbank'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            audio_data = py_x_ops.synthfiltbank(
                power_spectrum,
                phase_spectrum,
                sample_rate,
                window_length=p.window_length,
                frame_length=p.frame_length)
            return audio_data
def call(self, audio_data, sample_rate=None):
    """
    Calculate fbank features of audio data.

    :param audio_data: the audio signal from which to compute the features.
        Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: A float tensor of size (num_channels, num_frames,
        num_frequencies) containing fbank features of every frame in speech.
    """
    p = self.config
    with tf.name_scope('fbank'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # NOTE(review): this mutates the shared config object in place; if
        # the config instance is shared across feature extractors this
        # persists between calls — confirm that is intended.
        if p.upper_frequency_limit <= 0:
            # Non-positive limit is interpreted as an offset from Nyquist.
            p.upper_frequency_limit = p.sample_rate / 2.0 + p.upper_frequency_limit
        elif (p.upper_frequency_limit <= p.lower_frequency_limit) or (
            p.upper_frequency_limit > p.sample_rate / 2.0):
            # Out-of-range limit falls back to Nyquist.
            p.upper_frequency_limit = p.sample_rate / 2.0

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            spectrum = self.spect(audio_data, sample_rate)
            spectrum = tf.expand_dims(spectrum, 0)
            fbank = py_x_ops.fbank(
                spectrum,
                sample_rate,
                upper_frequency_limit=p.upper_frequency_limit,
                lower_frequency_limit=p.lower_frequency_limit,
                filterbank_channel_count=p.filterbank_channel_count)
            return fbank
def call(self, audio_data, sample_rate=None):
    """
    Calculate the zero-crossing rate of speech.

    :param audio_data: the audio signal from which to compute the feature.
        Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: A tensor with shape (1, num_frames) containing the
        zero-crossing rate of every frame in speech.
    """
    p = self.config
    with tf.name_scope('zcr'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            # Kernel expects a float sample rate.
            sample_rate = tf.cast(sample_rate, dtype=float)
            zcr = py_x_ops.zcr(
                audio_data,
                sample_rate,
                window_length=p.window_length,
                frame_length=p.frame_length)
            return zcr
def call(self, audio_data, sample_rate=None):
    """
    Calculate power of every frame in speech.

    :param audio_data: the audio signal from which to compute the feature.
        Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: A float tensor of size (1, num_frames) containing the power of
        every frame in speech.
    """
    p = self.config
    with tf.name_scope('framepow'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=float)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
        with tf.control_dependencies([assert_op]):
            framepow = py_x_ops.frame_pow(
                audio_data,
                sample_rate,
                window_length=p.window_length,
                frame_length=p.frame_length)
            return framepow
def call(self, audio_data, sample_rate=None):
    """
    Calculate PLP features of audio data.

    :param audio_data: the audio signal from which to compute the features.
        Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: A float tensor of size (num_frames, (plp_order + 1)) containing
        PLP features of every frame in speech.
    """
    p = self.config
    with tf.name_scope('plp'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            # Kernel expects a float sample rate.
            sample_rate = tf.cast(sample_rate, dtype=float)
            plp = py_x_ops.plp(
                audio_data,
                sample_rate,
                window_length=p.window_length,
                frame_length=p.frame_length,
                plp_order=p.plp_order)
            return plp
def accuracy(logits, labels):
    '''
    Compute classification accuracy.

    params:
        logits: [B, ..., D] score tensor
        labels: [B, ...] integer label tensor
    return:
        scalar accuracy tensor
    '''
    with tf.name_scope('accuracy'):
        # Validate that logits has exactly one extra (class) dimension and
        # that the leading shapes line up with labels.
        rank_check = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
        shape_check = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))
        with tf.control_dependencies([rank_check, shape_check]):
            preds = tf.argmax(logits, axis=-1, output_type=tf.int64)
            hits = tf.equal(preds, tf.cast(labels, tf.int64))
            return tf.reduce_mean(tf.cast(hits, dtype=tf.float32))
def call(self, audio_data, sample_rate=None):
    """
    Calculate pitch features of audio data.

    :param audio_data: the audio signal from which to compute the features.
        Should be a (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: A float tensor of size (1, num_frames) containing pitch
        features of every frame in speech.
    """
    p = self.config
    with tf.name_scope('pitch'):
        # `is None` avoids building a TF equality op when a tensor is passed.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=float)

        # Guard: fail fast if the caller-provided rate disagrees with config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
        with tf.control_dependencies([assert_op]):
            pitch = py_x_ops.pitch(
                audio_data,
                sample_rate,
                window_length=p.window_length,
                frame_length=p.frame_length,
                thres_autoc=p.thres_autoc)
            # Reshape to (1, num_frames): squeeze, add a leading axis,
            # then transpose into a row vector.
            pitch = tf.squeeze(pitch)
            pitch = tf.transpose(pitch[None, :])
            return pitch
def call(self, wavfile):
    """
    Get audio data and sample rate from a wav file.

    :param wavfile: filepath of the wav.
    :return: 2 values. The first is a tensor of audio data; the second is
        the sample rate of the input wav file as a float tensor.
    """
    p = self.config
    raw_bytes = tf.io.read_file(wavfile)
    samples, rate = tf.audio.decode_wav(
        raw_bytes, desired_channels=p.audio_channels)
    # Fail fast if the file's sample rate disagrees with config.
    rate_check = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(rate, dtype=float))
    with tf.control_dependencies([rate_check]):
        # Drop the channel dimension and expose the rate as float.
        return tf.squeeze(samples, axis=-1), tf.cast(rate, dtype=float)
def call(self, filename, audio_data, sample_rate=None):
    """
    Write a wav file from audio_data (tensor).

    :param filename: filepath of the wav.
    :param audio_data: a tensor containing the samples of a wav.
    :param sample_rate: [optional] the sample rate of the signal we are
        working with; default is taken from config (16kHz).
    :return: the write-wav operation.
    """
    p = self.config
    filename = tf.constant(filename)

    # `is None` avoids building a TF equality op when a tensor is passed.
    if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    # Guard: fail fast if the caller-provided rate disagrees with config.
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
        audio_data = tf.cast(audio_data, dtype=tf.float32)
        # encode_wav expects (samples, channels); add the channel axis.
        contents = tf.audio.encode_wav(
            tf.expand_dims(audio_data, 1), tf.cast(sample_rate, dtype=tf.int32))
        w = tf.io.write_file(filename, contents)
    return w