def call(self, audio_data, sample_rate=None):
  """
  Calculate the power spectrum and phase spectrum of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: Two tensors:
           power spectrum: a float tensor of size (num_frames, num_frequencies)
           containing the power spectrum of every frame in the speech;
           phase spectrum: a float tensor of size (num_frames, num_frequencies)
           containing the phase spectrum of every frame in the speech.
  """
  p = self.config
  with tf.name_scope('analyfiltbank'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      power_spectrum, phase_spectrum = py_x_ops.analyfiltbank(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length)
      return power_spectrum, phase_spectrum
def call(self, audio_data, sample_rate=None):
  """
  Add room impulse response (RIR), noise, or AEC residual to audio data.
  :param audio_data: the audio signal to augment. Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size N containing the noise-augmented audio.
  """
  p = self.config
  with tf.name_scope('add_rir_noise_aecres'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
          audio_data,
          sample_rate,
          if_add_rir=p.if_add_rir,
          rir_filelist=p.rir_filelist,
          if_add_noise=p.if_add_noise,
          snr_min=p.snr_min,
          snr_max=p.snr_max,
          noise_filelist=p.noise_filelist,
          if_add_aecres=p.if_add_aecres,
          aecres_filelist=p.aecres_filelist)
      return tf.squeeze(add_rir_noise_aecres_out)
def call(self, audio_data, sample_rate=None):
  """
  Calculate the power spectrum or log power spectrum of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_frames, num_frequencies) containing the
           power spectrum (output_type=1) or log power spectrum (output_type=2)
           of every frame in the speech.
  """
  p = self.config
  with tf.name_scope('spectrum'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      spectrum = py_x_ops.spectrum(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          output_type=p.output_type,
          snip_edges=p.snip_edges,
          raw_energy=p.raw_energy,
          preEph_coeff=p.preeph_coeff,
          window_type=p.window_type,
          remove_dc_offset=p.remove_dc_offset,
          is_fbank=p.is_fbank)
      return spectrum
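# Usage sketch for the spectrum extractor above. Hypothetical assumptions:
# the enclosing class is named `Spectrum`, follows a `params().instantiate()`
# factory pattern, and its config defaults to 16 kHz; adjust to the real API.
import numpy as np

# One second of a 440 Hz sine at 16 kHz, shaped (1, N) as the docstring asks.
t = np.linspace(0, 1, 16000, endpoint=False)
audio = tf.constant(np.sin(2 * np.pi * 440 * t)[np.newaxis, :], tf.float32)

spectrum_extractor = Spectrum.params().instantiate()  # hypothetical factory
feat = spectrum_extractor(audio, sample_rate=16000)
with tf.Session() as sess:
  print(sess.run(feat).shape)  # (num_frames, num_frequencies)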
def call(self, audio_data, sample_rate=None):
  """
  Calculate fbank features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_channels, num_frames, num_frequencies)
           containing the fbank features of every frame in the speech.
  """
  p = self.config
  with tf.name_scope('fbank'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      spectrum = self.spect(audio_data, sample_rate)
      spectrum = tf.expand_dims(spectrum, 0)
      sample_rate = tf.cast(sample_rate, dtype=tf.int32)
      fbank = py_x_ops.fbank(
          spectrum,
          sample_rate,
          upper_frequency_limit=p.upper_frequency_limit,
          lower_frequency_limit=p.lower_frequency_limit,
          filterbank_channel_count=p.filterbank_channel_count)
      return fbank
def call(self, audio_data, sample_rate=None):
  """
  Calculate cepstrum of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_frames, ceps_subband_num) containing the
           normalized cepstrum (tag_ceps_mean_norm = True) or cepstrum
           (tag_ceps_mean_norm = False) of every frame in the speech.
  """
  p = self.config
  with tf.name_scope('cepstrum'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      cepstrum = py_x_ops.cepstrum(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          ceps_subband_num=p.ceps_subband_num,
          tag_ceps_mean_norm=p.tag_ceps_mean_norm)
      return cepstrum
def call(self, audio_data, sample_rate=None):
  """
  Calculate plp features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_frames, plp_order + 1) containing the
           plp features of every frame in the speech.
  """
  p = self.config
  with tf.name_scope('plp'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      plp = py_x_ops.plp(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          plp_order=p.plp_order)
      return plp
def call(self, audio_data, sample_rate=None):
  """
  Calculate the zero-crossing rate of speech.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A tensor with shape (1, num_frames) containing the zero-crossing
           rate of every frame in the speech.
  """
  p = self.config
  with tf.name_scope('zcr'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      zcr = py_x_ops.zcr(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length)
      return zcr
def call(self, audio_data, sample_rate=None):
  """
  Calculate the power of every frame in speech.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_frames,) containing the power of every
           frame in the speech (the leading singleton dimension is squeezed).
  """
  p = self.config
  with tf.name_scope('framepow'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      framepow = py_x_ops.frame_pow(
          audio_data,
          sample_rate,
          snip_edges=p.snip_edges,
          remove_dc_offset=p.remove_dc_offset,
          window_length=p.window_length,
          frame_length=p.frame_length)
      return tf.squeeze(framepow)
def _dpool_index(one_length_left, one_length_right, fixed_length_left,
                 fixed_length_right):
  logging.info("fixed_length_left: {}".format(fixed_length_left))
  logging.info("fixed_length_right: {}".format(fixed_length_right))

  if one_length_left == 0:
    stride_left = fixed_length_left
  else:
    stride_left = 1.0 * fixed_length_left / tf.cast(
        one_length_left, dtype=tf.float32)

  if one_length_right == 0:
    stride_right = fixed_length_right
  else:
    stride_right = 1.0 * fixed_length_right / tf.cast(
        one_length_right, dtype=tf.float32)

  one_idx_left = [
      tf.cast(i / stride_left, dtype=tf.int32)
      for i in range(fixed_length_left)
  ]
  one_idx_right = [
      tf.cast(i / stride_right, dtype=tf.int32)
      for i in range(fixed_length_right)
  ]
  mesh1, mesh2 = tf.meshgrid(one_idx_left, one_idx_right)
  index_one = tf.transpose(tf.stack([mesh1, mesh2]), (2, 1, 0))
  return index_one
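# Worked example for _dpool_index (a sketch). With one_length_left=2,
# one_length_right=3 and a fixed 4x6 pooling grid, stride_left = 4/2 = 2.0
# and stride_right = 6/3 = 2.0, so grid cell (i, j) maps back to source cell
# (int(i/2), int(j/2)):
#   grid rows 0 1 2 3      -> source rows 0 0 1 1
#   grid cols 0 1 2 3 4 5  -> source cols 0 0 1 1 2 2
# The resulting (4, 6, 2) index tensor is typically fed to tf.gather_nd to
# stretch a variable-sized match matrix onto the fixed grid before
# ordinary max-pooling.
match_matrix = tf.random.uniform([2, 3])  # variable-sized similarity map
index_one = _dpool_index(2, 3, fixed_length_left=4, fixed_length_right=6)
stretched = tf.gather_nd(match_matrix, index_one)  # -> (4, 6)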
def call(self, audio_data, sample_rate=None):
  """
  Calculate mfcc features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: the sample rate of the signal we are working with.
  :return: A float tensor of size (num_channels, num_frames, num_frequencies)
           containing the mfcc features of every frame in the speech.
  """
  p = self.config
  with tf.name_scope('mfcc'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      fbank_feats = self.fbank(audio_data, sample_rate)
      sample_rate = tf.cast(sample_rate, dtype=tf.int32)
      shape = tf.shape(fbank_feats)
      nframe = shape[0]
      nfbank = shape[1]
      fbank_feats = tf.reshape(fbank_feats, (1, nframe, nfbank))
      framepow_feats = self.framepow(audio_data, sample_rate)
      mfcc = py_x_ops.mfcc(
          fbank_feats,
          framepow_feats,
          sample_rate,
          use_energy=p.use_energy,
          cepstral_lifter=p.cepstral_lifter,
          coefficient_count=p.coefficient_count)
      return mfcc
def get_learning_rate(self):
  """Get the learning rate."""
  lrconf = self.config['solver']['optimizer']['learning_rate']
  learning_rate = lrconf['rate']
  learning_type = lrconf['type']

  #pylint: disable=invalid-name
  if learning_type == 'exp_decay':
    lr = tf.train.exponential_decay(
        learning_rate,
        tf.train.get_or_create_global_step(),
        lrconf['decay_steps'],
        lrconf['decay_rate'],
        staircase=True)
  elif learning_type == 'piecewise':
    #boundaries = [15000, 30000]
    #values = [1e-3, 1e-4, 1e-5]
    boundaries = lrconf['boundaries']
    values = lrconf['values']
    assert len(values) == len(boundaries) + 1, \
        'values len must equal boundaries len plus one'
    lr = tf.train.piecewise_constant(
        tf.train.get_or_create_global_step(),
        boundaries=boundaries,
        values=values)
  elif learning_type == 'warmup':
    learning_rate = tf.constant(
        value=learning_rate, shape=[], dtype=tf.float32)
    global_step = tf.train.get_or_create_global_step()
    data_size = self.config['data']['train_data_size']
    num_epochs = self.config["data"]["task"]['epochs']
    batch_size = self.config["data"]["task"]['batch_size']
    num_batch = int(math.ceil(data_size * num_epochs / batch_size))
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        num_batch,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(lrconf['num_warmup_steps'], dtype=tf.int32)
    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = learning_rate * warmup_percent_done
    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    lr = ((1.0 - is_warmup) * learning_rate +
          is_warmup * warmup_learning_rate)
  elif learning_type == 'const':
    lr = learning_rate
  else:
    raise ValueError(
        "Unsupported learning rate type: {}".format(learning_type))
  tf.summary.scalar('lr', lr)
  return lr
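# A sketch of the config slice this method reads (keys inferred from the
# lookups above; the real values live in the experiment config files):
config = {
    'solver': {
        'optimizer': {
            'learning_rate': {
                'type': 'piecewise',
                'rate': 1e-3,                  # base rate, used by other types
                'boundaries': [15000, 30000],  # global-step boundaries
                'values': [1e-3, 1e-4, 1e-5],  # one more value than boundaries
            }
        }
    }
}
# 'exp_decay' additionally needs decay_steps/decay_rate, and 'warmup' needs
# num_warmup_steps plus data.train_data_size and data.task epochs/batch_size.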
def compute_doc_lens(sen_lens):
  """
  Count how many sentences are in each document.
  sen_lens: [..., time_steps]
  doc_lens: [...]
  """
  x_binary = tf.cast(tf.cast(sen_lens, tf.bool), tf.int32)
  doc_lens = tf.reduce_sum(x_binary, axis=-1)
  return doc_lens
def compute_lens(inputs, max_len):
  """
  Count sequence length.
  inputs: [batch_size, max_len]
  lens: [batch_size]
  """
  # Reverse each row and find the first nonzero entry; its offset from the
  # end gives the sequence length.
  x_binary = tf.cast(tf.cast(tf.reverse(inputs, axis=[1]), tf.bool), tf.int32)
  lens = max_len - tf.argmax(x_binary, axis=1, output_type=tf.int32)
  # All-zero rows would otherwise get length max_len; map them to zero.
  zeros = tf.zeros_like(lens, dtype=tf.int32)
  x_sum = tf.reduce_sum(inputs, axis=1)
  sen_lens = tf.where(tf.equal(x_sum, 0), zeros, lens)
  return sen_lens
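# Worked example: reversing each row and taking argmax of the nonzero mask
# finds the distance of the last nonzero token from the end of the row.
#   inputs = [[3, 5, 0, 0],  reversed mask [0, 0, 1, 1] -> argmax 2 -> 4-2 = 2
#             [7, 0, 0, 0]]  reversed mask [0, 0, 0, 1] -> argmax 3 -> 4-3 = 1
inputs = tf.constant([[3, 5, 0, 0], [7, 0, 0, 0]])
lens = compute_lens(inputs, max_len=4)  # -> [2, 1]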
def call(self, wavfile):
  """
  Get audio data and sample rate from a wav file.
  :param wavfile: filepath of the wav file.
  :return: Two values: a tensor of audio data, and the sample rate of the
           input wav file as a float tensor.
  """
  p = self.config
  contents = tf.io.read_file(wavfile)
  audio_data, sample_rate = tf.audio.decode_wav(
      contents, desired_channels=p.audio_channels)
  assert_op = tf.assert_equal(
      tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
  with tf.control_dependencies([assert_op]):
    return tf.squeeze(audio_data, axis=-1), tf.cast(sample_rate, dtype=float)
def get_pos_embedding_matrix(max_len, embed_dim, use_const, name):
  """
  Generate the position embedding matrix, in one of two flavors:
  constant (untrainable) or trainable.
  Args: max_len, embed_dim, use_const
  Return:
    pos_embed: [1, max_len, embed_dim]
  """
  # First part of the PE function: the sin and cos argument.
  if use_const:
    pos_embed = np.array([[
        pos / np.power(10000, (i - i % 2) / embed_dim)
        for i in range(embed_dim)
    ] for pos in range(max_len)])
    # Second part: apply sine to even columns and cosine to odd ones.
    pos_embed[:, 0::2] = np.sin(pos_embed[:, 0::2])  # dim 2i
    pos_embed[:, 1::2] = np.cos(pos_embed[:, 1::2])  # dim 2i+1
    pos_embed = pos_embed[np.newaxis, ...]
    pos_embed = tf.cast(pos_embed, dtype=tf.float32)
  else:
    pos_embed = tf.get_variable(
        name=name,
        shape=[max_len, embed_dim],
        initializer=tf.random_uniform_initializer(-0.1, 0.1))
    pos_embed = tf.expand_dims(pos_embed, 0)
  return pos_embed
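# Quick numeric check (a sketch) for the constant table: column 2i holds
# sin(pos / 10000^(2i/embed_dim)) and column 2i+1 the matching cosine, so
# position 0 comes out as [0, 1, 0, 1, ...].
pe = get_pos_embedding_matrix(max_len=50, embed_dim=8, use_const=True, name='pe')
# pe shape: [1, 50, 8]; pe[0, 0] == [0., 1., 0., 1., 0., 1., 0., 1.]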
def scaled_dot_product_attention(q, k, v, mask):
  """
  The implementation of scaled dot-product attention.
  Args:
    q: (batch_size, seq_len_q, hidden_size)
    k: (batch_size, seq_len_k, hidden_size)
    v: (batch_size, seq_len_v, hidden_size)
    mask: (batch_size, seq_len_q, seq_len_k)
  Returns:
    output: (batch_size, seq_len_q, hidden_size)
    attention_weights: (batch_size, seq_len_q, seq_len_k)
  """
  matmul_qk = tf.matmul(
      q, k, transpose_b=True)  # (batch_size, seq_len_q, seq_len_k)

  # Scale by the square root of the key dimension.
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # Mask out disallowed positions with a large negative logit.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)

  # Normalize over the key axis.
  attention_weights = tf.nn.softmax(
      scaled_attention_logits, axis=-1)  # (batch_size, seq_len_q, seq_len_k)

  # Weighted sum of the values.
  output = tf.matmul(attention_weights, v)  # (batch_size, seq_len_q, hidden_size)
  return output, attention_weights
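# Minimal self-attention call (a sketch): q = k = v, with a look-ahead mask
# that puts 1s above the diagonal; the function scales those by -1e9 so the
# softmax effectively zeroes attention to future positions.
batch, seq_len, hidden = 2, 5, 16
x = tf.random.uniform([batch, seq_len, hidden])
look_ahead = 1.0 - tf.linalg.band_part(tf.ones([seq_len, seq_len]), -1, 0)
out, weights = scaled_dot_product_attention(q=x, k=x, v=x, mask=look_ahead)
# out: (2, 5, 16); weights: (2, 5, 5), each row summing to 1.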
def accuracy(logits, labels):
  '''
  Compute classification accuracy.
  params:
    logits: [B, ..., D]
    labels: [B, ...]
  return: accuracy tensor
  '''
  with tf.name_scope('accuracy'):
    assert_rank = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
    assert_shape = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))
    with tf.control_dependencies([assert_rank, assert_shape]):
      predictions = tf.argmax(logits, axis=-1, output_type=tf.int64)
      labels = tf.cast(labels, tf.int64)
      return tf.reduce_mean(
          tf.cast(tf.equal(predictions, labels), dtype=tf.float32))
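# Usage sketch: 4 examples over 3 classes; the argmax prediction matches
# 3 of the 4 labels.
logits = tf.constant([[2., 0., 1.], [0., 3., 0.], [1., 0., 2.], [5., 1., 0.]])
labels = tf.constant([0, 1, 2, 1])
acc = accuracy(logits, labels)  # -> 0.75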
def __init__(self, config, emb_layer, vocab_size, **kwargs):
  model_config = config['model']['net']['structure']
  self.is_infer = config['model']['is_infer']
  if self.is_infer:
    self.length_penalty = model_config['length_penalty']
  self.dropout_rate = model_config['dropout_rate']
  self.num_layers = model_config['num_layers']
  self.l2_reg_lambda = model_config['l2_reg_lambda']
  self.embedding_size = model_config['embedding_size']
  self.max_enc_len = model_config['max_enc_len']
  self.max_dec_len = model_config['max_dec_len']
  self.share_embedding = model_config['share_embedding']
  self.padding_token = 0
  self.beam_size = model_config['beam_size']

  self.mask_layer = tf.keras.layers.Lambda(lambda inputs: tf.cast(
      tf.not_equal(inputs, self.padding_token), tf.int32))
  self.embed = emb_layer
  self.vocab_size = vocab_size
  self.embed_d = tf.keras.layers.Dropout(self.dropout_rate)
  self.pos_embed = PositionEmbedding(self.max_enc_len, self.embedding_size)
  self.transformer_decoders = [
      TransformerDecoderLayer(config) for _ in range(self.num_layers)
  ]
  self.final_dense = tf.keras.layers.TimeDistributed(
      tf.keras.layers.Dense(self.vocab_size, name="final_dense"))
  super().__init__(**kwargs)
def __init__(self, config, **kwargs):
  super().__init__(config, **kwargs)
  tf.logging.info("Initialize TransformerModel...")

  model_config = config['model']['net']['structure']
  self.is_infer = config['model']['is_infer']
  if self.is_infer:
    self.length_penalty = model_config['length_penalty']
  self.dropout_rate = model_config['dropout_rate']
  self.num_layers = model_config['num_layers']
  self.l2_reg_lambda = model_config['l2_reg_lambda']
  self.max_enc_len = model_config['max_enc_len']
  self.max_dec_len = model_config['max_dec_len']
  self.share_embedding = model_config['share_embedding']
  self.padding_token = utils.PAD_IDX
  self.beam_size = model_config['beam_size']

  self.mask_layer = tf.keras.layers.Lambda(lambda inputs: tf.cast(
      tf.not_equal(inputs, self.padding_token), tf.int32))
  self.embed_d = tf.keras.layers.Dropout(self.dropout_rate)
  self.pos_embed = layers.PositionEmbedding(self.max_enc_len,
                                            self.embedding_size)
  self.encoder = layers.TransformerEncoder(config)
  self.decoder = layers.TransformerDecoder(config, self.embed,
                                           self.decode_vocab_size)
  logging.info("decode_vocab_size: {}".format(self.decode_vocab_size))
  logging.info("Initialize TransformerModel done.")
def call(self, audio_data, sample_rate=None):
  """
  Calculate pitch features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_frames, 1) containing the pitch feature
           of every frame in the speech.
  """
  p = self.config
  with tf.name_scope('pitch'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      pitch = py_x_ops.pitch(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          thres_autoc=p.thres_autoc)
      pitch = tf.squeeze(pitch)
      pitch = tf.transpose(pitch[None, :])  # -> (num_frames, 1)
      return pitch
def call(self, power_spectrum, phase_spectrum, sample_rate=None):
  """
  Implement frequency domain to time domain conversion.
  :param power_spectrum: a float tensor of size (num_frames, num_frequencies).
  :param phase_spectrum: a float tensor of size (num_frames, num_frequencies).
  :param sample_rate: a scalar tensor.
  :return: audio data
  """
  p = self.config
  with tf.name_scope('synthfiltbank'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      audio_data = py_x_ops.synthfiltbank(
          power_spectrum,
          phase_spectrum,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length)
      return audio_data
def call(self, audio_data, sample_rate=None):
  """
  Calculate fbank and pitch (concatenated) features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: the sample rate of the signal we are working with.
  :return: A tensor with shape (num_frames, dim_features) containing the
           concatenated fbank and pitch features of every frame in the speech.
  """
  p = self.config
  with tf.name_scope('fbank_pitch'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
      pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
      fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)
      return fbank_pitch_feats
def grad_sparsity(self):
  # If the sparse minibatch gradient has 10 percent of its entries
  # non-zero, its sparsity is 0.1.
  # The norm of the dense gradient averaged over the full dataset is roughly
  # estimated as the sparse minibatch gradient norm * sqrt(sparsity).
  # An extension might correct only the sparse blob.
  non_zero_cnt = tf.add_n([tf.count_nonzero(g) for g in self._grads])
  all_entry_cnt = tf.add_n([tf.size(g) for g in self._grads])
  self._sparsity = tf.cast(non_zero_cnt, self._grads[0].dtype) \
      / tf.cast(all_entry_cnt, self._grads[0].dtype)
  avg_op = self._moving_averager.apply([self._sparsity])
  with tf.control_dependencies([avg_op]):
    self._sparsity_avg = self._moving_averager.average(self._sparsity)
  return avg_op
def get_pad_mask_from_token_idx(inputs, pad_idx):
  """
  Get the padding mask from input token ids.
  inputs: [batch_size, time_steps]
  mask: [batch_size, time_steps]
  """
  pad_mask = tf.cast(tf.math.greater(inputs, pad_idx), tf.int32)
  return pad_mask
def get_seg_mask_from_token_idx(inputs, seg_idx):
  """
  Get the segment mask from input token ids.
  inputs: [batch_size, time_steps]
  mask: [batch_size, time_steps]
  """
  seg_mask = tf.cast(tf.math.equal(inputs, seg_idx), tf.int32)
  return seg_mask
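# Example for the two mask helpers above, with pad_idx=0 and a hypothetical
# [SEP]-style separator id of 102:
tokens = tf.constant([[101, 7, 8, 102, 0, 0]])
pad_mask = get_pad_mask_from_token_idx(tokens, pad_idx=0)    # [[1, 1, 1, 1, 0, 0]]
seg_mask = get_seg_mask_from_token_idx(tokens, seg_idx=102)  # [[0, 0, 0, 1, 0, 0]]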
def ctc_greedy_decode_lambda_func(args):
  y_pred, input_length = args
  input_length = tf.cast(input_length, dtype=tf.int32)
  decode_result, _ = ctc_greedy_decode(
      logits=y_pred,
      sequence_length=input_length,
      merge_repeated=True,
      blank_id=None)
  return decode_result
def get_expand_pad_mask(inputs, pad_idx):
  """
  Get the padding mask from input token ids, with a trailing singleton dim.
  inputs: [batch_size, time_steps]
  mask: [batch_size, time_steps, 1]
  """
  pad_mask = tf.cast(tf.math.greater(inputs, pad_idx), tf.float32)
  pad_mask = tf.expand_dims(pad_mask, -1)
  return pad_mask
def ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0):
  '''
  CTC loss function.
  param: logits, (B, T, D)
  param: input_length, (B, 1), input length of the encoder
  param: labels, (B, T)
  param: label_length, (B, 1), label length used to convert the dense label
         to sparse
  returns: loss, a (B,) tensor of per-example CTC loss
  '''
  ilen = tf.cond(
      pred=tf.equal(tf.rank(input_length), 1),
      true_fn=lambda: input_length,
      false_fn=lambda: tf.squeeze(input_length),
  )
  ilen = tf.cast(ilen, tf.int32)

  olen = tf.cond(
      pred=tf.equal(tf.rank(label_length), 1),
      true_fn=lambda: label_length,
      false_fn=lambda: tf.squeeze(label_length))
  olen = tf.cast(olen, tf.int32)

  deps = [
      tf.assert_rank(labels, 2, name='label_rank_check'),
      tf.assert_rank(logits, 3, name='logits_rank_check'),
      tf.assert_rank(ilen, 1, name='src_len_rank_check'),  # input_length
      tf.assert_rank(olen, 1, name='tgt_len_rank_check'),  # output_length
  ]

  labels, logits = ctc_data_transform(labels, logits, blank_index)

  with tf.control_dependencies(deps):
    # (B,) per-example loss
    # blank index is consistent with Espnet, zero
    batch_loss = tf.nn.ctc_loss(
        labels=labels,
        inputs=logits,
        sequence_length=ilen,
        time_major=False,
        preprocess_collapse_repeated=False,
        ctc_merge_repeated=True,
        ignore_longer_outputs_than_inputs=False)
  return batch_loss
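# Usage sketch (assumes ctc_data_transform maps the blank to `blank_index`,
# as the signature suggests): encoder logits, dense labels, and true lengths.
logits = tf.random.uniform([2, 50, 29])                      # (B, T, D)
labels = tf.random.uniform([2, 10], minval=1, maxval=29, dtype=tf.int32)
input_length = tf.constant([[50], [48]])                     # (B, 1), squeezed inside
label_length = tf.constant([[10], [7]])
loss = tf.reduce_mean(
    ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0))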
def compute_sen_lens(inputs, padding_token=0):
  """
  Count how many words are in each sentence.
  inputs: [..., time_steps]
  sen_lens: [...]
  """
  x_binary = tf.cast(tf.not_equal(inputs, padding_token), tf.int32)
  sen_lens = tf.reduce_sum(x_binary, axis=-1)
  ones = tf.ones_like(sen_lens)
  sen_lens = tf.where(tf.equal(sen_lens, utils.PAD_IDX), x=ones, y=sen_lens)
  return sen_lens
def _freq_feat_graph(feat_name, **kwargs):
  winlen = kwargs.get('winlen')
  winstep = kwargs.get('winstep')
  feature_size = kwargs.get('feature_size')
  sr = kwargs.get('sr')  #pylint: disable=invalid-name
  nfft = kwargs.get('nfft')
  del nfft

  assert feat_name in ('fbank', 'spec')

  params = speech_ops.speech_params(
      sr=sr,
      bins=feature_size,
      add_delta_deltas=False,
      audio_frame_length=winlen,
      audio_frame_step=winstep)

  graph = None
  if feat_name == 'fbank':
    # get session
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        # fbank
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)
        del sample_rate
        fbank = speech_ops.extract_feature(waveforms, params)
        # shape must be [T, D, C]
        feat = tf.identity(fbank, name=feat_name)
  elif feat_name == 'spec':
    # magnitude spec
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)
        spec = py_x_ops.spectrum(
            waveforms[:, 0],
            tf.cast(sample_rate, tf.dtypes.float32),
            output_type=1)  #output_type: 1, power spec; 2, log power spec
        spec = tf.sqrt(spec)
        # shape must be [T, D, C]
        spec = tf.expand_dims(spec, -1)
        feat = tf.identity(spec, name=feat_name)
  else:
    raise ValueError(f"Unsupported freq feat: {feat_name}.")

  return graph, (_get_out_tensor_name('wavpath', 0),
                 _get_out_tensor_name(feat_name, 0))
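# Usage sketch: build the fbank graph once, then run it per wav file. The
# feed/fetch names come from the returned tuple; the _global_sess caching
# that normally wraps this is elided, and 'example.wav' is a placeholder path.
graph, (input_name, output_name) = _freq_feat_graph(
    'fbank', winlen=0.025, winstep=0.010, feature_size=40, sr=16000, nfft=512)
with tf.Session(graph=graph) as sess:
  feat = sess.run(output_name, feed_dict={input_name: 'example.wav'})
  # feat shape: [T, D, C]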