def pooling_layer(self, x, pooling_type=None):
  '''
  Add a pooling layer across the whole utterance.
  Input: [B, T, D]
    --> Reduce along T

  Statistics pooling output: [B, D * 2]
  Average pooling output: [B, D]
  '''
  assert_rank3 = tf.debugging.assert_rank(x, 3)
  with tf.control_dependencies([assert_rank3]):
    x = tf.identity(x)

  pooling_type = pooling_type if pooling_type else self.netconf[
      'frame_pooling_type']
  if pooling_type == 'stats':
    with tf.name_scope('stats_pooling'):
      mean, var = tf.nn.moments(x, 1)
      x = tf.concat([mean, tf.sqrt(var + 1e-6)], 1)
  elif pooling_type == 'average':
    with tf.name_scope('average_pooling'):
      mean, _ = tf.nn.moments(x, 1)
      x = mean
  else:
    raise ValueError('Unsupported frame_pooling_type: %s' % pooling_type)

  assert_rank2 = tf.debugging.assert_rank(x, 2)
  with tf.control_dependencies([assert_rank2]):
    x = tf.identity(x)
  return x
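# Hedged usage sketch (not from the original source; names and shapes are
# illustrative): statistics pooling turns a [B, T, D] sequence of frame-level
# features into a fixed [B, 2 * D] utterance vector by concatenating the
# per-dimension mean and standard deviation over time, which is exactly what
# the 'stats' branch above computes.
def _stats_pooling_demo():
  x = tf.random.normal([4, 100, 64])  # [B, T, D] frame-level features
  mean, var = tf.nn.moments(x, 1)  # reduce along T -> two [B, D] tensors
  return tf.concat([mean, tf.sqrt(var + 1e-6)], 1)  # [4, 128]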
def curvature_range(self):
  # set up the curvature window
  self._curv_win = tf.Variable(np.zeros([self._curv_win_width,]),
                               dtype=tf.float32,
                               name="curv_win",
                               trainable=False)
  # we can use log smoothing for curvature range to follow trend faster
  # self._curv_win = tf.scatter_update(
  #   self._curv_win, self._global_step % self._curv_win_width,
  #   tf.log(self._grad_norm_squared + EPS))
  self._curv_win = tf.scatter_update(
      self._curv_win, self._global_step % self._curv_win_width,
      self._grad_norm_squared + EPS)
  # note here the iterations start from iteration 0
  valid_window = tf.slice(
      self._curv_win, tf.constant([0,]),
      tf.expand_dims(tf.minimum(tf.constant(self._curv_win_width),
                                self._global_step + 1),
                     axis=0))

  if self._h_min_log_smooth:
    self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
  else:
    self._h_min_t = tf.reduce_min(valid_window)
  if self._h_max_log_smooth:
    self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
  else:
    self._h_max_t = tf.reduce_max(valid_window)

  curv_range_ops = []
  with tf.control_dependencies([self._h_min_t, self._h_max_t]):
    avg_op = self._moving_averager.apply([self._h_min_t, self._h_max_t])
    with tf.control_dependencies([avg_op]):
      if self._h_min_log_smooth:
        self._h_min = tf.exp(
            tf.identity(self._moving_averager.average(self._h_min_t)))
      else:
        self._h_min = \
            tf.identity(self._moving_averager.average(self._h_min_t))
      if self._h_max_log_smooth:
        self._h_max = tf.exp(
            tf.identity(self._moving_averager.average(self._h_max_t)))
      else:
        self._h_max = \
            tf.identity(self._moving_averager.average(self._h_max_t))
      if self._sparsity_debias:
        self._h_min = self._h_min * self._sparsity_avg
        self._h_max = self._h_max * self._sparsity_avg
  curv_range_ops.append(avg_op)
  return curv_range_ops
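# Illustrative note (an assumption about intent, not from the source): the
# window variable holds the last _curv_win_width squared gradient norms, and
# before it fills up only the first global_step + 1 slots are valid. E.g. with
# _curv_win_width = 20 at global_step = 3, tf.slice keeps entries [0, 4), so
# _h_min_t / _h_max_t are the min/max curvature estimates over those 4 values.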
def apply_gradients(self, grads_tvars, global_step=None, name=None):
  self._grads, self._tvars = zip(*[(g, t) for g, t in grads_tvars
                                   if g is not None])

  # for manual gradient clipping
  if self._clip_thresh_var is not None:
    self._grads, self._grads_norm = tf.clip_by_global_norm(
        self._grads, self._clip_thresh_var)

  # loosely adaptive clipping of gradient in case exploding gradient ruins statistics
  if self._use_adapt_grad_clip:
    thresh = tf.cond(
        self._do_tune,
        lambda: tf.sqrt(self._stat_protect_fac *
                        self._adapt_grad_clip_thresh**2),
        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
    self._grads, self._grads_norm = tf.clip_by_global_norm(
        self._grads, thresh)

  with tf.variable_scope("before_apply"):
    before_apply_op = self.before_apply()

  with tf.variable_scope("update_hyper"):
    with tf.control_dependencies([before_apply_op]):
      update_hyper_op = self.update_hyper_param()

  with tf.variable_scope("apply_updates"):
    with tf.control_dependencies([update_hyper_op]):
      # clip exploding gradient according to h_max
      if self._use_adapt_grad_clip:
        thresh = tf.cond(
            tf.greater(tf.global_norm(self._grads),
                       self._adapt_grad_clip_thresh),
            lambda: self._adapt_grad_clip_target_val,
            lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
        self._grads, self._grads_norm = tf.clip_by_global_norm(
            self._grads, thresh)
      apply_grad_op = self._optimizer.apply_gradients(
          zip(self._grads, self._tvars), global_step, name)

  with tf.control_dependencies([apply_grad_op]):
    self._increment_global_step_op = tf.assign(self._global_step,
                                               self._global_step + 1)
    self._adapt_grad_clip_thresh_op = \
        tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max))
    self._adapt_grad_clip_target_val_op = \
        tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max))
    # self._adapt_grad_clip_target_val_op = \
    #   tf.assign(self._adapt_grad_clip_target_val,
    #             tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

  return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                  self._adapt_grad_clip_thresh_op,
                  self._adapt_grad_clip_target_val_op,
                  self._increment_global_step_op)
def call(self, audio_data, sample_rate=None):
  """
  Calculate fbank && pitch (concatenated) features of a wav.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: the sample rate of the signal we are working with.
  :return: A tensor with shape (num_frames, dim_features), containing
           fbank && pitch features of every frame in speech.
  """
  p = self.config
  with tf.name_scope('fbank_pitch'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
      pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
      fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)

      return fbank_pitch_feats
def call(self, audio_data, sample_rate=None):
  """
  Calculate pitch features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_frames, 1) containing pitch features
           of every frame in speech.
  """
  p = self.config
  with tf.name_scope('pitch'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      pitch = py_x_ops.pitch(audio_data,
                             sample_rate,
                             window_length=p.window_length,
                             frame_length=p.frame_length,
                             thres_autoc=p.thres_autoc)

      pitch = tf.squeeze(pitch)
      pitch = tf.transpose(pitch[None, :])
      return pitch
def grad_variance(self):
  grad_var_ops = []
  tensor_to_avg = []
  for t, g in zip(self._tvars, self._grads):
    if isinstance(g, ops.IndexedSlices):
      tensor_to_avg.append(
          tf.reshape(tf.unsorted_segment_sum(g.values, g.indices,
                                             g.dense_shape[0]),
                     shape=t.get_shape()))
    else:
      tensor_to_avg.append(g)
  avg_op = self._moving_averager.apply(tensor_to_avg)
  grad_var_ops.append(avg_op)
  with tf.control_dependencies([avg_op]):
    self._grad_avg = [
        self._moving_averager.average(val) for val in tensor_to_avg
    ]
    self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]

  self._grad_var = tf.maximum(
      tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
      self._grad_norm_squared_avg -
      tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared]))
  if self._sparsity_debias:
    self._grad_var *= self._sparsity_avg
  return grad_var_ops
def call(self, audio_data, sample_rate=None):
  """
  Calculate power spectrum and phase spectrum of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: Two returns:
           power spectrum: a float tensor of size (num_frames, num_frequencies)
               containing the power spectrum of every frame in speech.
           phase spectrum: a float tensor of size (num_frames, num_frequencies)
               containing the phase spectrum of every frame in speech.
  """
  p = self.config
  with tf.name_scope('analyfiltbank'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      power_spectrum, phase_spectrum = py_x_ops.analyfiltbank(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length)

      return power_spectrum, phase_spectrum
def call(self, audio_data, sample_rate=None):
  """
  Calculate mfcc features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_channels, num_frames, num_frequencies)
           containing mfcc features of every frame in speech.
  """
  p = self.config
  with tf.name_scope('mfcc'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      spectrum_feats = self.spect(audio_data, sample_rate)
      spectrum_feats = tf.expand_dims(spectrum_feats, 0)
      fbank_feats = self.fbank(audio_data, sample_rate)
      mfcc = py_x_ops.mfcc(fbank_feats,
                           spectrum_feats,
                           sample_rate,
                           use_energy=p.use_energy,
                           cepstral_lifter=p.cepstral_lifter,
                           coefficient_count=p.coefficient_count)
      return mfcc
def call(self, audio_data, sample_rate=None):
  """
  Calculate cepstrum of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_frames, ceps_subband_num) containing
           normalized cepstrum (tag_ceps_mean_norm = True) or cepstrum
           (tag_ceps_mean_norm = False) of every frame in speech.
  """
  p = self.config
  with tf.name_scope('cepstrum'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      cepstrum = py_x_ops.cepstrum(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length,
                                   ceps_subband_num=p.ceps_subband_num,
                                   tag_ceps_mean_norm=p.tag_ceps_mean_norm)
      return cepstrum
def call(self, audio_data, sample_rate=None):
  """
  Calculate plp features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_frames, (plp_order + 1)) containing
           plp features of every frame in speech.
  """
  p = self.config
  with tf.name_scope('plp'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      plp = py_x_ops.plp(audio_data,
                         sample_rate,
                         window_length=p.window_length,
                         frame_length=p.frame_length,
                         plp_order=p.plp_order)
      return plp
def call(self, audio_data, sample_rate=None):
  """
  Calculate power spectrum or log power spectrum of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_frames, num_frequencies) containing
           power spectrum (output_type=1) or log power spectrum
           (output_type=2) of every frame in speech.
  """
  p = self.config
  with tf.name_scope('spectrum'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      spectrum = py_x_ops.spectrum(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length,
                                   output_type=p.output_type,
                                   snip_edges=p.snip_edges,
                                   raw_energy=p.raw_energy,
                                   preEph_coeff=p.preeph_coeff,
                                   window_type=p.window_type,
                                   remove_dc_offset=p.remove_dc_offset,
                                   is_fbank=p.is_fbank)
      return spectrum
def call(self, audio_data, sample_rate=None):
  """
  Add rir and/or noise to audio data.
  :param audio_data: the audio signal to be augmented.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size N containing the noise-added audio.
  """
  p = self.config
  with tf.name_scope('add_rir_noise_aecres'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
          audio_data,
          sample_rate,
          if_add_rir=p.if_add_rir,
          rir_filelist=p.rir_filelist,
          if_add_noise=p.if_add_noise,
          snr_min=p.snr_min,
          snr_max=p.snr_max,
          noise_filelist=p.noise_filelist,
          if_add_aecres=p.if_add_aecres,
          aecres_filelist=p.aecres_filelist)

      return tf.squeeze(add_rir_noise_aecres_out)
def call(self, audio_data, sample_rate=None):
  """
  Calculate fbank features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (num_channels, num_frames, num_frequencies)
           containing fbank features of every frame in speech.
  """
  p = self.config
  with tf.name_scope('fbank'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    if p.upper_frequency_limit <= 0:
      p.upper_frequency_limit = p.sample_rate / 2.0 + p.upper_frequency_limit
    elif (p.upper_frequency_limit <= p.lower_frequency_limit) or (
        p.upper_frequency_limit > p.sample_rate / 2.0):
      p.upper_frequency_limit = p.sample_rate / 2.0

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      spectrum = self.spect(audio_data, sample_rate)
      spectrum = tf.expand_dims(spectrum, 0)
      fbank = py_x_ops.fbank(
          spectrum,
          sample_rate,
          upper_frequency_limit=p.upper_frequency_limit,
          lower_frequency_limit=p.lower_frequency_limit,
          filterbank_channel_count=p.filterbank_channel_count)
      return fbank
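# Worked example of the frequency-limit normalization above (values
# illustrative, an assumption about intent): with sample_rate = 16000, a
# non-positive upper_frequency_limit is treated as an offset from the Nyquist
# frequency, so upper_frequency_limit = -200 becomes 8000 - 200 = 7800 Hz,
# while an out-of-range value (above Nyquist or not above the lower limit)
# falls back to the Nyquist frequency, 8000 Hz.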
def call(self, power_spectrum, phase_spectrum, sample_rate=None):
  """
  Implement frequency domain to time domain conversion.
  :param power_spectrum: a float tensor of size (num_frames, num_frequencies).
  :param phase_spectrum: a float tensor of size (num_frames, num_frequencies).
  :param sample_rate: a scalar tensor.
  :return: audio data
  """
  p = self.config
  with tf.name_scope('synthfiltbank'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      audio_data = py_x_ops.synthfiltbank(power_spectrum,
                                          phase_spectrum,
                                          sample_rate,
                                          window_length=p.window_length,
                                          frame_length=p.frame_length)
      return audio_data
def call(self, audio_data, sample_rate=None):
  """
  Calculate the zero-crossing rate of speech.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A tensor with shape (1, num_frames), containing the zero-crossing
           rate of every frame in speech.
  """
  p = self.config
  with tf.name_scope('zcr'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      zcr = py_x_ops.zcr(audio_data,
                         sample_rate,
                         window_length=p.window_length,
                         frame_length=p.frame_length)
      return zcr
def call(self, audio_data, sample_rate=None):
  """
  Calculate power of every frame in speech.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (1, num_frames) containing power of every
           frame in speech.
  """
  p = self.config
  with tf.name_scope('framepow'):

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)

    assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      framepow = py_x_ops.frame_pow(audio_data,
                                    sample_rate,
                                    window_length=p.window_length,
                                    frame_length=p.frame_length)
      return framepow
def splice(feat, left_context, right_context):
  '''
  splice frame with context
    param: feat, tf.float32, [batch, time, feat]
    return: feat, tf.float32,
            [batch, time, feat * (left_context + 1 + right_context)]
    reference:
      https://github.com/kaldi-asr/kaldi/src/feat/feature-functions.cc#L205:6
  '''

  def _loop_continue(time, end_time, context, unused_left_context,
                     right_context, unused_output_tas):
    del unused_output_tas
    del unused_left_context
    return time < end_time

  def _loop_body(time, end_time, context, left_context, right_context,
                 output_tas):
    shape = tf.shape(context)
    B, _, D = shape[0], shape[1], shape[2]
    N = (1 + left_context + right_context) * D

    new_feat = context[:, time:time + left_context + 1 + right_context, :]
    new_feat = tf.reshape(new_feat, [B, N])
    new_output_tas = output_tas.write(time, new_feat)
    return (time + 1, end_time, context, left_context, right_context,
            new_output_tas)

  with tf.control_dependencies([
      tf.assert_greater_equal(left_context, 0),
      tf.assert_greater_equal(right_context, 0)
  ]):
    T = tf.shape(feat)[1]
    output_tas = _new_tensor_array('splice_feat_ta', T, dtype=tf.float32)
    time = tf.constant(0, tf.int32)
    first = tf.tile(feat[:, 0:1, :], [1, left_context, 1])
    last = tf.tile(feat[:, -1:, :], [1, right_context, 1])
    context = tf.concat([first, feat], axis=1)
    context = tf.concat([context, last], axis=1)

    loop_vars = (time, T, context, left_context, right_context, output_tas)

    parallel_iterations = 10
    shape_invariants = tf.nest.map_structure(lambda t: tf.TensorShape(None),
                                             loop_vars)

    (time, end_time, context, left_context, right_context,
     output_tas) = tf.while_loop(_loop_continue,
                                 _loop_body,
                                 loop_vars=loop_vars,
                                 shape_invariants=shape_invariants,
                                 parallel_iterations=parallel_iterations,
                                 swap_memory=False)
    del context
    del left_context
    del right_context

  batch_spliced_feats = output_tas.stack()
  batch_spliced_feats = tf.transpose(batch_spliced_feats, [1, 0, 2])
  return batch_spliced_feats
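# Hedged usage sketch (shapes illustrative, not from the source): with
# left_context = 1 and right_context = 1 each frame is concatenated with its
# neighbors, and edge frames are padded by repeating the first/last frame, so
# the feature dimension grows by a factor of (1 + 1 + 1) = 3.
#   feats = tf.random.normal([2, 10, 40])  # [batch, time, feat]
#   spliced = splice(feats, left_context=1, right_context=1)  # [2, 10, 120]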
def before_apply(self):
  self._moving_averager = tf.train.ExponentialMovingAverage(
      decay=self._beta, zero_debias=self._zero_debias)
  assert self._grads is not None and len(self._grads) > 0
  before_apply_ops = []

  # get per var g**2 and norm**2
  self._grad_squared = []
  self._grad_norm_squared = []
  for v, g in zip(self._tvars, self._grads):
    if g is None:
      continue
    with ops.colocate_with(v):
      self._grad_squared.append(tf.square(g))
  self._grad_norm_squared = [
      tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared
  ]

  if self._sparsity_debias:
    avg_op_sparsity = self.grad_sparsity()
    before_apply_ops.append(avg_op_sparsity)

  # the following running average on squared norm of gradient is shared
  # by `grad_variance` and `dist_to_opt`
  avg_op = self._moving_averager.apply(self._grad_norm_squared)
  with tf.control_dependencies([avg_op]):
    self._grad_norm_squared_avg = [
        self._moving_averager.average(val)
        for val in self._grad_norm_squared
    ]
    self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
    self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
  before_apply_ops.append(avg_op)

  with tf.control_dependencies([avg_op]):
    curv_range_ops = self.curvature_range()
    before_apply_ops += curv_range_ops
    grad_var_ops = self.grad_variance()
    before_apply_ops += grad_var_ops
    dist_to_opt_ops = self.dist_to_opt()
    before_apply_ops += dist_to_opt_ops
  return tf.group(*before_apply_ops)
def get_train_op(self, loss, global_step=None):
  """Get the training operator."""
  apply_gradient_op = self.get_apply_gradients_op(loss, global_step)

  # model average
  self.var_avg(global_step)

  # model average after apply gradients
  with tf.control_dependencies([apply_gradient_op]):
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(*update_ops)

  utils.log_vars('moving vars', tf.moving_average_variables())
  return train_op
def dist_to_opt(self):
  dist_to_opt_ops = []
  # running average of the norm of the gradient
  self._grad_norm = tf.sqrt(self._grad_norm_squared)
  avg_op = self._moving_averager.apply([self._grad_norm,])
  dist_to_opt_ops.append(avg_op)
  with tf.control_dependencies([avg_op]):
    self._grad_norm_avg = self._moving_averager.average(self._grad_norm)
    # single iteration distance estimation
    # note that self._grad_norm_avg is per variable
    self._dist_to_opt = (self._grad_norm_avg /
                         (self._grad_norm_squared_avg + EPS))
  # running average of distance
  avg_op = self._moving_averager.apply([self._dist_to_opt])
  dist_to_opt_ops.append(avg_op)
  with tf.control_dependencies([avg_op]):
    self._dist_to_opt_avg = tf.identity(
        self._moving_averager.average(self._dist_to_opt))
    if self._sparsity_debias:
      self._dist_to_opt_avg /= (tf.sqrt(self._sparsity_avg) + EPS)
  return dist_to_opt_ops
def update_hyper_param(self):
  assign_hyper_ops = []
  self._mu = tf.identity(
      tf.cond(self._do_tune, lambda: self.get_mu_tensor(),
              lambda: self._mu_var))
  with tf.control_dependencies([self._mu]):
    self._lr = tf.identity(
        tf.cond(self._do_tune, lambda: self.get_lr_tensor(),
                lambda: self._lr_var))

  with tf.control_dependencies([self._mu, self._lr]):
    if self._use_unsmoothed_lr_mu:
      assign_hyper_ops.append(tf.assign(self._mu_var, self._mu))
      assign_hyper_ops.append(tf.assign(self._lr_var, self._lr))
    else:
      self._mu = self._beta * self._mu_var + (1 - self._beta) * self._mu
      self._lr = self._beta * self._lr_var + (1 - self._beta) * self._lr
      with tf.control_dependencies([self._mu, self._lr]):
        assign_hyper_ops.append(tf.assign(self._mu_var, self._mu))
        assign_hyper_ops.append(tf.assign(self._lr_var, self._lr))
  assign_hyper_op = tf.group(*assign_hyper_ops)
  return assign_hyper_op
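# Illustrative note (an assumption about intent, not from the source): unless
# _use_unsmoothed_lr_mu is set, the tuned hyperparameters are exponentially
# smoothed before being assigned. E.g. with beta = 0.9, lr_var = 0.10 and a
# freshly tuned lr = 0.20, the assigned value is 0.9 * 0.10 + 0.1 * 0.20 = 0.11.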
def accuracy(logits, labels):
  '''
  compute accuracy
  params:
    logits: [B, ..., D]
    labels: [B, ...]
  return:
    accuracy tensor
  '''
  with tf.name_scope('accuracy'):
    assert_rank = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
    assert_shape = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))
    with tf.control_dependencies([assert_rank, assert_shape]):
      predictions = tf.argmax(logits, axis=-1, output_type=tf.int64)
      labels = tf.cast(labels, tf.int64)
      return tf.reduce_mean(
          tf.cast(tf.equal(predictions, labels), dtype=tf.float32))
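# Hedged usage sketch (values illustrative): argmax over the last axis picks a
# predicted class per position; below, one of the two predictions matches its
# label, so the returned tensor evaluates to 0.5.
#   logits = tf.constant([[0.1, 0.8, 0.1], [0.7, 0.2, 0.1]])  # [B, D]
#   labels = tf.constant([1, 2])                              # [B]
#   acc = accuracy(logits, labels)  # -> 0.5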
def call(self, wavfile):
  """
  Get audio data and sample rate from a wavfile.
  :param wavfile: filepath of wav
  :return: 2 values. The first is a Tensor of audio data.
           The second return value is the sample rate of the input wav
           file, which is a tensor with float dtype.
  """
  p = self.config
  contents = tf.io.read_file(wavfile)
  audio_data, sample_rate = tf.audio.decode_wav(
      contents, desired_channels=p.audio_channels)
  assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                              tf.cast(sample_rate, dtype=float))
  with tf.control_dependencies([assert_op]):
    return tf.squeeze(audio_data, axis=-1), tf.cast(sample_rate,
                                                    dtype=float)
def grad_sparsity(self):
  # If the sparse minibatch gradient has 10 percent of its entries
  # non-zero, its sparsity is 0.1.
  # The norm of the dense gradient averaged over the full dataset
  # is roughly estimated as the norm of the minibatch
  # sparse gradient norm * sqrt(sparsity).
  # An extension may be to only correct the sparse blob.
  non_zero_cnt = tf.add_n([tf.count_nonzero(g) for g in self._grads])
  all_entry_cnt = tf.add_n([tf.size(g) for g in self._grads])
  self._sparsity = tf.cast(non_zero_cnt, self._grads[0].dtype) \
      / tf.cast(all_entry_cnt, self._grads[0].dtype)
  avg_op = self._moving_averager.apply([self._sparsity,])
  with tf.control_dependencies([avg_op]):
    self._sparsity_avg = self._moving_averager.average(self._sparsity)
  return avg_op
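# Worked example of the estimate above (values illustrative): if a minibatch
# gradient has 1000 entries of which 100 are non-zero, its sparsity is 0.1,
# and a dense-gradient norm of 5.0 corresponds to an estimated sparse
# minibatch norm of about 5.0 * sqrt(0.1) ≈ 1.58.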
def ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0):
  '''
  ctc loss function
  param: logits, (B, T, D)
  param: input_length, (B, 1), input length of encoder
  param: labels, (B, T)
  param: label_length, (B, 1), label length for converting dense labels to sparse
  returns: loss, scalar
  '''
  ilen = tf.cond(
      pred=tf.equal(tf.rank(input_length), 1),
      true_fn=lambda: input_length,
      false_fn=lambda: tf.squeeze(input_length),
  )
  ilen = tf.cast(ilen, tf.int32)

  olen = tf.cond(pred=tf.equal(tf.rank(label_length), 1),
                 true_fn=lambda: label_length,
                 false_fn=lambda: tf.squeeze(label_length))
  olen = tf.cast(olen, tf.int32)

  deps = [
      tf.assert_rank(labels, 2, name='label_rank_check'),
      tf.assert_rank(logits, 3, name='logits_rank_check'),
      tf.assert_rank(ilen, 1, name='src_len_rank_check'),  # input_length
      tf.assert_rank(olen, 1, name='tgt_len_rank_check'),  # output_length
  ]

  labels, logits = ctc_data_transform(labels, logits, blank_index)

  with tf.control_dependencies(deps):
    # (B, 1)
    # blank index is consistent with Espnet, zero
    batch_loss = tf.nn.ctc_loss(labels=labels,
                                inputs=logits,
                                sequence_length=ilen,
                                time_major=False,
                                preprocess_collapse_repeated=False,
                                ctc_merge_repeated=True,
                                ignore_longer_outputs_than_inputs=False)
  return batch_loss
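# Hedged usage sketch (shapes illustrative; ctc_data_transform is assumed to
# convert the dense labels into the SparseTensor form tf.nn.ctc_loss expects
# and to handle the blank index):
#   logits = tf.random.normal([8, 50, 30])  # (B, T, D) encoder outputs
#   labels = tf.random.uniform([8, 20], minval=1, maxval=30, dtype=tf.int32)
#   input_length = tf.fill([8, 1], 50)
#   label_length = tf.fill([8, 1], 20)
#   loss = tf.reduce_mean(ctc_lambda_loss(logits, labels,
#                                         input_length, label_length))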
def get_train_op(self, loss, multitask, global_step=None):
  """Get the training operator."""
  # quantize training
  quantconf = self.config['solver']['quantization']
  quantization = quantconf['enable']
  if quantization:
    quant_delay = quantconf['quant_delay']
    logging.info('Quantization training with {} delay'.format(quant_delay))
    tf.contrib.quantize.create_training_graph(quant_delay=quant_delay)

  apply_gradient_op = self.get_apply_gradients_op(loss, multitask,
                                                  global_step)

  # model average
  self.var_avg(global_step)

  # model average after apply gradients
  with tf.control_dependencies([apply_gradient_op]):
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(*update_ops)

  utils.log_vars('moving vars', tf.moving_average_variables())
  return train_op
def call(self, filename, audio_data, sample_rate=None):
  """
  Write a wav file using audio_data [tensor].
  :param filename: filepath of wav.
  :param audio_data: a tensor containing data of a wav.
  :param sample_rate: [option] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: write wav operation.
  """
  p = self.config
  filename = tf.constant(filename)

  if sample_rate is None:
    sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

  assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                              tf.cast(sample_rate, dtype=tf.int32))
  with tf.control_dependencies([assert_op]):
    audio_data = tf.cast(audio_data, dtype=tf.float32)
    contents = tf.audio.encode_wav(tf.expand_dims(audio_data, 1),
                                   tf.cast(sample_rate, dtype=tf.int32))
    w = tf.io.write_file(filename, contents)
  return w