def _create_topk_unique(inputs, k):
  """Creates the top k values in sorted order with indices."""
  height = inputs.shape[0]
  width = inputs.shape[1]
  neg_inf_r0 = tf.constant(-np.inf, dtype=tf.float32)
  ones = tf.ones([height, width], dtype=tf.float32)
  neg_inf_r2 = ones * neg_inf_r0
  inputs = tf.where(tf.is_nan(inputs), neg_inf_r2, inputs)

  # Select the current largest value k times and keep it in topk_r2. Values
  # already selected are masked to -inf so they are not selected again.
  tmp = inputs
  topk_r2 = tf.zeros([height, k], dtype=tf.float32)
  for i in range(k):
    kth_order_statistic = tf.reduce_max(tmp, axis=1, keepdims=True)
    k_mask = tf.tile(
        tf.expand_dims(tf.equal(tf.range(k), tf.fill([k], i)), 0), [height, 1])
    topk_r2 = tf.where(k_mask, tf.tile(kth_order_statistic, [1, k]), topk_r2)
    ge_r2 = tf.greater_equal(inputs, tf.tile(kth_order_statistic, [1, width]))
    tmp = tf.where(ge_r2, neg_inf_r2, inputs)

  # Recover the column index packed into the low bits of each selected value.
  log2_ceiling = int(math.ceil(math.log(float(int(width)), 2)))
  next_power_of_two = 1 << log2_ceiling
  count_mask = next_power_of_two - 1
  mask_r0 = tf.constant(count_mask)
  mask_r2 = tf.fill([height, k], mask_r0)
  topk_r2_s32 = tf.bitcast(topk_r2, tf.int32)
  topk_indices_r2 = tf.bitwise.bitwise_and(topk_r2_s32, mask_r2)
  return topk_r2, topk_indices_r2
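# Illustrative sketch (not part of the function above): the final bitwise mask
# only recovers a column index if each float already carries its column index
# in its lowest ceil(log2(width)) mantissa bits. In the TopK-on-TPU trick this
# packing is done by a companion "make unique" step, which is assumed here.
# A numpy-only demonstration of the pack/unpack idea:
import numpy as np

width = 8                                      # row width -> 3 low index bits
mask = (1 << int(np.ceil(np.log2(width)))) - 1

value = np.array([0.5], dtype=np.float32)
bits = value.view(np.int32)                    # reinterpret the raw float bits
packed = (bits & ~np.int32(mask)) | np.int32(5)  # pack column index 5
as_float = packed.view(np.float32)             # what the top-k scan sees

print(int(as_float.view(np.int32)[0] & mask))  # -> 5, recovered column index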
def test_labels_blankid_to_last(self):
  ''' unit test case for the labels_blankid_to_last interface '''
  with self.cached_session():
    with self.assertRaises(AssertionError) as assert_err:
      labels = ctc_utils.labels_blankid_to_last(
          labels=self.labels, blank_index=0, num_class=None)
    the_exception = assert_err.exception
    self.assertEqual(str(the_exception), 'The num_class should not be None!')

    labels = ctc_utils.labels_blankid_to_last(
        labels=tf.constant(self.labels), blank_index=0, num_class=6)
    labels_values = np.asarray([0, 0, 0, 2, 0, 0, 0])
    labels_index = np.asarray([[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1],
                               [1, 2]])
    labels_shape = np.asarray([2, 4])
    self.assertAllEqual(labels.eval().values, labels_values)
    self.assertAllEqual(labels.eval().indices, labels_index)
    self.assertAllEqual(labels.eval().dense_shape, labels_shape)

    labels = ctc_utils.labels_blankid_to_last(
        labels=tf.constant(self.labels), blank_index=2, num_class=6)
    labels_values = np.asarray([1, 1, 1, 2, 1, 1, 1])
    labels_index = np.asarray([[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1],
                               [1, 2]])
    labels_shape = np.asarray([2, 4])
    self.assertAllEqual(labels.eval().values, labels_values)
    self.assertAllEqual(labels.eval().indices, labels_index)
    self.assertAllEqual(labels.eval().dense_shape, labels_shape)
def call(self, audio_data, sample_rate=None):
  """
  Calculate fbank and pitch (concatenated) features of a wav.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: the sample rate of the signal we are working with.
  :return: A tensor with shape (num_frames, dim_features), containing the
           fbank and pitch features of every frame in speech.
  """
  p = self.config
  with tf.name_scope('fbank_pitch'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
      pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
      fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)

      return fbank_pitch_feats
def call(self, audio_data, sample_rate=None):
  """
  Calculate pitch features of audio data.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: A float tensor of size (1, num_frames) containing pitch features
           of every frame in speech.
  """
  p = self.config
  with tf.name_scope('pitch'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      pitch = py_x_ops.pitch(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          thres_autoc=p.thres_autoc)

      pitch = tf.squeeze(pitch)
      pitch = tf.transpose(pitch[None, :])
      return pitch
def call(self, power_spectrum, phase_spectrum, sample_rate=None):
  """
  Implement frequency domain to time domain conversion.
  :param power_spectrum: a float tensor of size (num_frames, num_frequencies).
  :param phase_spectrum: a float tensor of size (num_frames, num_frequencies).
  :param sample_rate: a scalar tensor.
  :return: audio data
  """
  p = self.config
  with tf.name_scope('synthfiltbank'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      audio_data = py_x_ops.synthfiltbank(
          power_spectrum,
          phase_spectrum,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length)

      return audio_data
def call(self, audio_data, sample_rate=None):
  """
  Calculate fbank features of audio data.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: A float tensor of size (num_channels, num_frames, num_frequencies)
           containing fbank features of every frame in speech.
  """
  p = self.config
  with tf.name_scope('fbank'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    # A non-positive upper limit is treated as an offset below the Nyquist
    # frequency; an invalid one falls back to the Nyquist frequency.
    if p.upper_frequency_limit <= 0:
      p.upper_frequency_limit = p.sample_rate / 2.0 + p.upper_frequency_limit
    elif (p.upper_frequency_limit <= p.lower_frequency_limit) or (
        p.upper_frequency_limit > p.sample_rate / 2.0):
      p.upper_frequency_limit = p.sample_rate / 2.0

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      spectrum = self.spect(audio_data, sample_rate)
      spectrum = tf.expand_dims(spectrum, 0)
      fbank = py_x_ops.fbank(
          spectrum,
          sample_rate,
          upper_frequency_limit=p.upper_frequency_limit,
          lower_frequency_limit=p.lower_frequency_limit,
          filterbank_channel_count=p.filterbank_channel_count)

      return fbank
def call(self, audio_data, sample_rate=None):
  """
  Calculate power spectrum or log power spectrum of audio data.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: A float tensor of size (num_frames, num_frequencies) containing the
           power spectrum (output_type=1) or log power spectrum (output_type=2)
           of every frame in speech.
  """
  p = self.config
  with tf.name_scope('spectrum'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      spectrum = py_x_ops.spectrum(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          output_type=p.output_type,
          snip_edges=p.snip_edges,
          raw_energy=p.raw_energy,
          preEph_coeff=p.preeph_coeff,
          window_type=p.window_type,
          remove_dc_offset=p.remove_dc_offset,
          is_fbank=p.is_fbank)

      return spectrum
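# A minimal usage sketch for the Spectrum op above, following the pattern used
# in compute_spectrum() further below. The config values and the random 1-D
# audio input are illustrative assumptions; parameters not listed are assumed
# to fall back to the op's defaults.
import numpy as np
import tensorflow as tf

config = {
    'sample_rate': 16000,
    'output_type': 2,          # 1: power spectrum, 2: log power spectrum
    'window_length': 0.025,    # assumed to be in seconds
    'frame_length': 0.010,     # assumed to be in seconds
}
spectrum_op = Spectrum.params(config).instantiate()

audio_data = tf.constant(np.random.randn(16000).astype(np.float32))
feats = spectrum_op(audio_data, 16000)   # (num_frames, num_frequencies)
with tf.compat.v1.Session() as sess:
  print(sess.run(feats).shape)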
def call(self, audio_data, sample_rate=None):
  """
  Calculate power spectrum and phase spectrum of audio data.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: Two returns:
           power spectrum: a float tensor of size (num_frames, num_frequencies)
           containing the power spectrum of every frame in speech.
           phase spectrum: a float tensor of size (num_frames, num_frequencies)
           containing the phase spectrum of every frame in speech.
  """
  p = self.config
  with tf.name_scope('analyfiltbank'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      power_spectrum, phase_spectrum = py_x_ops.analyfiltbank(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length)

      return power_spectrum, phase_spectrum
def call(self, audio_data, sample_rate=None):
  """
  Calculate the zero-crossing rate of speech.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: A tensor with shape (1, num_frames), containing the zero-crossing
           rate of every frame in speech.
  """
  p = self.config
  with tf.name_scope('zcr'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      zcr = py_x_ops.zcr(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length)

      return zcr
def test_labels_last_to_blankid(self):
  ''' unit test case for the labels_last_to_blankid interface '''
  with self.cached_session():
    labels = ctc_utils.labels_last_to_blankid(
        labels=tf.constant(self.labels), blank_index=0, num_class=None)
    labels_values = np.asarray([2, 2, 2, 4, 2, 2, 2])
    labels_index = np.asarray([[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1],
                               [1, 2]])
    labels_shape = np.asarray([2, 4])
    self.assertAllEqual(labels.eval().values, labels_values)
    self.assertAllEqual(labels.eval().indices, labels_index)
    self.assertAllEqual(labels.eval().dense_shape, labels_shape)

    labels = ctc_utils.labels_last_to_blankid(
        labels=tf.constant(self.labels), blank_index=2, num_class=None)
    labels_values = np.asarray([1, 1, 1, 4, 1, 1, 1])
    labels_index = np.asarray([[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1],
                               [1, 2]])
    labels_shape = np.asarray([2, 4])
    self.assertAllEqual(labels.eval().values, labels_values)
    self.assertAllEqual(labels.eval().indices, labels_index)
    self.assertAllEqual(labels.eval().dense_shape, labels_shape)
def generate_synthetic_data(input_shape,
                            input_value=0,
                            input_dtype=None,
                            label_shape=None,
                            label_value=0,
                            label_dtype=None,
                            nepoch=None):
  """Create a repeating dataset with constant values.

  Args:
    input_shape: a tf.TensorShape object or nested tf.TensorShapes. The shape
      of the input data.
    input_value: Value of each input element.
    input_dtype: Input dtype. If None, will be inferred from the input value.
    label_shape: a tf.TensorShape object or nested tf.TensorShapes. The shape
      of the label data.
    label_value: Value of each label element.
    label_dtype: Label dtype. If None, will be inferred from the label value.
    nepoch: number of epochs. If None, will repeat forever.

  Returns:
    Dataset of tensors or tuples of tensors (if label_shape is set).
  """
  # TODO(kathywu): Replace with SyntheticDataset once it is in contrib.
  element = input_element = nest.map_structure(
      lambda s: tf.constant(input_value, input_dtype, s), input_shape)

  if label_shape:
    label_element = nest.map_structure(
        lambda s: tf.constant(label_value, label_dtype, s), label_shape)
    element = (input_element, label_element)

  return tf.data.Dataset.from_tensors(element).repeat(nepoch)
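# Minimal usage sketch for generate_synthetic_data (shapes and values are
# illustrative assumptions): a dataset that yields the same constant
# (features, labels) batch, repeated for two "epochs".
import tensorflow as tf

dataset = generate_synthetic_data(
    input_shape=tf.TensorShape([32, 100]),
    input_value=1.0,
    input_dtype=tf.float32,
    label_shape=tf.TensorShape([32]),
    label_value=0,
    label_dtype=tf.int32,
    nepoch=2)

features, labels = dataset.make_one_shot_iterator().get_next()  # TF1-style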
def call(self, audio_data, sample_rate=None):
  """
  Add rir/noise to audio data.
  :param audio_data: the audio signal to which to add rir/noise.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: A float tensor of size N containing the audio with noise added.
  """
  p = self.config
  with tf.name_scope('add_rir_noise_aecres'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
          audio_data,
          sample_rate,
          if_add_rir=p.if_add_rir,
          rir_filelist=p.rir_filelist,
          if_add_noise=p.if_add_noise,
          snr_min=p.snr_min,
          snr_max=p.snr_max,
          noise_filelist=p.noise_filelist,
          if_add_aecres=p.if_add_aecres,
          aecres_filelist=p.aecres_filelist)

      return tf.squeeze(add_rir_noise_aecres_out)
def call(self, audio_data, sample_rate=None):
  """
  Calculate mfcc features of audio data.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: A float tensor of size (num_channels, num_frames, num_frequencies)
           containing mfcc features of every frame in speech.
  """
  p = self.config
  with tf.name_scope('mfcc'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      spectrum_feats = self.spect(audio_data, sample_rate)
      spectrum_feats = tf.expand_dims(spectrum_feats, 0)
      fbank_feats = self.fbank(audio_data, sample_rate)
      mfcc = py_x_ops.mfcc(
          fbank_feats,
          spectrum_feats,
          sample_rate,
          use_energy=p.use_energy,
          cepstral_lifter=p.cepstral_lifter,
          coefficient_count=p.coefficient_count)

      return mfcc
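# A minimal usage sketch for the Mfcc op above, following the pattern used in
# compute_mfcc() further below. The config values and the random 1-D audio
# input are illustrative assumptions; parameters not listed are assumed to
# fall back to the op's defaults.
import numpy as np
import tensorflow as tf

config = {
    'sample_rate': 16000,
    'coefficient_count': 13,   # assumed value
    'cepstral_lifter': 22,     # assumed value
}
mfcc_op = Mfcc.params(config).instantiate()

audio_data = tf.constant(np.random.randn(16000).astype(np.float32))
mfcc_feats = tf.squeeze(mfcc_op(audio_data, 16000))
with tf.compat.v1.Session() as sess:
  print(sess.run(mfcc_feats).shape)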
def call(self, audio_data, sample_rate=None):
  """
  Calculate the power of every frame in speech.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: A float tensor of size (1, num_frames) containing the power of
           every frame in speech.
  """
  p = self.config
  with tf.name_scope('framepow'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      framepow = py_x_ops.frame_pow(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length)

      return framepow
def call(self, audio_data, sample_rate=None):
  """
  Calculate cepstrum of audio data.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: A float tensor of size (num_frames, ceps_subband_num) containing
           the normalized cepstrum (tag_ceps_mean_norm = True) or cepstrum
           (tag_ceps_mean_norm = False) of every frame in speech.
  """
  p = self.config
  with tf.name_scope('cepstrum'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      cepstrum = py_x_ops.cepstrum(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          ceps_subband_num=p.ceps_subband_num,
          tag_ceps_mean_norm=p.tag_ceps_mean_norm)

      return cepstrum
def call(self, audio_data, sample_rate=None):
  """
  Calculate plp features of audio data.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with, default is 16kHz.
  :return: A float tensor of size (num_frames, (plp_order + 1)) containing
           plp features of every frame in speech.
  """
  p = self.config
  with tf.name_scope('plp'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      plp = py_x_ops.plp(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          plp_order=p.plp_order)

      return plp
def get_learning_rate(self):
  """Get the learning rate."""
  lrconf = self.config['solver']['optimizer']['learning_rate']
  learning_rate = lrconf['rate']
  learning_type = lrconf['type']

  #pylint: disable=invalid-name
  if learning_type == 'exp_decay':
    lr = tf.train.exponential_decay(
        learning_rate,
        tf.train.get_or_create_global_step(),
        lrconf['decay_steps'],
        lrconf['decay_rate'],
        staircase=True)
  elif learning_type == 'piecewise':
    #boundaries = [15000, 30000]
    #values = [1e-3, 1e-4, 1e-5]
    boundaries = lrconf['boundaries']
    values = lrconf['values']
    assert len(values) == len(
        boundaries) + 1, 'values len must equal boundaries len plus one'
    lr = tf.train.piecewise_constant(
        tf.train.get_or_create_global_step(),
        boundaries=boundaries,
        values=values)
  elif learning_type == 'warmup':
    learning_rate = tf.constant(
        value=learning_rate, shape=[], dtype=tf.float32)
    global_step = tf.train.get_or_create_global_step()
    data_size = self.config['data']['train_data_size']
    num_epochs = self.config["data"]["task"]['epochs']
    batch_size = self.config["data"]["task"]['batch_size']
    num_batch = int(math.ceil(data_size * num_epochs / batch_size))
    # Linearly decay the learning rate to zero over all training batches.
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        num_batch,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(lrconf['num_warmup_steps'], dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = learning_rate * warmup_percent_done

    # Ramp up linearly during warmup, then follow the decayed schedule.
    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    lr = ((1.0 - is_warmup) * learning_rate +
          is_warmup * warmup_learning_rate)
  elif learning_type == 'const':
    lr = learning_rate
  else:
    raise ValueError(
        "Unsupported learning rate type: {}".format(learning_type))
  tf.summary.scalar('lr', lr)
  return lr
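# A plain-Python illustration (not library code) of the 'warmup' branch above:
# the base rate is decayed linearly to zero over num_batch steps
# (polynomial_decay with power=1.0), and during the first num_warmup_steps the
# decayed rate is additionally scaled by step / num_warmup_steps.
def warmup_lr(step, base_lr, num_batch, num_warmup_steps):
  decayed = base_lr * max(0.0, 1.0 - float(step) / num_batch)
  if step < num_warmup_steps:
    return decayed * (float(step) / num_warmup_steps)
  return decayed

# e.g. warmup_lr(50, 1e-3, num_batch=1000, num_warmup_steps=100) -> 4.75e-04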
def begin(self):
  self._global_step_tensor = tf.train.get_or_create_global_step()
  if self._global_step_tensor is None:
    raise RuntimeError("Global step should be created to use StopAtStepHook.")
  self._epoch_tensor = (self._global_step_tensor * tf.constant(
      self._num_examples_per_epoch)) / tf.constant(self._global_batch_size)
def curvature_range(self):
  # set up the curvature window
  self._curv_win = tf.Variable(
      np.zeros([self._curv_win_width,]),
      dtype=tf.float32,
      name="curv_win",
      trainable=False)
  # we can use log smoothing for curvature range to follow trend faster
  # self._curv_win = tf.scatter_update(
  #   self._curv_win, self._global_step % self._curv_win_width,
  #   tf.log(self._grad_norm_squared + EPS))
  self._curv_win = tf.scatter_update(
      self._curv_win, self._global_step % self._curv_win_width,
      self._grad_norm_squared + EPS)
  # note here the iterations start from iteration 0
  valid_window = tf.slice(
      self._curv_win, tf.constant([0,]),
      tf.expand_dims(
          tf.minimum(
              tf.constant(self._curv_win_width), self._global_step + 1),
          dim=0))

  if self._h_min_log_smooth:
    self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
  else:
    self._h_min_t = tf.reduce_min(valid_window)
  if self._h_max_log_smooth:
    self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
  else:
    self._h_max_t = tf.reduce_max(valid_window)

  curv_range_ops = []
  with tf.control_dependencies([self._h_min_t, self._h_max_t]):
    avg_op = self._moving_averager.apply([self._h_min_t, self._h_max_t])
    with tf.control_dependencies([avg_op]):
      if self._h_min_log_smooth:
        self._h_min = tf.exp(
            tf.identity(self._moving_averager.average(self._h_min_t)))
      else:
        self._h_min = \
          tf.identity(self._moving_averager.average(self._h_min_t))
      if self._h_max_log_smooth:
        self._h_max = tf.exp(
            tf.identity(self._moving_averager.average(self._h_max_t)))
      else:
        self._h_max = \
          tf.identity(self._moving_averager.average(self._h_max_t))
    if self._sparsity_debias:
      self._h_min = self._h_min * self._sparsity_avg
      self._h_max = self._h_max * self._sparsity_avg
  curv_range_ops.append(avg_op)
  return curv_range_ops
def apply_gradients(self, grads_tvars, global_step=None, name=None):
  self._grads, self._tvars = zip(
      *[(g, t) for g, t in grads_tvars if g is not None])

  # for manual gradient clipping
  if self._clip_thresh_var is not None:
    self._grads, self._grads_norm = tf.clip_by_global_norm(
        self._grads, self._clip_thresh_var)

  # loosely adaptive clipping of gradient in case exploding gradient ruins statistics
  if self._use_adapt_grad_clip:
    thresh = tf.cond(
        self._do_tune,
        lambda: tf.sqrt(self._stat_protect_fac * self._adapt_grad_clip_thresh**2),
        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
    self._grads, self._grads_norm = tf.clip_by_global_norm(
        self._grads, thresh)

  with tf.variable_scope("before_apply"):
    before_apply_op = self.before_apply()

  with tf.variable_scope("update_hyper"):
    with tf.control_dependencies([before_apply_op]):
      update_hyper_op = self.update_hyper_param()

  with tf.variable_scope("apply_updates"):
    with tf.control_dependencies([update_hyper_op]):
      # clip exploding gradient according to h_max
      if self._use_adapt_grad_clip:
        thresh = tf.cond(
            tf.greater(
                tf.global_norm(self._grads), self._adapt_grad_clip_thresh),
            lambda: self._adapt_grad_clip_target_val,
            lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
        self._grads, self._grads_norm = tf.clip_by_global_norm(
            self._grads, thresh)
      apply_grad_op = self._optimizer.apply_gradients(
          zip(self._grads, self._tvars), global_step, name)

  with tf.control_dependencies([apply_grad_op]):
    self._increment_global_step_op = tf.assign(self._global_step,
                                               self._global_step + 1)

    self._adapt_grad_clip_thresh_op = \
      tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max))
    self._adapt_grad_clip_target_val_op = \
      tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max))
    # self._adapt_grad_clip_target_val_op = \
    #   tf.assign(self._adapt_grad_clip_target_val,
    #             tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

  return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                  self._adapt_grad_clip_thresh_op,
                  self._adapt_grad_clip_target_val_op,
                  self._increment_global_step_op)
def on_epoch_end(self, epoch, logs={}):
  '''computing token error'''
  cur_session = K.get_session()
  target_seq_list, predict_seq_list = [], []

  is_py_sequence = True
  if isinstance(self.eval_ds, (dataset_ops.DatasetV2, dataset_ops.DatasetV1)):
    eval_gen = self.eval_ds.make_one_shot_iterator()
    self.next_batch_gen = eval_gen.get_next()[0]
    is_py_sequence = False
  elif isinstance(self.eval_ds,
                  (iterator_ops.IteratorV2, iterator_ops.Iterator)):
    self.next_batch_gen = self.ds.get_next()[0]
    is_py_sequence = False

  for index in range(len(self.eval_task)):
    batch_data = None
    if is_py_sequence:
      batch_data = self.eval_ds[index][0]
    else:
      batch_data = cur_session.run(self.next_batch_gen)
    batch_input = batch_data['inputs']
    batch_target = batch_data['targets'].tolist()
    batch_predict = self.func(batch_input)[0]

    if self.decoder_type == 'argmax':
      predict_seq_list += py_ctc.ctc_greedy_decode(
          batch_predict, 0, unique=True)
    else:
      sequence_lens = [len(pre_sequence) for pre_sequence in batch_predict]
      batch_decoder, _ = tf_ctc.ctc_beam_search_decode(
          tf.constant(batch_predict),
          tf.constant(sequence_lens),
          beam_width=3,
          top_paths=3)
      predict_seq_list += cur_session.run(batch_decoder)[0].tolist()
    target_seq_list += batch_target

  val_token_errors = metrics_lib.token_error(
      predict_seq_list=predict_seq_list,
      target_seq_list=target_seq_list,
      eos_id=0)
  logs['val_token_err'] = val_token_errors

  if 'val_loss' in logs:
    logging.info("Epoch {}: on eval, val_loss is {}.".format(
        epoch + 1, logs['val_loss']))
  logging.info("Epoch {}: on eval, token_err is {}.".format(
      epoch + 1, val_token_errors))
  logging.info("Epoch {}: loss on train is {}".format(epoch + 1, logs['loss']))
def test_ctc_beam_search_decode(self):
  ''' ctc tensorflow beam search unittest'''
  with self.cached_session():
    decode_result, _ = tf_ctc.ctc_beam_search_decode(
        tf.constant(self.logits),
        tf.constant(self.sequence_lens),
        beam_width=1,
        top_paths=1)
    self.assertAllEqual(decode_result[0].eval(), [[1], [1]])
def test_logits_blankid_to_last(self):
  ''' unit test case for the logits_blankid_to_last interface '''
  with self.cached_session():
    with self.assertRaises(ValueError) as valueErr:
      logits = ctc_utils.logits_blankid_to_last(
          logits=tf.constant(self.logits), blank_index=10)
    the_exception = valueErr.exception
    self.assertEqual(
        str(the_exception),
        'blank_index must be less than or equal to num_class - 1')

    logits = ctc_utils.logits_blankid_to_last(
        logits=tf.constant(self.logits), blank_index=0)
    logits_transform = np.asarray(
        [[[0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553, 0.633766],
          [0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436, 0.111121],
          [0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688, 0.0357786],
          [0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533, 0.0663296],
          [0.196634, 0.123377, 0.50648837, 0.00903441, 0.00623107, 0.158235]],
         [[0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508, 0.30176],
          [0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549, 0.24082],
          [0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456, 0.230246],
          [0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345, 0.280884],
          [0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046, 0.423286]]],
        dtype=np.float32)

    # compare against the expected transformed logits
    self.assertAllClose(logits.eval(), logits_transform)
def pad_tail_tensor():
  if dim == 1:
    shape = tf.constant([1, 2])
    indices = tf.constant([[0, 1]])
  else:
    shape = tf.constant([2, 2])
    indices = tf.constant([[1, 1]])
  updates = [new_length - cur_length]
  paddings = tf.scatter_nd(indices, updates, shape)
  new_t = tf.pad(
      origin_t, paddings, "CONSTANT", constant_values=padding_token)
  return new_t
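# A hedged, self-contained sketch of the padding idea in pad_tail_tensor above
# (values are illustrative, and the closure variables dim / new_length /
# cur_length / origin_t / padding_token are replaced by constants):
# scatter_nd builds a paddings matrix whose only non-zero entry is the amount
# of tail padding, and tf.pad appends that many padding tokens.
import tensorflow as tf

origin_t = tf.constant([3, 5, 7])                  # cur_length = 3, dim = 1
new_length, padding_token = 6, 0
paddings = tf.scatter_nd(
    indices=tf.constant([[0, 1]]),
    updates=tf.constant([new_length - 3]),
    shape=tf.constant([1, 2]))                     # -> [[0, 3]]
padded = tf.pad(origin_t, paddings, "CONSTANT", constant_values=padding_token)
# padded evaluates to [3, 5, 7, 0, 0, 0]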
def test_crf_loss(self):
  ''' test crf loss '''
  with self.cached_session():
    loss_true = np.float32(5.5096426)
    logits = np.asarray(
        [[[0.3, 0.4, 0.3], [0.1, 0.9, 0.0], [0.2, 0.7, 0.1], [0.3, 0.2, 0.5],
          [0.6, 0.2, 0.2]]],
        dtype=np.float32)  # [1, 5, 3]
    trans_params = tf.fill([3, 3], 0.5, name='trans_params')
    labels = np.asarray([[0, 1, 2, 0, 1]], dtype=np.int32)  # shape=[1, 5]
    sequence_lengths = np.asarray([5], dtype=np.int32)  # shape=[1,]
    loss, _ = loss_utils.crf_log_likelihood(
        tf.constant(logits), tf.constant(labels),
        tf.constant(sequence_lengths), trans_params)
    self.assertEqual(loss.eval(), loss_true)
def get_lr_tensor(self):
  # Learning rate from the tuned momentum and the curvature estimate, capped
  # so it ramps up over roughly the first 10 * curv_win_width steps.
  lr = (1.0 - tf.sqrt(self._mu))**2 / (self._h_min + EPS)
  lr = tf.minimum(
      lr, lr * (tf.to_float(self._global_step) + 1.0) / 10.0 /
      tf.to_float(tf.constant(self._curv_win_width)))
  return lr
def grad_variance(self):
  grad_var_ops = []
  tensor_to_avg = []
  for t, g in zip(self._tvars, self._grads):
    if isinstance(g, ops.IndexedSlices):
      tensor_to_avg.append(
          tf.reshape(
              tf.unsorted_segment_sum(g.values, g.indices, g.dense_shape[0]),
              shape=t.get_shape()))
    else:
      tensor_to_avg.append(g)
  avg_op = self._moving_averager.apply(tensor_to_avg)
  grad_var_ops.append(avg_op)
  with tf.control_dependencies([avg_op]):
    self._grad_avg = [
        self._moving_averager.average(val) for val in tensor_to_avg
    ]
    self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]
    self._grad_var = tf.maximum(
        tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
        self._grad_norm_squared_avg -
        tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared]))
  if self._sparsity_debias:
    self._grad_var *= self._sparsity_avg
  return grad_var_ops
def test_delta_delta(self):
  ''' test add delta deltas '''
  #pylint: disable=invalid-name
  p = tffeat.speech_params(
      sr=self.sr_true,
      bins=40,
      cmvn=False,
      audio_desired_samples=1000,
      add_delta_deltas=False)

  with self.cached_session(use_gpu=False, force_gpu=False):
    wavfile = tf.constant(self.wavpath)
    audio, sample_rate = tffeat.read_wav(wavfile, self.hp)
    del sample_rate

    feature = tffeat.compute_mel_filterbank_features(
        audio,
        sample_rate=p.audio_sample_rate,
        preemphasis=p.audio_preemphasis,
        frame_length=p.audio_frame_length,
        frame_step=p.audio_frame_step,
        lower_edge_hertz=p.audio_lower_edge_hertz,
        upper_edge_hertz=p.audio_upper_edge_hertz,
        num_mel_bins=p.audio_num_mel_bins,
        apply_mask=False)

    feature = tffeat.delta_delta(feature, order=2)
    self.assertEqual(feature.eval().shape, (11, 40, 3))
def compute_mfcc():
  parser = get_parser()
  args = parser.parse_args()

  config = {}
  config['sample_rate'] = int(args.sample_rate)
  config['upper_frequency_limit'] = float(args.upper_frequency_limit)
  config['lower_frequency_limit'] = float(args.lower_frequency_limit)
  config['filterbank_channel_count'] = float(args.filterbank_channel_count)
  config['window_length'] = args.window_length
  config['frame_length'] = args.frame_length
  config['output_type'] = args.output_type
  config['window_type'] = args.window_type
  config['snip_edges'] = args.snip_edges
  config['preeph_coeff'] = args.preeph_coeff
  config['remove_dc_offset'] = args.remove_dc_offset
  config['is_fbank'] = args.is_fbank
  config['cepstral_lifter'] = args.cepstral_lifter
  config['coefficient_count'] = args.coefficient_count

  mfcc = Mfcc.params(config).instantiate()

  with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
      KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                  compress=args.compress,
                  compression_method=args.compression_method) as writer:
    for utt_id, (sample_rate, array) in reader:
      if sample_rate != args.sample_rate:
        args.sample_rate = sample_rate
      array = array.astype(np.float32)
      audio_data = tf.constant(array, dtype=tf.float32)
      mfcc_test = tf.squeeze(mfcc(audio_data, args.sample_rate))
      sess = tf.Session()
      mfcc_feats = mfcc_test.eval(session=sess)
      writer[utt_id] = mfcc_feats
def compute_spectrum():
  parser = get_parser()
  args = parser.parse_args()

  config = {}
  config['sample_rate'] = float(args.sample_rate)
  config['output_type'] = int(args.output_type)
  config['window_length'] = args.window_length
  config['frame_length'] = args.frame_length

  spectrum = Spectrum.params(config).instantiate()

  with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
      KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                  compress=args.compress,
                  compression_method=args.compression_method) as writer:
    for utt_id, (sample_rate, array) in reader:
      if sample_rate != args.sample_rate:
        args.sample_rate = sample_rate
      array = array.astype(np.float32)
      audio_data = tf.constant(array, dtype=tf.float32)
      spectrum_test = spectrum(audio_data, args.sample_rate)
      sess = tf.compat.v1.Session()
      spectrum_feats = spectrum_test.eval(session=sess)
      writer[utt_id] = spectrum_feats