def extract_feature(waveforms, params):
  '''Extract log-fbank features with delta-deltas and apply CMVN.
  waveforms: [batch, samples]
  '''
  p = params
  with tf.variable_scope('feature_extractor'):
    mel_fbanks = extract_logfbank_with_delta(waveforms, params)
    # shape: [1, nframes, nbins, nchannels]
    fbank_size = utils.shape_list(mel_fbanks)
    #assert fbank_size[0] == 1

    # This replaces CMVN estimation on data
    if not p.audio_global_cmvn:
      # Per-utterance statistics over the time axis.
      mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
      variance = tf.reduce_mean(
          tf.square(mel_fbanks - mean), keepdims=True, axis=1)
    else:
      assert p.audio_cmvn_path, p.audio_cmvn_path
      mean, variance = utils.load_cmvn(p.audio_cmvn_path)

    var_epsilon = 1e-09
    mel_fbanks = utils.apply_cmvn(mel_fbanks, mean, variance, var_epsilon)

    # Later models like to flatten the two spatial dims. Instead, we add a
    # unit spatial dim and flatten the frequencies and channels.
    batch_size = fbank_size[0]
    feats = tf.concat([
        tf.reshape(
            mel_fbanks,
            [batch_size, fbank_size[1], fbank_size[2], fbank_size[3]]),
        tf.zeros(
            (batch_size, p.num_zeropad_frames, fbank_size[2], fbank_size[3]))
    ], 1)
  return feats  # shape: [batch_size, nframes, feature_size, channels]
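# A minimal sketch of the CMVN normalization step above, assuming
# utils.apply_cmvn computes (x - mean) / sqrt(variance + epsilon); the
# helper name _apply_cmvn_sketch is illustrative and not part of utils.
def _apply_cmvn_sketch(feats, mean, variance, epsilon=1e-9):
  # feats: [batch, nframes, nbins, nchannels]; mean and variance carry a
  # singleton time axis (from reduce_mean(axis=1, keepdims=True)) and
  # broadcast over frames.
  return (feats - mean) * tf.math.rsqrt(variance + epsilon)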
def se_moudle(self, x, channels, reduction, name=''):
  '''Squeeze-and-Excitation block: squeeze with global average pooling,
  excite with a two-layer 1x1-conv bottleneck, then rescale the input
  channel-wise.'''
  input_t = x
  # Squeeze: global average pool over the spatial dims,
  # giving [batch, 1, 1, channels].
  x = tf.reduce_mean(x, [1, 2], name=name + '_avg', keepdims=True)
  # Excitation, step 1: reduce to channels // reduction.
  x = tf.layers.conv2d(
      x,
      channels // reduction, (1, 1),
      use_bias=False,
      name=name + '_1x1_down',
      strides=(1, 1),
      padding='valid',
      data_format='channels_last',
      activation=None,
      kernel_initializer=tf.contrib.layers.xavier_initializer())
  x = tf.nn.relu(x, name=name + '_1x1_down_relu')
  # Excitation, step 2: restore the full channel count.
  x = tf.layers.conv2d(
      x,
      channels, (1, 1),
      use_bias=False,
      name=name + '_1x1_up',
      strides=(1, 1),
      padding='valid',
      data_format='channels_last',
      activation=None,
      kernel_initializer=tf.contrib.layers.xavier_initializer())
  x = tf.nn.sigmoid(x, name=name + '_1x1_up_sigmoid')
  # Scale: gate the original input channel-wise.
  return tf.multiply(input_t, x, name=name + '_mul')
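# A usage sketch (hypothetical call site), assuming this method lives on
# the same model class as se_moudle; shapes are illustrative.
def _se_usage_sketch(self, feature_map):
  # feature_map: [batch, height, width, channels]
  channels = feature_map.shape.as_list()[-1]
  # reduction=16 is the ratio used in the original SENet paper.
  return self.se_moudle(
      feature_map, channels=channels, reduction=16, name='block1_se')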
def accuracy(logits, labels):
  '''Compute classification accuracy.
  params:
    logits: [B, ..., D]
    labels: [B, ...]
  return:
    accuracy tensor
  '''
  with tf.name_scope('accuracy'):
    # logits must have exactly one extra (class) dim relative to labels,
    # and agree with labels on all leading dims.
    assert_rank = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
    assert_shape = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))
    with tf.control_dependencies([assert_rank, assert_shape]):
      predictions = tf.argmax(logits, axis=-1, output_type=tf.int64)
      labels = tf.cast(labels, tf.int64)
      return tf.reduce_mean(
          tf.cast(tf.equal(predictions, labels), dtype=tf.float32))
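# A minimal usage sketch with illustrative shapes: two 3-class examples,
# both predicted correctly. In TF1 graph mode the result is a tensor that
# evaluates to 1.0 under a session run.
def _accuracy_usage_sketch():
  logits = tf.constant([[2.0, 0.1, 0.3], [0.2, 3.0, 0.1]])  # [B=2, D=3]
  labels = tf.constant([0, 1])  # [B=2]
  return accuracy(logits, labels)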
def crf_log_likelihood(tags_scores, labels, input_length, transitions):
  '''
  :param tags_scores: [batch_size, max_seq_len, num_tags]
  :param labels: [batch_size, max_seq_len]
  :param input_length: [batch_size,]
  :param transitions: [num_tags, num_tags]
  :return: loss, transition_params
  '''
  log_likelihood, transition_params = tfa.text.crf_log_likelihood(
      inputs=tags_scores,
      tag_indices=labels,
      sequence_lengths=input_length,
      transition_params=transitions)
  loss = tf.reduce_mean(-log_likelihood)
  return loss, transition_params
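# A companion sketch for inference, assuming the same TensorFlow Addons
# dependency: tfa.text.crf_decode finds the highest-scoring tag sequence
# under the learned transition matrix. The function name and wiring here
# are illustrative, not part of this module.
def _crf_decode_sketch(tags_scores, transition_params, input_length):
  decode_tags, best_score = tfa.text.crf_decode(
      potentials=tags_scores,
      transition_params=transition_params,
      sequence_length=input_length)
  return decode_tags, best_score  # [batch, max_seq_len], [batch]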
def get_loss(self):
  '''Dummy CTC loss, since CTC is implemented as a Keras layer.'''
  # The model's 'ctc' output already holds the per-sample CTC loss, so
  # the Keras loss only needs to average it; y_true is ignored.
  loss = {'ctc': lambda y_true, y_pred: tf.reduce_mean(y_pred)}
  return loss
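# A sketch of the pattern this dummy loss pairs with (assumed here, based
# on the common Keras CTC recipe): a Lambda layer computes the per-sample
# CTC loss via tf.keras.backend.ctc_batch_cost and exposes it as the
# model's 'ctc' output, which the dummy loss above simply averages.
def _ctc_lambda_sketch(args):
  y_pred, labels, input_length, label_length = args
  return tf.keras.backend.ctc_batch_cost(
      labels, y_pred, input_length, label_length)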