def preprocess(self, inputs):
    '''Speech preprocessing.

    Applies cepstral mean/variance normalization (CMVN) to `inputs`,
    choosing the strategy from `self.audioconf['cmvn_type']`
    (default: 'global').

    params:
      inputs: feature tensor to normalize. When
        self.input_type == 'samples' raw-sample handling is an
        unimplemented stub and None is returned.
    returns:
      the normalized feature tensor (or None for the 'samples' stub).
    raises:
      ValueError: for 'sliding' (not implemented yet) or an unknown
        cmvn_type.
    '''
    with tf.variable_scope('feature'):
        if self.input_type == 'samples':
            # FIXME: stub -- sample-level feature extraction not implemented.
            feats = None
        else:
            # dict.get replaces the explicit `in` membership test.
            cmvn_type = self.audioconf.get('cmvn_type', 'global')
            # Lazy %-args: no string formatting when INFO is disabled.
            logging.info('cmvn_type: %s', cmvn_type)
            if cmvn_type == 'global':
                # Global statistics are loaded once from disk and cached
                # on the instance.
                self.mean, self.std = utils.load_cmvn(
                    self.audioconf['cmvn_path'])
                feats = utils.apply_cmvn(inputs, self.mean, self.std)
            elif cmvn_type == 'local':
                feats = utils.apply_local_cmvn(inputs)
            elif cmvn_type == 'sliding':
                raise ValueError('cmvn_type %s not implemented yet.' %
                                 (cmvn_type))
            elif cmvn_type == 'none':
                feats = inputs
            else:
                raise ValueError('Error cmvn_type %s.' % (cmvn_type))
    return feats
def extract_feature(waveforms, params):
    '''Extract log-fbank features with delta-deltas and apply CMVN.

    params:
      waveforms: [batch, samples]
      params: hyper-parameter object (audio_global_cmvn, audio_cmvn_path,
        num_zeropad_frames, ...)
    returns:
      tensor shaped [batch_size, nframes, feature_size, channels]
    '''
    p = params
    with tf.variable_scope('feature_extractor'):
        # shape: [1, nframes, nbins, nchannels]
        mel_fbanks = extract_logfbank_with_delta(waveforms, params)
        fbank_size = utils.shape_list(mel_fbanks)
        #assert fbank_size[0] == 1

        if p.audio_global_cmvn:
            # Precomputed global statistics loaded from disk.
            # This replaces CMVN estimation on data.
            assert p.audio_cmvn_path, p.audio_cmvn_path
            mean, variance = utils.load_cmvn(p.audio_cmvn_path)
        else:
            # Per-utterance statistics estimated along the time axis.
            mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
            variance = tf.reduce_mean(
                tf.square(mel_fbanks - mean), keepdims=True, axis=1)
        var_epsilon = 1e-09
        mel_fbanks = utils.apply_cmvn(mel_fbanks, mean, variance, var_epsilon)

        # Later models like to flatten the two spatial dims. Instead, we add a
        # unit spatial dim and flatten the frequencies and channels.
        batch_size = fbank_size[0]
        reshaped = tf.reshape(
            mel_fbanks,
            [batch_size, fbank_size[1], fbank_size[2], fbank_size[3]])
        zero_pad = tf.zeros(
            (batch_size, p.num_zeropad_frames, fbank_size[2], fbank_size[3]))
        feats = tf.concat([reshaped, zero_pad], 1)
    # shape [batch_size, nframes, feature_size, channels]
    return feats
def train_one_epoch(model, optimizer, dataset, step_counter, cmvn,
                    log_interval=None):
    '''Run one training epoch over `dataset`.

    params:
      model: callable model, invoked as model(feats, training=True)
      optimizer: TF optimizer used via apply_gradients
      dataset: iterable yielding (feats, texts, labels, filenames, clip_ids)
      step_counter: global step used for summaries and optimizer
      cmvn: (mean, std) pair applied to features before the forward pass
      log_interval: if set, print loss/throughput every `log_interval` steps
    '''
    assert len(cmvn) == 2
    tic = time.time()
    for step, (feats, texts, labels, filenames, clip_ids) in \
            enumerate(dataset):
        feats = utils.apply_cmvn(feats, cmvn[0], cmvn[1])
        with tf.contrib.summary.record_summaries_every_n_global_steps(
                10, global_step=step_counter):
            with tf.GradientTape() as tape:
                logits = model(feats, training=True)
                loss_value = utils.loss(
                    logits, labels, smoothing=0.0, is_train=True)
                tf.contrib.summary.scalar('loss', loss_value)
                tf.contrib.summary.scalar(
                    'accuracy', utils.accuracy(logits, labels))
            grads = tape.gradient(loss_value, model.variables)
            # Gradients are globally clipped before being applied.
            clipped = utils.clip_gradients(
                list(zip(grads, model.variables)),
                clip_ratio=FLAGS.clip_global_norm)
            optimizer.apply_gradients(clipped, global_step=step_counter)
            if log_interval and step % log_interval == 0:
                rate = log_interval / (time.time() - tic)
                print('Step #%d\tLoss: %0.6f (%d step/sec)' %
                      (step, loss_value, rate))
                tic = time.time()
def eval(model, dataset, cmvn):
    '''Evaluate `model` on `dataset`; print and log mean loss and accuracy.

    NOTE(review): `eval` shadows the builtin; renaming would break callers,
    so the name is kept.

    params:
      model: callable model, invoked as model(feats, training=False)
      dataset: iterable yielding (feats, texts, labels, filenames, clip_ids)
      cmvn: (mean, std) pair applied to features before the forward pass
    '''
    mean_loss = tfe.metrics.Mean('loss', dtype=tf.float32)
    acc_metric = tfe.metrics.Accuracy('accuracy', dtype=tf.float32)
    for feats, texts, labels, filenames, clip_ids in dataset:
        feats = utils.apply_cmvn(feats, cmvn[0], cmvn[1])
        logits = model(feats, training=False)
        mean_loss(utils.loss(logits, labels, is_train=False))
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int64)
        acc_metric(predictions, tf.cast(labels, tf.int64))
    print("Eval set: Average loss: %0.4f, Accuracy: %4f%%\n" %
          (mean_loss.result(), 100 * acc_metric.result()))
    with tf.contrib.summary.always_record_summaries():
        tf.contrib.summary.scalar('loss', mean_loss.result())
        tf.contrib.summary.scalar('accuracy', acc_metric.result())
def testApplyCmvn(self):  #pylint: disable=invalid-name
    '''Test apply_cmvn against (feat - mean) * rsqrt(var + eps).

    Bug fix: the variance passed to apply_cmvn was `feat / 3`, which is
    negative wherever `feat` is, so `tf.rsqrt(var + eps)` produced NaN
    there; those entries compared NaN-to-NaN and verified nothing. A
    non-negative variance keeps every element finite and actually checked.
    '''
    np.random.seed(12)
    tf.set_random_seed(12)

    feat_size = 40
    delta_deltas = True
    # [batch, time, feature, channels]; 3 channels when delta-deltas on.
    feat_shape = [2, 10, feat_size, 3 if delta_deltas else 1]
    feat = np.random.randn(*feat_shape)
    feat = feat.astype(np.float32)
    feat = tf.constant(feat)

    mean = feat / 2
    # Variance must be non-negative; squaring keeps rsqrt finite.
    var = tf.square(feat) / 3
    eps = 1e-9
    feat_out = utils.apply_cmvn(feat, mean, var, epsilon=eps)
    feat_true = (feat - mean) * tf.rsqrt(var + eps)
    with self.session(use_gpu=False, force_gpu=False):
        self.assertAllClose(feat_out.eval(), feat_true.eval())
def preprocess(self, inputs, input_text):
    '''Preprocess speech and text inputs.

    params:
      inputs: speech input (raw samples or precomputed features,
        depending on self.input_type)
      input_text: text input, passed through unchanged
    returns:
      (feats, input_text) tuple.
    '''
    with tf.variable_scope('feature'):
        if self.input_type != 'samples':
            # Precomputed features: normalize with global CMVN statistics.
            self.mean, self.std = utils.load_cmvn(
                self.audioconf['cmvn_path'])
            feats = utils.apply_cmvn(inputs, self.mean, self.std)
        else:
            # Raw samples: build the speech feature config, then extract.
            self.hp = speech_params(
                sr=self.taskconf['audio']['sr'],
                bins=self.audioconf['feature_size'],
                dither=self.train,
                use_delta_deltas=self.audioconf['add_delta_deltas'],
                cmvn=self.audioconf['cmvn'],
                cmvn_path=self.audioconf['cmvn_path'])
            feats = extract_feature(inputs, params=self.hp)
    return feats, input_text
def call(self, x):
    '''Apply CMVN to `x` using the stored mean/std and return the result.'''
    return utils.apply_cmvn(x, self.mean, self.std)