Ejemplo n.º 1
0
 def preprocess(self, inputs):
     ''' Speech preprocessing.

     Applies the configured CMVN normalization to pre-extracted
     features. Raw-sample input is not supported yet.

     params:
       inputs: feature tensor (pre-extracted features).
     returns:
       normalized feature tensor.
     raises:
       NotImplementedError: if self.input_type == 'samples'.
       ValueError: for unsupported cmvn_type values.
     '''
     with tf.variable_scope('feature'):
         if self.input_type == 'samples':
             # FIXME: the raw-sample path was a stub that returned
             # feats = None; fail loudly here instead of letting a
             # None tensor crash somewhere downstream.
             raise NotImplementedError(
                 'preprocess does not support input_type "samples" yet')

         # default to global CMVN when the config does not specify one
         cmvn_type = self.audioconf.get('cmvn_type', 'global')
         logging.info('cmvn_type: %s' % (cmvn_type))
         if cmvn_type == 'global':
             # statistics precomputed offline and loaded from disk
             self.mean, self.std = utils.load_cmvn(
                 self.audioconf['cmvn_path'])
             feats = utils.apply_cmvn(inputs, self.mean, self.std)
         elif cmvn_type == 'local':
             # per-utterance statistics computed on the fly
             feats = utils.apply_local_cmvn(inputs)
         elif cmvn_type == 'sliding':
             raise ValueError('cmvn_type %s not implemented yet.' %
                              (cmvn_type))
         elif cmvn_type == 'none':
             feats = inputs
         else:
             raise ValueError('Error cmvn_type %s.' % (cmvn_type))
     return feats
Ejemplo n.º 2
0
def extract_feature(waveforms, params):
    '''Extract log-fbank features (with delta-deltas) and apply CMVN.

    waveforms: [batch, samples]
    returns: tensor of shape [batch, nframes, feature_size, channels]
    '''
    hp = params
    with tf.variable_scope('feature_extractor'):
        fbanks = extract_logfbank_with_delta(waveforms, params)
        # fbanks shape: [1, nframes, nbins, nchannels]
        dims = utils.shape_list(fbanks)

        if hp.audio_global_cmvn:
            # statistics precomputed over the corpus and loaded from disk
            assert hp.audio_cmvn_path, hp.audio_cmvn_path
            mean, variance = utils.load_cmvn(hp.audio_cmvn_path)
        else:
            # per-utterance statistics along the time axis
            mean = tf.reduce_mean(fbanks, keepdims=True, axis=1)
            variance = tf.reduce_mean(tf.square(fbanks - mean),
                                      keepdims=True,
                                      axis=1)

        fbanks = utils.apply_cmvn(fbanks, mean, variance, 1e-09)

        # Keep the two spatial dims explicit and append
        # num_zeropad_frames of zeros along the time axis.
        batch = dims[0]
        body = tf.reshape(fbanks, [batch, dims[1], dims[2], dims[3]])
        pad = tf.zeros((batch, hp.num_zeropad_frames, dims[2], dims[3]))
        feats = tf.concat([body, pad], 1)
    return feats  # shape [batch_size, nframes, feature_size, channels]
Ejemplo n.º 3
0
def train_one_epoch(model,
                    optimizer,
                    dataset,
                    step_counter,
                    cmvn,
                    log_interval=None):
    '''Run a single training pass over *dataset*.

    params:
      cmvn: (mean, std) pair used to normalize every feature batch.
      log_interval: if set, print loss and throughput every N batches.
    '''
    assert len(cmvn) == 2
    mean, std = cmvn
    tick = time.time()
    for step, (feats, texts, labels, filenames,
               clip_ids) in enumerate(dataset):
        feats = utils.apply_cmvn(feats, mean, std)

        with tf.contrib.summary.record_summaries_every_n_global_steps(
                10, global_step=step_counter):
            with tf.GradientTape() as tape:
                logits = model(feats, training=True)
                loss_value = utils.loss(logits,
                                        labels,
                                        smoothing=0.0,
                                        is_train=True)
                tf.contrib.summary.scalar('loss', loss_value)
                tf.contrib.summary.scalar('accuracy',
                                          utils.accuracy(logits, labels))

            # clip by global norm before applying the update
            grads = tape.gradient(loss_value, model.variables)
            clipped = utils.clip_gradients(
                list(zip(grads, model.variables)),
                clip_ratio=FLAGS.clip_global_norm)
            optimizer.apply_gradients(clipped, global_step=step_counter)

            if log_interval and step % log_interval == 0:
                rate = log_interval / (time.time() - tick)
                print('Step #%d\tLoss: %0.6f (%d step/sec)' %
                      (step, loss_value, rate))
                tick = time.time()
Ejemplo n.º 4
0
def eval(model, dataset, cmvn):
    '''Evaluate *model* on *dataset*, accumulating mean loss and accuracy.

    params:
      cmvn: (mean, std) pair applied to the input features.
    NOTE(review): shadows the builtin ``eval``; name kept for callers.
    '''
    avg_loss = tfe.metrics.Mean('loss', dtype=tf.float32)
    accuracy = tfe.metrics.Accuracy('accuracy', dtype=tf.float32)

    for feats, texts, labels, filenames, clip_ids in dataset:
        normed = utils.apply_cmvn(feats, cmvn[0], cmvn[1])

        logits = model(normed, training=False)
        avg_loss(utils.loss(logits, labels, is_train=False))
        preds = tf.argmax(logits, axis=-1, output_type=tf.int64)
        accuracy(preds, tf.cast(labels, tf.int64))
        # running metrics are reported after every batch
        print("Eval set: Average loss: %0.4f, Accuracy: %4f%%\n" %
              (avg_loss.result(), 100 * accuracy.result()))

        with tf.contrib.summary.always_record_summaries():
            tf.contrib.summary.scalar('loss', avg_loss.result())
            tf.contrib.summary.scalar('accuracy', accuracy.result())
Ejemplo n.º 5
0
    def testApplyCmvn(self):  #pylint: disable=invalid-name
        '''Check utils.apply_cmvn against its closed-form definition.'''
        np.random.seed(12)
        tf.set_random_seed(12)

        num_bins = 40
        with_deltas = True
        channels = 3 if with_deltas else 1

        # [batch, frames, bins, channels]
        shape = [2, 10, num_bins, channels]
        feat = tf.constant(np.random.randn(*shape).astype(np.float32))
        mean = feat / 2
        var = feat / 3

        eps = 1e-9
        got = utils.apply_cmvn(feat, mean, var, epsilon=eps)
        want = (feat - mean) * tf.rsqrt(var + eps)
        with self.session(use_gpu=False, force_gpu=False):
            self.assertAllClose(got.eval(), want.eval())
  def preprocess(self, inputs, input_text):
    '''Prepare the speech input and pass the text input through.

    params:
      inputs: raw samples or pre-extracted features, per self.input_type.
      input_text: text input, returned unchanged.
    '''
    with tf.variable_scope('feature'):
      if self.input_type != 'samples':
        # features already extracted: only apply global CMVN
        self.mean, self.std = utils.load_cmvn(self.audioconf['cmvn_path'])
        feats = utils.apply_cmvn(inputs, self.mean, self.std)
      else:
        # build feature-extraction hyper-params from the task config
        self.hp = speech_params(
            sr=self.taskconf['audio']['sr'],
            bins=self.audioconf['feature_size'],
            dither=self.train,
            use_delta_deltas=self.audioconf['add_delta_deltas'],
            cmvn=self.audioconf['cmvn'],
            cmvn_path=self.audioconf['cmvn_path'])
        feats = extract_feature(inputs, params=self.hp)
    return feats, input_text
Ejemplo n.º 7
0
 def call(self, x):
     '''Normalize *x* with the stored CMVN mean/std and return it.'''
     return utils.apply_cmvn(x, self.mean, self.std)