Example #1
def extract_feature(waveforms, params):
    '''Extract log-fbank features with delta-deltas and apply CMVN.
    waveforms: [batch, samples]
    '''
    p = params
    with tf.variable_scope('feature_extractor'):
        mel_fbanks = extract_logfbank_with_delta(waveforms, params)
        # shape: [1, nframes, nbins, nchannels]
        fbank_size = utils.shape_list(mel_fbanks)
        #assert fbank_size[0] == 1

        # When no global stats are given, estimate CMVN per utterance in-graph.
        if not p.audio_global_cmvn:
            mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
            variance = tf.reduce_mean(tf.square(mel_fbanks - mean),
                                      keepdims=True,
                                      axis=1)
        else:
            assert p.audio_cmvn_path, 'audio_cmvn_path must be set for global CMVN'
            mean, variance = utils.load_cmvn(p.audio_cmvn_path)

        var_epsilon = 1e-09
        mel_fbanks = utils.apply_cmvn(mel_fbanks, mean, variance, var_epsilon)

        # Later models like to flatten the two spatial dims. Instead, we add a
        # unit spatial dim and flatten the frequencies and channels.
        batch_size = fbank_size[0]
        feats = tf.concat([
            tf.reshape(
                mel_fbanks,
                [batch_size, fbank_size[1], fbank_size[2], fbank_size[3]]),
            tf.zeros((batch_size, p.num_zeropad_frames, fbank_size[2],
                      fbank_size[3]))
        ], 1)
    return feats  # shape [batch_size, nframes, feature_size, channels]
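
utils.apply_cmvn itself is not shown above. As a minimal sketch of what such a helper presumably computes, assuming the usual (x - mean) / sqrt(var + epsilon) normalization; the name apply_cmvn_sketch is hypothetical:

import tensorflow as tf

def apply_cmvn_sketch(feats, mean, variance, epsilon=1e-9):
    # Subtract the mean and scale by the inverse standard deviation;
    # epsilon floors the variance for numerical stability.
    return (feats - mean) * tf.rsqrt(variance + epsilon)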
Example #2
def preprocess(self, inputs):
    ''' Speech preprocessing. '''
    with tf.variable_scope('feature'):
        if self.input_type == 'samples':
            # FIXME: stub
            feats = None
        else:
            cmvn_type = self.audioconf.get('cmvn_type', 'global')
            logging.info('cmvn_type: %s', cmvn_type)
            if cmvn_type == 'global':
                self.mean, self.std = utils.load_cmvn(
                    self.audioconf['cmvn_path'])
                feats = utils.apply_cmvn(inputs, self.mean, self.std)
            elif cmvn_type == 'local':
                feats = utils.apply_local_cmvn(inputs)
            elif cmvn_type == 'sliding':
                raise NotImplementedError(
                    'cmvn_type %s not implemented yet.' % cmvn_type)
            elif cmvn_type == 'none':
                feats = inputs
            else:
                raise ValueError('Unknown cmvn_type %s.' % cmvn_type)
    return feats
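
utils.apply_local_cmvn is likewise opaque here. A plausible sketch, assuming it mirrors the per-utterance branch of extract_feature in Example #1 (statistics over the time axis only); the function name and epsilon default are assumptions:

def apply_local_cmvn_sketch(feats, epsilon=1e-9):
    # Per-utterance CMVN: mean and variance over the time axis (axis=1),
    # so each utterance is normalized by its own statistics.
    mean = tf.reduce_mean(feats, axis=1, keepdims=True)
    variance = tf.reduce_mean(tf.square(feats - mean), axis=1, keepdims=True)
    return (feats - mean) * tf.rsqrt(variance + epsilon)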
Example #3
    def testLoadCmvn(self):  #pylint: disable=invalid-name
        ''' test load cmvn '''
        np.random.seed(12)
        temp_dir = self.get_temp_dir()
        temp_file = os.path.join(temp_dir, 'cmvn.npy')

        feat_size = 40
        delta_deltas = True
        shape = [1, feat_size, 3 if delta_deltas else 1]
        mean = np.random.randn(*shape)
        var = np.random.randn(*shape)
        mean, var = mean.astype(np.float32), var.astype(np.float32)
        with tf.gfile.Open(temp_file, 'wb') as f:  #pylint: disable=invalid-name
            np.save(f, (mean, var))

        mean_true = np.expand_dims(mean, axis=0)
        var_true = np.expand_dims(var, axis=0)

        with self.session(use_gpu=False, force_gpu=False):
            mean, var = utils.load_cmvn(temp_file)
            self.assertAllClose(mean.eval(), mean_true)
            self.assertAllClose(var.eval(), var_true)
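
The test above pins down the contract of utils.load_cmvn: the file holds a (mean, var) pair written with np.save, and the loaded tensors gain a leading batch dimension. A sketch consistent with that behavior (this re-implementation is hypothetical, not the library's code):

import numpy as np
import tensorflow as tf

def load_cmvn_sketch(path):
    # np.save(f, (mean, var)) stacks the equal-shaped pair along a new
    # leading axis, so unpacking the loaded array recovers both stats.
    mean, variance = np.load(path)
    # Add a leading dim so the stats broadcast over
    # [batch, time, bins, channels], matching mean_true/var_true above.
    mean = tf.convert_to_tensor(np.expand_dims(mean, axis=0))
    variance = tf.convert_to_tensor(np.expand_dims(variance, axis=0))
    return mean, variance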
Example #4
  def preprocess(self, inputs, input_text):
    ''' preprocess speech and text inputs
    params:
      inputs: speech input
      input_text: text input
    '''
    with tf.variable_scope('feature'):
      if self.input_type == 'samples':
        # speech feature config
        self.hp = speech_params(
            sr=self.taskconf['audio']['sr'],
            bins=self.audioconf['feature_size'],
            dither=self.train,
            use_delta_deltas=self.audioconf['add_delta_deltas'],
            cmvn=self.audioconf['cmvn'],
            cmvn_path=self.audioconf['cmvn_path'])

        feats = extract_feature(inputs, params=self.hp)
      else:
        self.mean, self.std = utils.load_cmvn(self.audioconf['cmvn_path'])
        feats = utils.apply_cmvn(inputs, self.mean, self.std)
    return feats, input_text
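
For reference, a hypothetical configuration carrying the keys this method reads; the concrete values below are illustrative, not taken from the source:

# Hypothetical config dicts matching the keys accessed above.
taskconf = {'audio': {'sr': 16000}}    # sample rate
audioconf = {
    'feature_size': 40,                # fbank bins
    'add_delta_deltas': True,          # append delta and delta-delta channels
    'cmvn': True,                      # enable CMVN
    'cmvn_path': '/path/to/cmvn.npy',  # global stats file
}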
Example #5
def __init__(self, cmvn_path):
    super().__init__(name='cmvn', trainable=False)
    self.mean, self.std = utils.load_cmvn(cmvn_path)
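
The layer's forward pass is not shown. A minimal sketch of how such a layer would presumably apply the loaded stats (the call body is an assumption):

def call(self, inputs):
    # Hypothetical forward pass: normalize with the preloaded stats.
    return utils.apply_cmvn(inputs, self.mean, self.std)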
Example #6
def main():
    data = dataset.make_dataset('train', config.train_path,
                                config.train_textgrid_path, FLAGS)
    train_data = dataloader.input_func(data,
                                       FLAGS.batch_size,
                                       is_train=True,
                                       num_epoch=1)

    data = dataset.make_dataset('dev', config.dev_path,
                                config.dev_textgrid_path, FLAGS)
    dev_data = dataloader.input_func(data, FLAGS.batch_size, is_train=False)

    # create model and optimizer
    step_counter = tf.train.get_or_create_global_step()
    model = model_lib.Emotion(drop_rate=0.1)

    lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                    step_counter,
                                    100,
                                    FLAGS.decay_rate,
                                    staircase=True)
    optimizer = tf.train.AdamOptimizer(lr)
    print('init lr', lr().numpy())

    # checkpoint dirs
    if FLAGS.checkpoint:
        train_dir = os.path.join(FLAGS.checkpoint, 'train')
        eval_dir = os.path.join(FLAGS.checkpoint, 'eval')
        tf.gfile.MakeDirs(FLAGS.checkpoint)
    else:
        train_dir = None
        eval_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(train_dir,
                                                           flush_millis=10000)
    eval_summary_writer = tf.contrib.summary.create_file_writer(
        eval_dir, flush_millis=10000, name='eval')

    # create the checkpoint object and restore from the latest one, if it exists
    checkpoint_prefix = os.path.join(FLAGS.checkpoint, 'ckpt')
    checkpoint = tf.train.Checkpoint(
        #  model=model, optimizer=optimizer, learning_rate=lr, step_counter=step_counter)
        model=model,
        optimizer=optimizer,
        step_counter=step_counter)
    # restore variables on creation if a checkpoint exists.
    stats = checkpoint.restore(tf.train.latest_checkpoint(FLAGS.checkpoint))
    #stats.assert_consumed()
    print('now lr', lr().numpy())

    cmvn = utils.load_cmvn(FLAGS.cmvn_path)

    device = '/gpu:0' if tf.test.is_gpu_available() else '/cpu:0'
    print("Using device %s" % (device))

    with tf.device(device):
        for e in range(FLAGS.num_epochs):
            # train
            start = time.time()
            with summary_writer.as_default():
                train_one_epoch(model,
                                optimizer,
                                train_data,
                                step_counter,
                                cmvn,
                                log_interval=100)
            end = time.time()
            print(
                '\nTrain time for epoch #%d (%d total steps) (%f learning rate): %f'
                % (checkpoint.save_counter.numpy() + 1, step_counter.numpy(),
                   lr().numpy(), end - start))

            if e == 0:
                print_vars(model)

            # eval
            with eval_summary_writer.as_default():
                eval(model, dev_data, cmvn)
            checkpoint.save(checkpoint_prefix)
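
train_one_epoch and eval are defined elsewhere. A sketch of how the preloaded cmvn stats could be consumed per batch in a TF 1.x eager loop; the loop body, data layout, and loss choice are all assumptions:

def train_one_epoch_sketch(model, optimizer, data, step_counter, cmvn,
                           log_interval=100):
    # Hypothetical loop: normalize each batch with the global CMVN stats,
    # then take one optimizer step per batch.
    mean, variance = cmvn
    for batch_idx, (feats, labels) in enumerate(data):
        feats = (feats - mean) * tf.rsqrt(variance + 1e-9)
        with tf.GradientTape() as tape:
            logits = model(feats, training=True)
            loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
        grads = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(
            zip(grads, model.variables), global_step=step_counter)
        if batch_idx % log_interval == 0:
            print('step %d loss %f' % (step_counter.numpy(), loss.numpy()))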