def _CreateAsrFeatures():
    """Featurizes the audio members of the input tarball into TF examples.

    Transcripts are read with `_ReadTranscriptionsFromCSV`. The tarball at
    `FLAGS.input_tarball` is then scanned; every `.flac`/`.wav`/`.mp3` member
    whose running index maps to this shard (`FLAGS.shard_id` out of
    `FLAGS.num_shards`) is decoded to WAV, passed through
    `audio_lib.ExtractLogMelFeatures`, wrapped by `_MakeTfExample` and written
    to a writer picked by `_SelectRandomShard`.

    Members whose decoding or featurization raises are logged, removed from
    the transcript map and skipped. The tarball, its underlying file and the
    sub-shard writers are closed even if processing aborts with an error.

    Raises:
      AssertionError: if a successfully featurized member has no transcript.
    """
    # First pass: extract transcription files.
    # NOTE: the cached path (_LoadTranscriptionsFromFile) was hard-disabled
    # with `if False:` in this function, so transcripts are always re-read.
    tf.logging.info('Running first pass on the fly')
    trans = _ReadTranscriptionsFromCSV()
    total_utts = len(trans)
    tf.logging.info('Total transcripts: %d', total_utts)
    tf_bytes = tf.placeholder(dtype=tf.string)
    log_mel = audio_lib.ExtractLogMelFeatures(tf_bytes)
    # We can actually decode essentially any audio format, but we want to
    # avoid non-audio data. Thus, this suffix filter.
    audio_suffixes = ('.flac', '.wav', '.mp3')
    tfconf = tf.config_pb2.ConfigProto()
    tfconf.gpu_options.allow_growth = True
    n = 0
    # Second pass: transcode the audio.
    with tf.io.gfile.GFile(FLAGS.input_tarball, mode='rb') as file_obj, \
            tarfile.open(fileobj=file_obj, mode='r:gz') as tar:
        recordio_writers = _OpenSubShards()
        try:
            with tf.Session(config=tfconf) as sess:
                for tarinfo in tar:
                    if not tarinfo.name.endswith(audio_suffixes):
                        continue
                    # n counts every audio member so that all shards agree
                    # on which member belongs to which shard.
                    n += 1
                    if n % FLAGS.num_shards != FLAGS.shard_id:
                        continue
                    uttid = tarinfo.name
                    fmt = uttid.split('.')[-1]
                    f = tar.extractfile(tarinfo)
                    try:
                        audio_bytes = f.read()
                    finally:
                        f.close()
                    try:
                        wav_bytes = audio_lib.DecodeToWav(audio_bytes, fmt)
                        frames = sess.run(
                            log_mel, feed_dict={tf_bytes: wav_bytes})
                    except Exception as e:  # pylint: disable=broad-except
                        # Best effort: drop the utterance and keep going.
                        # The default avoids a KeyError when the failed
                        # member never had a transcript.
                        trans.pop(uttid, None)
                        tf.logging.info('%s FAILED featurization: %s',
                                        uttid, e)
                        continue
                    assert uttid in trans, uttid
                    transcript = trans[uttid]
                    tf.logging.info('utt[%d]: %s [%d frames, %d chars]', n,
                                    uttid, frames.shape[1], len(transcript))
                    ex = _MakeTfExample(uttid, frames, transcript)
                    outf = _SelectRandomShard(recordio_writers)
                    outf.write(ex.SerializeToString())
        finally:
            _CloseSubShards(recordio_writers)
    # len(trans) is the number of transcripts left after removing the
    # utterances that failed featurization above.
    tf.logging.info(f'Processed {len(trans)} / {total_utts}')
Ejemplo n.º 2
0
    def testExtractLogMelFeatures(self):
        """Checks the shape of log-mel features for gan_or_vae.16k.wav."""
        wav_path = test_helper.test_src_dir_path(
            'tools/testdata/gan_or_vae.16k.wav')
        with open(wav_path, 'rb') as wav_file:
            wav_contents = wav_file.read()

        features_t = audio_lib.ExtractLogMelFeatures(
            tf.constant(wav_contents, dtype=tf.string))

        with self.session() as sess:
            features = sess.run(features_t)
            # Expect 314, 80 dimensional channels.
            expected_shape = [1, 314, 80, 1]
            self.assertAllEqual(features.shape, expected_shape)
Ejemplo n.º 3
0
    def testExtractLogMelFeatures(self):
        """Checks the stacked log-mel feature shape for gan_or_vae.16k.wav."""
        # The WAV fixture is binary data, so it must be opened in 'rb' mode;
        # text mode would try to decode the bytes as text on Python 3.
        with open(
                test_helper.test_src_dir_path(
                    'tools/testdata/gan_or_vae.16k.wav'), 'rb') as f:
            wav = f.read()

        wav_bytes_t = tf.constant(wav, dtype=tf.string)
        log_mel_t = audio_lib.ExtractLogMelFeatures(wav_bytes_t)

        with self.session() as sess:
            log_mel = sess.run(log_mel_t)
            # We expect 105 frames, each of which consists of three 80 dimensional
            # stacked frames.
            self.assertAllEqual(log_mel.shape, [1, 105, 80 * 3, 1])
Ejemplo n.º 4
0
def _CreateAsrFeatures():
    """Featurizes the `.flac` members of the input tarball into TF examples.

    Transcripts come from `_LoadTranscriptionsFromFile` when
    `FLAGS.transcripts_filepath` exists, otherwise from `_ReadTranscriptions`.
    The tarball at `FLAGS.input_tarball` is then scanned; every `.flac` member
    whose running index maps to this shard (`FLAGS.shard_id` out of
    `FLAGS.num_shards`) is decoded to WAV, passed through
    `audio_lib.ExtractLogMelFeatures`, wrapped by `_MakeTfExample` and written
    to a writer picked by `_SelectRandomShard`.

    The tarball, its underlying file and the sub-shard writers are closed even
    if processing aborts with an error.

    Raises:
      AssertionError: if a featurized member has no transcript entry.
    """
    # First pass: extract transcription files.
    if os.path.exists(FLAGS.transcripts_filepath):
        trans = _LoadTranscriptionsFromFile()
    else:
        tf.logging.info('Running first pass on the fly')
        trans = _ReadTranscriptions()
    tf.logging.info('Total transcripts: %d', len(trans))
    tf_bytes = tf.placeholder(dtype=tf.string)
    # The feature extractor is applied directly to the fed WAV bytes.
    log_mel = audio_lib.ExtractLogMelFeatures(tf_bytes)
    tfconf = tf.config_pb2.ConfigProto()
    tfconf.gpu_options.allow_growth = True
    n = 0
    # Second pass: transcode the flac.
    with tf.io.gfile.GFile(FLAGS.input_tarball, mode='rb') as file_obj, \
            tarfile.open(fileobj=file_obj, mode='r:gz') as tar:
        recordio_writers = _OpenSubShards()
        try:
            with tf.Session(config=tfconf) as sess:
                for tarinfo in tar:
                    if not tarinfo.name.endswith('.flac'):
                        continue
                    # n counts every flac member so that all shards agree
                    # on which member belongs to which shard.
                    n += 1
                    if n % FLAGS.num_shards != FLAGS.shard_id:
                        continue
                    # Utterance id: the member's file name without its
                    # directory prefix and without the '.flac' suffix.
                    uttid = re.sub(r'.*/(.+)\.flac', r'\1', tarinfo.name)
                    f = tar.extractfile(tarinfo)
                    try:
                        flac_bytes = f.read()
                    finally:
                        f.close()
                    wav_bytes = audio_lib.DecodeFlacToWav(flac_bytes)
                    frames = sess.run(
                        log_mel, feed_dict={tf_bytes: wav_bytes})
                    assert uttid in trans, uttid
                    transcript = trans[uttid]
                    tf.logging.info('utt[%d]: %s [%d frames, %d words]', n,
                                    uttid, frames.shape[1], len(transcript))
                    ex = _MakeTfExample(uttid, frames, transcript)
                    outf = _SelectRandomShard(recordio_writers)
                    outf.write(ex.SerializeToString())
        finally:
            _CloseSubShards(recordio_writers)