import os

import tensorflow as tf

from tensorflow_asr.datasets.asr_dataset import ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SentencePieceFeaturizer, SubwordFeaturizer, TextFeaturizer


def test_featurizer():
    config = {
        "output_path_prefix": "/data/models/asr/conformer_sentencepiece_subword",
        "model_type": "unigram",
        "target_vocab_size": 8000,
        "blank_at_zero": True,
        "beam_width": 5,
        "norm_score": True,
        # Note the commas: without them Python silently concatenates the
        # adjacent string literals into a single path.
        "corpus_files": [
            "/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-clean-360/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-other-500/transcripts.tsv",
        ],
    }

    config_speech = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        "feature_type": "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
    }

    text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(
        config, None)
    subwords_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                 os.pardir, os.pardir, "vocabularies",
                                 "librispeech_train_4_1030.subwords")
    text_featurizer_subwords = SubwordFeaturizer.load_from_file(
        config, subwords_path)
    speech_featurizer = TFSpeechFeaturizer(config_speech)
    data_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             "transcripts_librispeech_train_clean_100.tsv")

    def get_data(featurizer: TextFeaturizer):
        train_dataset = ASRSliceDataset(data_paths=[data_path],
                                        speech_featurizer=speech_featurizer,
                                        text_featurizer=featurizer,
                                        stage="train",
                                        shuffle=False)
        train_data = train_dataset.create(1)
        return next(iter(train_data))

    data_sentencepiece = get_data(text_featurizer_sentencepiece)
    data_subwords = get_data(text_featurizer_subwords)

    assert len(data_sentencepiece) == len(data_subwords)
    assert data_sentencepiece[0].shape == data_subwords[0].shape
    assert data_sentencepiece[0].dtype == data_subwords[0].dtype
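
    # A quick check of the encode/decode pair these pipelines rely on: extract
    # maps text to token ids, iextract maps a batch of ids back to strings.
    # A minimal sketch, assuming the featurizers above loaded successfully;
    # the sample sentence is illustrative only.
    sentence = "hello world"
    ids = text_featurizer_sentencepiece.extract(sentence)  # 1-D tensor of subword ids
    decoded = text_featurizer_sentencepiece.iextract(tf.expand_dims(ids, 0))  # iextract expects a batch
    print(ids.numpy(), decoded.numpy()[0].decode("utf-8"))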
Example 2
import os
import re

from tensorflow_asr.datasets.asr_dataset import ASRSliceTestDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SentencePieceFeaturizer


def test_iextract():
    config = {
        "output_path_prefix": "/data/models/asr/conformer_sentencepiece_subword",
        "model_type": "unigram",
        "target_vocab_size": 8000,
        "blank_at_zero": True,
        "beam_width": 5,
        "norm_score": True,
        # Commas are required here as well, or the adjacent string literals
        # silently concatenate into one path.
        "corpus_files": [
            "/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-clean-360/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-other-500/transcripts.tsv",
        ],
    }

    config_speech = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        "feature_type": "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
    }

    text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(config, None)
    speech_featurizer = TFSpeechFeaturizer(config_speech)
    data_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "transcripts_librispeech_train_clean_100.tsv")

    train_dataset = ASRSliceTestDataset(
        data_paths=[data_path],
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer_sentencepiece,
        stage="train",
        shuffle=False
    )
    train_data = train_dataset.create(1)
    batch = next(iter(train_data))
    file_paths, features, input_length, labels = batch
    labels = text_featurizer_sentencepiece.iextract(labels)
    labels = labels.numpy()[0].decode("utf-8")

    # Open transcript
    file_path = file_paths[0].numpy().decode("utf-8")
    file_path = re.sub(r"(?<!\s)-[0-9]{4}.flac", ".trans.txt", file_path)
    print(file_path)
    with open(file_path, "r") as f:
        lines = f.read().splitlines()
    m = re.search(r"[0-9]+-[0-9]+-[0-9]+\s+([\w\s]+)", lines[0])
    transcript = m.group(1).lower()

    assert labels == transcript
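
# The re.sub above maps a LibriSpeech utterance path to its chapter-level
# transcript file by stripping the 4-digit utterance suffix. The path below
# is illustrative:
utt = "/data/LibriSpeech/train-clean-100/19/198/19-198-0001.flac"
print(re.sub(r"(?<!\s)-[0-9]{4}.flac", ".trans.txt", utt))
# -> /data/LibriSpeech/train-clean-100/19/198/19-198.trans.txt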
Example 3
parser.add_argument("transcripts",
                    nargs="+",
                    type=str,
                    default=None,
                    help="Paths to transcript files")

args = parser.parse_args()

transcripts = preprocess_paths(args.transcripts)
tfrecords_dir = preprocess_paths(args.tfrecords_dir)

config = Config(args.config)

if args.sentence_piece:
    print("Loading SentencePiece model ...")
    text_featurizer = SentencePieceFeaturizer.load_from_file(
        config.decoder_config, args.subwords)
elif args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config,
                                                       args.subwords)
else:
    # Fall back to characters so text_featurizer is always defined
    # (mirrors the CharFeaturizer branch in Example 4 below).
    print("Loading characters ...")
    text_featurizer = CharFeaturizer(config.decoder_config)

ASRTFRecordDataset(data_paths=transcripts,
                   tfrecords_dir=tfrecords_dir,
                   speech_featurizer=None,
                   text_featurizer=text_featurizer,
                   stage=args.mode,
                   shuffle=args.shuffle,
                   tfrecords_shards=args.tfrecords_shards).create_tfrecords()
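
# The parser used above is defined earlier in the original script. A hypothetical
# reconstruction, with flag names inferred from the `args.*` attributes referenced
# above; defaults are illustrative, not the library's.
import argparse

parser = argparse.ArgumentParser(prog="Create ASR TFRecords")
parser.add_argument("--config", type=str, default=None, help="Path to the model configuration file")
parser.add_argument("--mode", type=str, default="train", help="Dataset stage: train, eval or test")
parser.add_argument("--tfrecords_dir", type=str, default=None, help="Directory to write TFRecord shards into")
parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of TFRecord shards")
parser.add_argument("--shuffle", default=False, action="store_true", help="Shuffle the transcripts")
parser.add_argument("--sentence_piece", default=False, action="store_true", help="Use a SentencePiece model")
parser.add_argument("--subwords", type=str, default=None, help="Path to a saved subwords vocabulary")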
Example 4
# `args` comes from an argparse parser defined earlier in the original script (not shown).
from tensorflow_asr.utils import env_util

env_util.setup_devices([args.device], cpu=args.cpu)

import tensorflow as tf

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
from tensorflow_asr.models.ctc.jasper import Jasper
from tensorflow_asr.utils import app_util

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

if args.sentence_piece:
    print("Use SentencePiece ...")
    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
elif args.subwords:
    print("Use subwords ...")
    text_featurizer = SubwordFeaturizer(config.decoder_config)
else:
    print("Use characters ...")
    text_featurizer = CharFeaturizer(config.decoder_config)

tf.random.set_seed(0)

if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.test_dataset_config))
else:
    # Read examples directly from transcript slices when TFRecords are not used
    # (completes the branch, mirroring the TFRecord case above).
    test_dataset = ASRSliceDataset(
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        **vars(config.learning_config.test_dataset_config))
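
# The **vars(...) pattern above splats a config object's attributes into keyword
# arguments. A self-contained illustration; SimpleNamespace stands in for
# config.learning_config.test_dataset_config and the values are made up.
from types import SimpleNamespace

def make_dataset(data_paths, stage, shuffle, **kwargs):
    print(data_paths, stage, shuffle)

dataset_config = SimpleNamespace(data_paths=["/data/test.tsv"], stage="test", shuffle=False)
make_dataset(**vars(dataset_config))  # vars() -> attribute dict, ** splats it as kwargs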
Example 5
import argparse
import os

import tensorflow as tf

# Assumed definitions: setup_strategy lives in tensorflow_asr.utils.env_util in
# recent releases, and the fragment's undefined `logger` is taken to be
# TensorFlow's module logger.
from tensorflow_asr.utils.env_util import setup_strategy

logger = tf.get_logger()

DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                            "config.yml")

tf.keras.backend.clear_session()

parser = argparse.ArgumentParser(prog="Vocab Training with SentencePiece")

parser.add_argument("--config",
                    type=str,
                    default=DEFAULT_YAML,
                    help="The file path of model configuration file")

parser.add_argument("--devices",
                    type=int,
                    nargs="*",
                    default=[0],
                    help="Devices' ids to apply distributed training")

args = parser.parse_args()

strategy = setup_strategy(args.devices)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.text_featurizers import SentencePieceFeaturizer

config = Config(args.config)

logger.info("Generating subwords ...")
text_featurizer = SentencePieceFeaturizer.build_from_corpus(
    config.decoder_config)
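
# Once build_from_corpus has written the SentencePiece model under the configured
# output_path_prefix, the same vocabulary can be reloaded elsewhere, as Examples
# 1 and 2 do:
text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, None)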