def test_split_by_length_of_utterances(benchmark):
    """Benchmark splitting a synthetic corpus by utterance length.

    Builds a deterministic random corpus (fixed RNG seed 234), wraps it in
    a seeded Splitter and hands the actual split work to the pytest
    ``benchmark`` fixture via the module-level ``run`` helper.
    """
    rng = random.Random(x=234)
    corpus = resources.generate_corpus(
        179, (250, 500), (1, 9), (0, 6), (1, 20), rng)

    benchmark(run, subset.Splitter(corpus, random_seed=324))
# 예제 #2 (Example #2)
# 0
def create_train_dev_test(corpus):
    """
    Create train/dev/test subsets of the given corpus.
    Size is computed using length of the transcriptions.
    """

    # Dev and test each get MAX_DEV_TEST_DURATION worth of audio,
    # capped at 15 % of the corpus each; train receives the remainder.
    ratio = min(MAX_DEV_TEST_DURATION / corpus.total_duration, 0.15)

    proportions = {
        'train': 1.0 - 2 * ratio,
        'dev': ratio,
        'test': ratio,
    }

    subviews = subset.Splitter(corpus, SEED).split_by_label_length(
        proportions=proportions,
        label_list_idx=audiomate.corpus.LL_WORD_TRANSCRIPT,
        separate_issuers=True)

    return subviews['train'], subviews['dev'], subviews['test']
    # NOTE(review): this fragment is the tail of a corpus-preparation
    # function whose header (and the definitions of `corpora`,
    # `voxforge_path`, `swc_path` and `args`) is not visible here.

    # Optionally load the VoxForge corpus.
    if voxforge_path is not None:
        voxforge_corpus = audiomate.Corpus.load(voxforge_path,
                                                reader='voxforge')
        corpora.append(voxforge_corpus)

    # Optionally load the SWC corpus. It is read with the 'kaldi' reader
    # here — presumably the data was pre-converted to Kaldi format;
    # TODO confirm (elsewhere in this file SWC uses reader "swc").
    if swc_path is not None:
        swc_corpus = audiomate.Corpus.load(swc_path, reader='kaldi')
        corpora.append(swc_corpus)

    # At least one corpus must have been supplied.
    if len(corpora) <= 0:
        raise ValueError('No Corpus given!')

    # Merge all loaded corpora into one and normalize its transcriptions.
    merged_corpus = audiomate.Corpus.merge_corpora(corpora)
    clean_transcriptions(merged_corpus)

    # 70/15/15 split weighted by utterance length; separate_issuers=True
    # keeps all utterances of one speaker inside a single subset.
    splitter = subset.Splitter(merged_corpus, random_seed=38)
    splits = splitter.split_by_length_of_utterances(
        {
            'train': 0.7,
            'dev': 0.15,
            'test': 0.15
        }, separate_issuers=True)

    # Register the splits as named subviews on the merged corpus.
    merged_corpus.import_subview('train', splits['train'])
    merged_corpus.import_subview('dev', splits['dev'])
    merged_corpus.import_subview('test', splits['test'])

    # Export the merged, split corpus in Mozilla DeepSpeech format.
    deepspeech_writer = io.MozillaDeepSpeechWriter()
    deepspeech_writer.save(merged_corpus, args.target_path)
# 예제 #4 (Example #4)
# 0
def main():
    """Load the selected German speech corpora, merge them, split them
    into train/dev/test and export the result for Mozilla DeepSpeech.

    Command-line interface (unchanged):
        target_path      output path for the DeepSpeech files
        --tuda, --voxforge, --swc, --mailabs, --common_voice,
        --tatoeba, --css_german, --zamia_speech
                         optional filesystem paths to individual corpora

    Raises:
        ValueError: if no corpus path was supplied at all.
    """
    parser = argparse.ArgumentParser(description="Prepare data for training.")
    parser.add_argument("target_path", type=str)
    parser.add_argument("--tuda", type=str)
    parser.add_argument("--voxforge", type=str)
    parser.add_argument("--swc", type=str)
    parser.add_argument("--mailabs", type=str)
    parser.add_argument("--common_voice", type=str)
    parser.add_argument("--tatoeba", type=str)
    parser.add_argument("--css_german", type=str)
    parser.add_argument("--zamia_speech", type=str)

    args = parser.parse_args()

    # One entry per supported corpus: (display name, CLI value, audiomate
    # reader). Note css-german deliberately uses the "css10" reader.
    corpus_specs = [
        ("tuda", args.tuda, "tuda"),
        ("voxforge", args.voxforge, "voxforge"),
        ("swc", args.swc, "swc"),
        ("mailabs", args.mailabs, "mailabs"),
        ("common-voice", args.common_voice, "common-voice"),
        ("tatoeba", args.tatoeba, "tatoeba"),
        ("css-german", args.css_german, "css10"),
        ("zamia-speech", args.zamia_speech, "zamia-speech"),
    ]

    corpora = []
    for name, path, reader in corpus_specs:
        if path is not None:
            print("Loading {} ...".format(name))
            corpora.append(audiomate.Corpus.load(path, reader=reader))

    if not corpora:
        raise ValueError("No Corpus given!")

    # Merge everything into one corpus and normalize its transcriptions.
    merged_corpus = audiomate.Corpus.merge_corpora(corpora)
    clean_transcriptions(merged_corpus)

    print("Splitting corpus ...")
    splitter = subset.Splitter(merged_corpus, random_seed=38)
    split_sizes = {"train": 0.7, "dev": 0.15, "test": 0.15}
    # When css-german is the only corpus, split without separating
    # issuers — presumably because css10 has too few speakers for a
    # per-issuer split to work; TODO confirm against the css10 reader.
    css_only = args.css_german is not None and len(corpora) == 1
    splits = splitter.split(split_sizes, separate_issuers=not css_only)

    merged_corpus.import_subview("train", splits["train"])
    merged_corpus.import_subview("dev", splits["dev"])
    merged_corpus.import_subview("test", splits["test"])

    print("Saving corpus ...")
    deepspeech_writer = io.MozillaDeepSpeechWriter()
    deepspeech_writer.save(merged_corpus, args.target_path)