def test_split_by_length_of_utterances(benchmark):
    corpus = resources.generate_corpus(
        179,
        (250, 500),
        (1, 9),
        (0, 6),
        (1, 20),
        random.Random(x=234)
    )
    splitter = subset.Splitter(corpus, random_seed=324)
    benchmark(run, splitter)
import audiomate
from audiomate.corpus import subset


def create_train_dev_test(corpus):
    """
    Create train/dev/test subsets of the given corpus.
    Subset sizes are computed from the length of the transcriptions.
    """
    # MAX_DEV_TEST_DURATION and SEED are module-level constants defined elsewhere.
    total_duration = corpus.total_duration
    test_dev_train_ratio = MAX_DEV_TEST_DURATION / total_duration

    if test_dev_train_ratio > 0.15:
        test_dev_train_ratio = 0.15

    splitter = subset.Splitter(corpus, SEED)
    subviews = splitter.split_by_label_length(
        proportions={
            'train': 1.0 - (2 * test_dev_train_ratio),
            'dev': test_dev_train_ratio,
            'test': test_dev_train_ratio,
        },
        label_list_idx=audiomate.corpus.LL_WORD_TRANSCRIPT,
        separate_issuers=True
    )

    return subviews['train'], subviews['dev'], subviews['test']
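# A minimal usage sketch (not part of the original module): the corpus path and
# the 'common-voice' reader are placeholders, and _example_usage is a
# hypothetical helper. It assumes MAX_DEV_TEST_DURATION and SEED are defined at
# module level, as create_train_dev_test requires.
def _example_usage(corpus_path):
    corpus = audiomate.Corpus.load(corpus_path, reader='common-voice')
    train, dev, test = create_train_dev_test(corpus)

    # Register the subsets on the corpus so a writer can export them later.
    corpus.import_subview('train', train)
    corpus.import_subview('dev', dev)
    corpus.import_subview('test', test)
    return corpus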
if voxforge_path is not None:
    voxforge_corpus = audiomate.Corpus.load(voxforge_path, reader='voxforge')
    corpora.append(voxforge_corpus)

if swc_path is not None:
    swc_corpus = audiomate.Corpus.load(swc_path, reader='kaldi')
    corpora.append(swc_corpus)

if len(corpora) <= 0:
    raise ValueError('No Corpus given!')

merged_corpus = audiomate.Corpus.merge_corpora(corpora)
clean_transcriptions(merged_corpus)

splitter = subset.Splitter(merged_corpus, random_seed=38)
splits = splitter.split_by_length_of_utterances({
    'train': 0.7,
    'dev': 0.15,
    'test': 0.15
}, separate_issuers=True)

merged_corpus.import_subview('train', splits['train'])
merged_corpus.import_subview('dev', splits['dev'])
merged_corpus.import_subview('test', splits['test'])

deepspeech_writer = io.MozillaDeepSpeechWriter()
deepspeech_writer.save(merged_corpus, args.target_path)
import argparse

import audiomate
from audiomate.corpus import io
from audiomate.corpus import subset


def main():
    parser = argparse.ArgumentParser(description="Prepare data for training.")
    parser.add_argument("target_path", type=str)
    parser.add_argument("--tuda", type=str)
    parser.add_argument("--voxforge", type=str)
    parser.add_argument("--swc", type=str)
    parser.add_argument("--mailabs", type=str)
    parser.add_argument("--common_voice", type=str)
    parser.add_argument("--tatoeba", type=str)
    parser.add_argument("--css_german", type=str)
    parser.add_argument("--zamia_speech", type=str)
    args = parser.parse_args()

    tuda_path = args.tuda
    voxforge_path = args.voxforge
    swc_path = args.swc
    mailabs_path = args.mailabs
    cv_path = args.common_voice
    tatoeba_path = args.tatoeba
    css_path = args.css_german
    zs_path = args.zamia_speech

    # Load every corpus for which a path was given.
    corpora = []

    if tuda_path is not None:
        print("Loading tuda ...")
        corpus = audiomate.Corpus.load(tuda_path, reader="tuda")
        corpora.append(corpus)

    if voxforge_path is not None:
        print("Loading voxforge ...")
        corpus = audiomate.Corpus.load(voxforge_path, reader="voxforge")
        corpora.append(corpus)

    if swc_path is not None:
        print("Loading swc ...")
        corpus = audiomate.Corpus.load(swc_path, reader="swc")
        corpora.append(corpus)

    if mailabs_path is not None:
        print("Loading mailabs ...")
        corpus = audiomate.Corpus.load(mailabs_path, reader="mailabs")
        corpora.append(corpus)

    if cv_path is not None:
        print("Loading common-voice ...")
        corpus = audiomate.Corpus.load(cv_path, reader="common-voice")
        corpora.append(corpus)

    if tatoeba_path is not None:
        print("Loading tatoeba ...")
        corpus = audiomate.Corpus.load(tatoeba_path, reader="tatoeba")
        corpora.append(corpus)

    if css_path is not None:
        print("Loading css-german ...")
        corpus = audiomate.Corpus.load(css_path, reader="css10")
        corpora.append(corpus)

    if zs_path is not None:
        print("Loading zamia-speech ...")
        corpus = audiomate.Corpus.load(zs_path, reader="zamia-speech")
        corpora.append(corpus)

    if len(corpora) <= 0:
        raise ValueError("No Corpus given!")

    # Merge all loaded corpora and normalize the transcriptions
    # (clean_transcriptions is defined elsewhere in this script).
    merged_corpus = audiomate.Corpus.merge_corpora(corpora)
    clean_transcriptions(merged_corpus)

    print("Splitting corpus ...")
    splitter = subset.Splitter(merged_corpus, random_seed=38)
    split_sizes = {"train": 0.7, "dev": 0.15, "test": 0.15}

    # CSS10 contains only a single speaker, so issuers cannot be kept separate
    # when it is the only corpus.
    if css_path is not None and len(corpora) == 1:
        splits = splitter.split(split_sizes, separate_issuers=False)
    else:
        splits = splitter.split(split_sizes, separate_issuers=True)

    merged_corpus.import_subview("train", splits["train"])
    merged_corpus.import_subview("dev", splits["dev"])
    merged_corpus.import_subview("test", splits["test"])

    print("Saving corpus ...")
    deepspeech_writer = io.MozillaDeepSpeechWriter()
    deepspeech_writer.save(merged_corpus, args.target_path)
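# A minimal entry point, assuming this module is meant to be run as a script;
# the guard below is not part of the original snippet, and "prepare_data.py" in
# the example invocation is a placeholder name.
#
#   python prepare_data.py /data/deepspeech_corpus --voxforge /data/voxforge --swc /data/swc
if __name__ == '__main__':
    main()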