def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir):
    """A stereo corpus initializes and reports the expected feature dimension."""
    work_dir = os.path.join(temp_dir, 'stereo')
    lexicon = Dictionary(basic_dict_path, os.path.join(work_dir, 'basic'))
    lexicon.write()
    corpus = Corpus(stereo_corpus_dir, work_dir)
    corpus.initialize_corpus(lexicon)
    # get_feat_dim() returns the dimension as a string in this API version
    assert corpus.get_feat_dim() == '39'
def test_basic(basic_dict_path, basic_corpus_dir, generated_dir):
    """The basic corpus initializes and reports the expected feature dimension."""
    out_dir = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path, out_dir)
    lexicon.write()
    corpus = Corpus(basic_corpus_dir, out_dir)
    corpus.initialize_corpus(lexicon)
    # get_feat_dim() returns the dimension as a string in this API version
    assert corpus.get_feat_dim() == '39'
def test_basic(basic_dict_path, generated_dir):
    """The basic dictionary exposes the expected plain and positional phone sets."""
    d = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    d.write()
    assert set(d.phones) == {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
    # Every non-silence phone gets Begin/Internal/End/Singleton variants.
    expected_positional = {
        '{}_{}'.format(phone, position)
        for phone in ('phonea', 'phoneb', 'phonec')
        for position in ('B', 'I', 'E', 'S')
    }
    assert set(d.positional_nonsil_phones) == expected_positional
def test_basic(basic_dict_path, generated_dir):
    """The basic dictionary exposes the expected plain and positional phone sets."""
    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    lexicon.write()
    assert set(lexicon.phones) == {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
    # Positional variants: Begin / Internal / End / Singleton per non-silence phone.
    positional = {
        phone + '_' + suffix
        for phone in ('phonea', 'phoneb', 'phonec')
        for suffix in 'BIES'
    }
    assert set(lexicon.positional_nonsil_phones) == positional
def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir):
    """A stereo corpus generates 39-dimensional features."""
    work_dir = os.path.join(temp_dir, 'stereo')
    lexicon = Dictionary(basic_dict_path, os.path.join(work_dir, 'basic'))
    lexicon.write()
    corpus = Corpus(stereo_corpus_dir, work_dir)
    corpus.initialize_corpus(lexicon)
    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)
    assert corpus.get_feat_dim(feature_config) == 39
def test_basic(basic_dict_path, basic_corpus_dir, generated_dir):
    """The basic corpus generates 39-dimensional features."""
    out_dir = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path, out_dir)
    lexicon.write()
    corpus = Corpus(basic_corpus_dir, out_dir)
    corpus.initialize_corpus(lexicon)
    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)
    assert corpus.get_feat_dim(feature_config) == 39
def test_basic(basic_dict_path, basic_dir, generated_dir):
    """The basic corpus builds MFCCs and reports the expected feature dimension."""
    out_dir = os.path.join(generated_dir, "basic")
    lexicon = Dictionary(basic_dict_path, out_dir)
    lexicon.write()
    corpus = Corpus(basic_dir, out_dir)
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(lexicon)
    # get_feat_dim() returns the dimension as a string in this API version
    assert corpus.get_feat_dim() == "39"
def test_stereo(basic_dict_path, textgrid_directory, generated_dir):
    """A stereo TextGrid corpus builds MFCCs and reports the expected feature dimension."""
    work_dir = os.path.join(generated_dir, 'stereo')
    lexicon = Dictionary(basic_dict_path, os.path.join(work_dir, 'basic'))
    lexicon.write()
    corpus = Corpus(os.path.join(textgrid_directory, 'stereo'), work_dir)
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(lexicon)
    # get_feat_dim() returns the dimension as a string in this API version
    assert corpus.get_feat_dim() == '39'
def test_basic(basic_dict_path, basic_corpus_dir, generated_dir):
    """The basic corpus generates 39-dimensional features."""
    basic_out = os.path.join(generated_dir, 'basic')
    dictionary = Dictionary(basic_dict_path, basic_out)
    dictionary.write()
    corpus = Corpus(basic_corpus_dir, basic_out)
    corpus.initialize_corpus(dictionary)
    config = FeatureConfig()
    config.generate_features(corpus)
    assert corpus.get_feat_dim(config) == 39
def test_stereo(basic_dict_path, textgrid_directory, generated_dir):
    """A stereo TextGrid corpus builds MFCCs and reports the expected feature dimension."""
    stereo_temp = os.path.join(generated_dir, "stereo")
    dictionary = Dictionary(basic_dict_path, os.path.join(stereo_temp, "basic"))
    dictionary.write()
    stereo_corpus = Corpus(os.path.join(textgrid_directory, "stereo"), stereo_temp)
    stereo_corpus.write()
    stereo_corpus.create_mfccs()
    stereo_corpus.setup_splits(dictionary)
    # get_feat_dim() returns the dimension as a string in this API version
    assert stereo_corpus.get_feat_dim() == "39"
def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir):
    """A plain-text corpus has no untranscribed files and 39-dim features."""
    out_dir = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path, out_dir)
    lexicon.write()
    corpus = Corpus(basic_corpus_txt_dir, out_dir)
    # Every sound file should have an accompanying transcription.
    assert len(corpus.no_transcription_files) == 0
    corpus.initialize_corpus(lexicon)
    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)
    assert corpus.get_feat_dim(feature_config) == 39
def test_basic(basic_dict_path, basic_dir, generated_dir):
    """The basic corpus builds MFCCs and reports the expected feature dimension."""
    basic_out = os.path.join(generated_dir, 'basic')
    dictionary = Dictionary(basic_dict_path, basic_out)
    dictionary.write()
    corpus = Corpus(basic_dir, basic_out)
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(dictionary)
    # get_feat_dim() returns the dimension as a string in this API version
    assert corpus.get_feat_dim() == '39'
def align_corpus(corpus_dir, dict_path, output_directory, temp_dir,
                 output_model_path, args):
    """Train a mono/tri/tri-fmllr aligner on a corpus and export TextGrids.

    :param corpus_dir: directory containing the corpus to align
    :param dict_path: path to the pronunciation dictionary
    :param output_directory: where TextGrids and OOV reports are written
    :param temp_dir: working directory root ('' falls back to TEMP_DIR)
    :param output_model_path: if not None, the trained model is saved here
    :param args: parsed CLI namespace (clean, speaker_characters, num_jobs,
        fast, verbose)
    """
    if temp_dir == '':
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(temp_dir)
    corpus_name = os.path.basename(corpus_dir)
    if corpus_name == '':
        # A trailing path separator yields an empty basename; strip it.
        corpus_dir = os.path.dirname(corpus_dir)
        corpus_name = os.path.basename(corpus_dir)
    data_directory = os.path.join(temp_dir, corpus_name)
    if args.clean:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(output_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(output_directory, exist_ok=True)
    dictionary = Dictionary(dict_path, data_directory)
    dictionary.write()
    corpus = Corpus(corpus_dir, data_directory, args.speaker_characters,
                    num_jobs=args.num_jobs)
    print(corpus.speaker_utterance_info())
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(dictionary)
    # Copy any OOV reports next to the alignment output for the user.
    for report_name in ('utterance_oovs.txt', 'oovs_found.txt'):
        report_path = os.path.join(corpus.split_directory, report_name)
        if os.path.exists(report_path):
            shutil.copy(report_path, output_directory)
    # The same alignment-frequency switch applies to all three training
    # stages; build a fresh dict per stage so none of them share state.
    align_often = not args.fast
    a = TrainableAligner(corpus, dictionary, output_directory,
                         temp_directory=data_directory,
                         mono_params={'align_often': align_often},
                         tri_params={'align_often': align_often},
                         tri_fmllr_params={'align_often': align_often},
                         num_jobs=args.num_jobs)
    a.verbose = args.verbose
    # Export after every stage so partial results survive a later failure.
    a.train_mono()
    a.export_textgrids()
    a.train_tri()
    a.export_textgrids()
    a.train_tri_fmllr()
    a.export_textgrids()
    if output_model_path is not None:
        a.save(output_model_path)
def test_short_segments(basic_dict_path, shortsegments_corpus_dir, temp_dir):
    """One too-short segment is ignored; two utterances survive initialization."""
    work_dir = os.path.join(temp_dir, 'short_segments')
    lexicon = Dictionary(basic_dict_path, work_dir)
    lexicon.write()
    corpus = Corpus(shortsegments_corpus_dir, work_dir)
    corpus.initialize_corpus(lexicon)
    assert len(corpus.feat_mapping) == 2
    assert len(corpus.utt_speak_mapping) == 2
    assert len(corpus.speak_utt_mapping) == 1
    assert len(corpus.text_mapping) == 2
    assert len(corpus.utt_wav_mapping) == 1
    assert len(corpus.segments) == 2
    assert len(corpus.ignored_utterances) == 1
def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    """Subsetting the corpus creates both the split and the subset directories."""
    out_dir = os.path.join(temp_dir, 'large_subset')
    shutil.rmtree(out_dir, ignore_errors=True)
    lexicon = Dictionary(large_dataset_dictionary, out_dir)
    lexicon.write()
    corpus = Corpus(large_prosodylab_format_directory, out_dir)
    corpus.initialize_corpus(lexicon)
    split_dir = corpus.split_directory()
    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)
    subset_dir = corpus.subset_directory(10, feature_config)
    assert os.path.exists(split_dir)
    assert os.path.exists(subset_dir)
def test_short_segments(basic_dict_path, shortsegments_corpus_dir, temp_dir):
    """One too-short segment is ignored; three utterances survive, two get features."""
    segment_temp = os.path.join(temp_dir, 'short_segments')
    dictionary = Dictionary(basic_dict_path, segment_temp)
    dictionary.write()
    corpus = Corpus(shortsegments_corpus_dir, segment_temp)
    corpus.initialize_corpus(dictionary)
    config = FeatureConfig()
    config.generate_features(corpus)
    assert len(corpus.feat_mapping) == 2
    assert len(corpus.utt_speak_mapping) == 3
    assert len(corpus.speak_utt_mapping) == 1
    assert len(corpus.text_mapping) == 3
    assert len(corpus.utt_wav_mapping) == 1
    assert len(corpus.segments) == 3
    assert len(corpus.ignored_utterances) == 1
def test_basic_noposition(basic_dict_path, generated_dir):
    """With position_dependent_phones=False only the plain phone set is built."""
    d = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'),
                   position_dependent_phones=False)
    # The return value of write() was previously bound to an unused local.
    d.write()
    assert set(d.phones) == {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
def align_corpus(corpus_dir, dict_path, output_directory, temp_dir,
                 output_model_path, args):
    """Train a mono/tri/tri-fmllr aligner on a corpus and export TextGrids.

    The corpus is built first so the dictionary can be restricted to the
    corpus word set; OOV reports are copied into the output directory and
    TextGrids are exported after each training stage.
    """
    if temp_dir == '':
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(temp_dir)
    corpus_name = os.path.basename(corpus_dir)
    if corpus_name == '':
        # A trailing path separator yields an empty basename; strip it.
        corpus_dir = os.path.dirname(corpus_dir)
        corpus_name = os.path.basename(corpus_dir)
    data_directory = os.path.join(temp_dir, corpus_name)
    if args.clean:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(output_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(output_directory, exist_ok=True)
    corpus = Corpus(corpus_dir, data_directory, args.speaker_characters,
                    num_jobs=args.num_jobs)
    print(corpus.speaker_utterance_info())
    corpus.write()
    corpus.create_mfccs()
    # The dictionary is limited to the words actually seen in the corpus.
    dictionary = Dictionary(dict_path, data_directory, word_set=corpus.word_set)
    dictionary.write()
    corpus.setup_splits(dictionary)
    # Copy any OOV reports next to the alignment output for the user.
    for report_name in ('utterance_oovs.txt', 'oovs_found.txt'):
        report_path = os.path.join(corpus.split_directory, report_name)
        if os.path.exists(report_path):
            shutil.copy(report_path, output_directory)
    mono_params = {'align_often': not args.fast}
    tri_params = {'align_often': not args.fast}
    tri_fmllr_params = {'align_often': not args.fast}
    aligner = TrainableAligner(corpus, dictionary, output_directory,
                               temp_directory=data_directory,
                               mono_params=mono_params,
                               tri_params=tri_params,
                               tri_fmllr_params=tri_fmllr_params,
                               num_jobs=args.num_jobs)
    aligner.verbose = args.verbose
    # Export after every stage so partial results survive a later failure.
    aligner.train_mono()
    aligner.export_textgrids()
    aligner.train_tri()
    aligner.export_textgrids()
    aligner.train_tri_fmllr()
    aligner.export_textgrids()
    if output_model_path is not None:
        aligner.save(output_model_path)
def test_speaker_groupings(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    """Speaker and file groupings cover every speaker/utterance for 1 and 2 jobs."""
    output_directory = os.path.join(temp_dir, 'large')
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon = Dictionary(large_dataset_dictionary, output_directory)
    lexicon.write()

    def check_groupings(corpus):
        # Every speaker directory must land in some speaker group.
        for speaker in os.listdir(large_prosodylab_format_directory):
            assert any(speaker in group for group in corpus.speaker_groups)
        # Every file (by base name) must appear in the job groups and
        # in the feature mapping.
        for collection in (corpus.groups, corpus.feat_mapping):
            for root, dirs, files in os.walk(large_prosodylab_format_directory):
                for f in files:
                    name, ext = os.path.splitext(f)
                    assert any(name in item for item in collection)

    corpus = Corpus(large_prosodylab_format_directory, output_directory)
    corpus.initialize_corpus(lexicon)
    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)
    check_groupings(corpus)

    # Re-run from scratch with two jobs; the invariants must still hold.
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon.write()
    corpus = Corpus(large_prosodylab_format_directory, output_directory, num_jobs=2)
    corpus.initialize_corpus(lexicon)
    feature_config.generate_features(corpus)
    check_groupings(corpus)
def test_frclitics(frclitics_dict_path, generated_dir):
    """Clitic separation splits clitics and hyphenated numbers, leaves other words whole."""
    d = Dictionary(frclitics_dict_path, os.path.join(generated_dir, 'frclitics'))
    # The return value of write() was previously bound to an unused local.
    d.write()
    assert d.separate_clitics('aujourd') == ['aujourd']
    assert d.separate_clitics("aujourd'hui") == ["aujourd'hui"]
    assert d.separate_clitics('vingt-six') == ['vingt', 'six']
    assert d.separate_clitics("m'appelle") == ["m'", 'appelle']
    assert d.separate_clitics("c'est") == ["c'est"]
    assert d.separate_clitics('purple-people-eater') == ['purple-people-eater']
    assert d.separate_clitics("m'appele") == ["m'", 'appele']
    assert d.separate_clitics("m'ving-sic") == ["m'", 'ving', 'sic']
    assert d.separate_clitics("flying'purple-people-eater") == ["flying'purple-people-eater"]
def test_frclitics(frclitics_dict_path, generated_dir):
    """Clitic separation splits clitics and hyphenated numbers, leaves other words whole."""
    lexicon = Dictionary(frclitics_dict_path, os.path.join(generated_dir, 'frclitics'))
    # The return value of write() was previously bound to an unused local.
    lexicon.write()
    assert lexicon.separate_clitics('aujourd') == ['aujourd']
    assert lexicon.separate_clitics("aujourd'hui") == ["aujourd'hui"]
    assert lexicon.separate_clitics('vingt-six') == ['vingt', 'six']
    assert lexicon.separate_clitics("m'appelle") == ["m'", 'appelle']
    assert lexicon.separate_clitics("c'est") == ["c'est"]
    assert lexicon.separate_clitics('purple-people-eater') == ['purple-people-eater']
    assert lexicon.separate_clitics("m'appele") == ["m'", 'appele']
    assert lexicon.separate_clitics("m'ving-sic") == ["m'", 'ving', 'sic']
    assert lexicon.separate_clitics("flying'purple-people-eater") == ["flying'purple-people-eater"]
def test_extra_annotations(extra_annotations_path, generated_dir):
    """A dictionary with extra annotations picks up '{' as a grapheme."""
    extra = Dictionary(extra_annotations_path, os.path.join(generated_dir, 'extra'))
    assert '{' in extra.graphemes
    extra.write()
def sick_dict(sick_dict_path, generated_dir):
    """Fixture: write and return the dictionary for the 'sick' corpus."""
    out_dir = os.path.join(generated_dir, 'sickcorpus')
    sick = Dictionary(sick_dict_path, out_dir)
    sick.write()
    return sick
def test_basic_noposition(basic_dict_path, generated_dir):
    """With position_dependent_phones=False only the plain phone set is built."""
    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'),
                         position_dependent_phones=False)
    # The return value of write() was previously bound to an unused local.
    lexicon.write()
    assert set(lexicon.phones) == {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
def test_extra_annotations(extra_annotations_path, generated_dir):
    """Graphemes from annotation markup (e.g. '{') are recorded by the dictionary."""
    annotated = Dictionary(extra_annotations_path,
                           os.path.join(generated_dir, 'extra'))
    assert '{' in annotated.graphemes
    annotated.write()