def test_speaker_groupings(large_prosodylab_format_directory, generated_dir):
    """Check speaker and file groupings with the default job count and with two jobs.

    For each corpus, every entry of the corpus directory listing must be
    contained (``in``) in at least one element of ``speaker_groups``, and
    every file stem found by walking the corpus directory must be contained
    in at least one element of ``groups`` and, after ``create_mfccs()``, of
    ``feat_mapping``.
    """
    corpus_root = large_prosodylab_format_directory
    output_directory = os.path.join(generated_dir, 'large')

    def file_stems():
        # Yield the extension-less name of every file below the corpus root.
        for _root, _dirs, filenames in os.walk(corpus_root):
            for filename in filenames:
                yield os.path.splitext(filename)[0]

    def check_groupings(corpus):
        for speaker in speakers:
            assert any(speaker in group for group in corpus.speaker_groups)
        for stem in file_stems():
            assert any(stem in group for group in corpus.groups)
        corpus.create_mfccs()
        for stem in file_stems():
            assert any(stem in entry for entry in corpus.feat_mapping)

    # First pass: default number of jobs, starting from a clean output directory.
    shutil.rmtree(output_directory, ignore_errors=True)
    default_jobs_corpus = Corpus(corpus_root, output_directory)
    speakers = os.listdir(corpus_root)
    check_groupings(default_jobs_corpus)

    # Second pass: same checks with two jobs, again from a clean directory.
    shutil.rmtree(output_directory, ignore_errors=True)
    check_groupings(Corpus(corpus_root, output_directory, num_jobs=2))
def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir):
    """Initialise the stereo corpus with the basic dictionary and check its feature dimension."""
    work_dir = os.path.join(temp_dir, 'stereo')

    lexicon = Dictionary(basic_dict_path, os.path.join(work_dir, 'basic'))
    lexicon.write()

    corpus = Corpus(stereo_corpus_dir, work_dir)
    corpus.initialize_corpus(lexicon)

    # get_feat_dim() is compared against the string '39', not the integer.
    feat_dim = corpus.get_feat_dim()
    assert feat_dim == '39'
def test_basic(basic_dict_path, basic_corpus_dir, generated_dir):
    """Initialise the basic corpus with the basic dictionary and check its feature dimension."""
    # The dictionary and the corpus share the same 'basic' output directory.
    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    lexicon.write()

    corpus = Corpus(basic_corpus_dir, os.path.join(generated_dir, 'basic'))
    corpus.initialize_corpus(lexicon)

    # get_feat_dim() is compared against the string '39', not the integer.
    feat_dim = corpus.get_feat_dim()
    assert feat_dim == '39'
def validate_corpus(args):
    """Build a corpus and dictionary from ``args`` and run ``CorpusValidator`` on them.

    Reads ``temp_directory``, ``corpus_directory``, ``speaker_characters`` and
    ``dictionary_path`` from ``args``; ``num_jobs``, ``ignore_acoustics`` and
    ``test_transcriptions`` are optional and default to 3, False and False.
    The per-corpus working directory is always deleted and recreated first.
    """
    temp_dir = os.path.expanduser(args.temp_directory) if args.temp_directory else TEMP_DIR

    # A trailing path separator yields an empty basename; strip it and retry.
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)

    data_directory = os.path.join(temp_dir, corpus_name)
    shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)

    corpus = Corpus(
        args.corpus_directory,
        data_directory,
        speaker_characters=args.speaker_characters,
        num_jobs=getattr(args, 'num_jobs', 3),
    )
    dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set)

    validator = CorpusValidator(
        corpus,
        dictionary,
        temp_directory=data_directory,
        ignore_acoustics=getattr(args, 'ignore_acoustics', False),
        test_transcriptions=getattr(args, 'test_transcriptions', False),
    )
    validator.validate()
def generate_orthography_dict(args):
    """Write a dictionary whose "pronunciation" for each word is its own characters.

    ``args.input_path`` is either a corpus directory (words come from
    ``get_word_set(corpus, args.include_bracketed)``) or a UTF-8 text file of
    whitespace-separated words. One line ``<word> <c1> <c2> ...`` is written
    to ``args.output_path`` for every word, in sorted order, except words
    starting with ``[``, ``{`` or ``<``, which are skipped.

    Args:
        args: namespace providing ``temp_directory``, ``input_path``,
            ``output_path`` and (directory input only) ``include_bracketed``.
    """
    if not args.temp_directory:
        temp_dir = os.path.join(TEMP_DIR, 'G2P')
    else:
        temp_dir = os.path.expanduser(args.temp_directory)

    if os.path.isdir(args.input_path):
        input_dir = os.path.expanduser(args.input_path)
        # A trailing path separator yields an empty basename; strip it and retry.
        corpus_name = os.path.basename(args.input_path)
        if corpus_name == '':
            args.input_path = os.path.dirname(args.input_path)
            corpus_name = os.path.basename(args.input_path)
        data_directory = os.path.join(temp_dir, corpus_name)
        corpus = Corpus(input_dir, data_directory)
        word_set = get_word_set(corpus, args.include_bracketed)
    else:
        word_set = set()
        with open(args.input_path, 'r', encoding='utf8') as f:
            for line in f:
                word_set.update(line.strip().split())

    bracket_openers = ('[', '{', '<')
    with open(args.output_path, "w", encoding='utf8') as f:
        for word in sorted(word_set):
            # Skip bracketed tokens; the empty-word guard avoids indexing into
            # an empty string if the word set ever contains one.
            if not word or word.startswith(bracket_openers):
                continue
            f.write('{} {}\n'.format(word, ' '.join(word)))
def generate_g2p_dict(args):
    """Generate a pronunciation dictionary with a G2P model.

    The word set comes from a corpus directory (via ``get_word_set``) or from
    a UTF-8 text file of whitespace-separated words, depending on whether
    ``args.input_path`` is a directory. The words are handed to
    ``PhonetisaurusDictionaryGenerator`` together with the model loaded from
    ``args.g2p_model_path``; output goes to ``args.output_path``.
    """
    if args.temp_directory:
        temp_dir = os.path.expanduser(args.temp_directory)
    else:
        temp_dir = os.path.join(TEMP_DIR, 'G2P')

    if not os.path.isdir(args.input_path):
        # Plain word list: collect every whitespace-separated token.
        with open(args.input_path, 'r', encoding='utf8') as source:
            word_set = {token for line in source for token in line.split()}
    else:
        input_dir = os.path.expanduser(args.input_path)
        # A trailing path separator yields an empty basename; strip it and retry.
        corpus_name = os.path.basename(args.input_path)
        if corpus_name == '':
            args.input_path = os.path.dirname(args.input_path)
            corpus_name = os.path.basename(args.input_path)
        corpus = Corpus(input_dir, os.path.join(temp_dir, corpus_name))
        word_set = get_word_set(corpus, args.include_bracketed)

    generator = PhonetisaurusDictionaryGenerator(
        G2PModel(args.g2p_model_path),
        word_set,
        args.output_path,
        temp_directory=temp_dir,
    )
    generator.generate()
def sick_corpus(sick_dict, basic_dir, generated_dir):
    """Return a two-job corpus over ``basic_dir``, written out and split with ``sick_dict``."""
    prepared = Corpus(
        basic_dir,
        os.path.join(generated_dir, 'sickcorpus'),
        num_jobs=2,
    )
    prepared.write()
    prepared.create_mfccs()
    prepared.setup_splits(sick_dict)
    return prepared
def test_acoustic(basic_corpus_dir, generated_dir):
    """Check the character-level entries produced by ``no_dictionary`` for the basic corpus."""
    work_dir = os.path.join(generated_dir, 'acoustic')
    corpus = Corpus(basic_corpus_dir, work_dir)
    lexicon = no_dictionary(corpus, work_dir)
    corpus.initialize_corpus(lexicon)

    words = lexicon.words
    # The first element of the first entry is the tuple of the word's letters.
    assert words['should'][0][0] == ('s', 'h', 'o', 'u', 'l', 'd')
    assert '<vocnoise>' not in words
    # The apostrophe does not appear in the expected letters.
    assert words['here\'s'][0][0] == ('h', 'e', 'r', 'e', 's')
def test_vietnamese(vietnamese_corpus_dir, temp_dir):
    """Check the character-level entries produced by ``no_dictionary`` for the Vietnamese corpus."""
    work_dir = os.path.join(temp_dir, 'vietnamese')
    corpus = Corpus(vietnamese_corpus_dir, work_dir)
    lexicon = no_dictionary(corpus, work_dir)
    corpus.initialize_corpus(lexicon)

    words = lexicon.words
    # Each expected element is one character as written, diacritics included.
    assert words['chăn'][0][0] == ('c', 'h', 'ă', 'n')
    assert '<vocnoise>' not in words
    assert words['tập'][0][0] == ('t', 'ậ', 'p')
def test_stereo(basic_dict_path, textgrid_directory, generated_dir):
    """Write, featurise and split the stereo TextGrid corpus, then check its feature dimension."""
    work_dir = os.path.join(generated_dir, 'stereo')

    lexicon = Dictionary(basic_dict_path, os.path.join(work_dir, 'basic'))
    lexicon.write()

    stereo_source = os.path.join(textgrid_directory, 'stereo')
    corpus = Corpus(stereo_source, work_dir)
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(lexicon)

    # get_feat_dim() is compared against the string '39', not the integer.
    feat_dim = corpus.get_feat_dim()
    assert feat_dim == '39'
def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir):
    """Generate features for the stereo corpus and check the feature dimension is 39."""
    work_dir = os.path.join(temp_dir, 'stereo')

    lexicon = Dictionary(basic_dict_path, os.path.join(work_dir, 'basic'))
    lexicon.write()

    corpus = Corpus(stereo_corpus_dir, work_dir)
    corpus.initialize_corpus(lexicon)

    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)

    feat_dim = corpus.get_feat_dim(feature_config)
    assert feat_dim == 39
def test_basic(basic_dict_path, basic_corpus_dir, generated_dir):
    """Generate features for the basic corpus and check the feature dimension is 39."""
    # The dictionary and the corpus share the same 'basic' output directory.
    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    lexicon.write()

    corpus = Corpus(basic_corpus_dir, os.path.join(generated_dir, 'basic'))
    corpus.initialize_corpus(lexicon)

    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)

    feat_dim = corpus.get_feat_dim(feature_config)
    assert feat_dim == 39
def test_acoustic(basic_dir, generated_dir):
    """Check the letter-list entries produced by ``no_dictionary`` after writing and splitting."""
    work_dir = os.path.join(generated_dir, 'acoustic')
    corpus = Corpus(basic_dir, work_dir)
    corpus.write()
    corpus.create_mfccs()

    lexicon = no_dictionary(corpus, work_dir)
    corpus.setup_splits(lexicon)

    words = lexicon.words
    # Each word maps to a list holding a single list of its letters.
    assert words['should'] == [['s', 'h', 'o', 'u', 'l', 'd']]
    assert '<vocnoise>' not in words
    # The apostrophe does not appear in the expected letters.
    assert words['here\'s'] == [['h', 'e', 'r', 'e', 's']]
def test_vietnamese(textgrid_directory, generated_dir):
    """Check the letter-list entries produced by ``no_dictionary`` for the Vietnamese TextGrids."""
    work_dir = os.path.join(generated_dir, 'vietnamese')
    vietnamese_source = os.path.join(textgrid_directory, 'vietnamese')

    corpus = Corpus(vietnamese_source, work_dir)
    corpus.write()
    corpus.create_mfccs()

    lexicon = no_dictionary(corpus, work_dir)
    corpus.setup_splits(lexicon)

    words = lexicon.words
    # Each expected element is one character as written, diacritics included.
    assert words['chăn'] == [['c', 'h', 'ă', 'n']]
    assert '<vocnoise>' not in words
    assert words['tập'] == [['t', 'ậ', 'p']]
def test_basic(basic_dict_path, basic_dir, generated_dir):
    """Write, featurise and split the basic corpus, then check its feature dimension."""
    # The dictionary and the corpus share the same 'basic' output directory.
    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    lexicon.write()

    corpus = Corpus(basic_dir, os.path.join(generated_dir, 'basic'))
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(lexicon)

    # get_feat_dim() is compared against the string '39', not the integer.
    feat_dim = corpus.get_feat_dim()
    assert feat_dim == '39'
def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir):
    """Load the .txt-based basic corpus: no files without transcription, feature dimension 39."""
    # The dictionary and the corpus share the same 'basic' output directory.
    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    lexicon.write()

    corpus = Corpus(basic_corpus_txt_dir, os.path.join(generated_dir, 'basic'))
    # Checked before initialisation, straight after the corpus is constructed.
    assert len(corpus.no_transcription_files) == 0
    corpus.initialize_corpus(lexicon)

    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)

    feat_dim = corpus.get_feat_dim(feature_config)
    assert feat_dim == 39
def align_corpus(corpus_dir, dict_path, output_directory, temp_dir, output_model_path, args):
    """Train monophone, triphone and fMLLR models on a corpus, exporting TextGrids after each.

    Args:
        corpus_dir: corpus location; a trailing separator is tolerated.
        dict_path: path handed to ``Dictionary``.
        output_directory: where TextGrids and OOV reports are placed.
        temp_dir: working root; the empty string selects ``TEMP_DIR``.
        output_model_path: if not None, the trained aligner is saved there.
        args: namespace providing ``clean``, ``speaker_characters``,
            ``num_jobs``, ``fast`` and ``verbose``.
    """
    temp_dir = TEMP_DIR if temp_dir == '' else os.path.expanduser(temp_dir)

    # A trailing path separator yields an empty basename; strip it and retry.
    corpus_name = os.path.basename(corpus_dir)
    if corpus_name == '':
        corpus_dir = os.path.dirname(corpus_dir)
        corpus_name = os.path.basename(corpus_dir)
    data_directory = os.path.join(temp_dir, corpus_name)

    if args.clean:
        for stale in (data_directory, output_directory):
            shutil.rmtree(stale, ignore_errors=True)
    for needed in (data_directory, output_directory):
        os.makedirs(needed, exist_ok=True)

    dictionary = Dictionary(dict_path, data_directory)
    dictionary.write()

    corpus = Corpus(corpus_dir, data_directory, args.speaker_characters, num_jobs=args.num_jobs)
    print(corpus.speaker_utterance_info())
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(dictionary)

    # Copy the out-of-vocabulary reports next to the output, when they exist.
    for report_name in ('utterance_oovs.txt', 'oovs_found.txt'):
        report_path = os.path.join(corpus.split_directory, report_name)
        if os.path.exists(report_path):
            shutil.copy(report_path, output_directory)

    # Three independent parameter dicts, one per training stage.
    mono_params, tri_params, tri_fmllr_params = (
        {'align_often': not args.fast} for _ in range(3)
    )
    aligner = TrainableAligner(
        corpus,
        dictionary,
        output_directory,
        temp_directory=data_directory,
        mono_params=mono_params,
        tri_params=tri_params,
        tri_fmllr_params=tri_fmllr_params,
        num_jobs=args.num_jobs,
    )
    aligner.verbose = args.verbose

    aligner.train_mono()
    aligner.export_textgrids()
    aligner.train_tri()
    aligner.export_textgrids()
    aligner.train_tri_fmllr()
    aligner.export_textgrids()

    if output_model_path is not None:
        aligner.save(output_model_path)
def test_speaker_groupings(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    """Check speaker and file groupings with the default job count and with two jobs.

    For each corpus, every entry of the corpus directory listing must be
    contained (``in``) in at least one element of ``speaker_groups``, and
    every file stem found by walking the corpus directory must be contained
    in at least one element of ``groups`` and of ``feat_mapping``.
    """
    corpus_root = large_prosodylab_format_directory
    output_directory = os.path.join(temp_dir, 'large')

    def file_stems():
        # Yield the extension-less name of every file below the corpus root.
        for _root, _dirs, filenames in os.walk(corpus_root):
            for filename in filenames:
                yield os.path.splitext(filename)[0]

    def check_groupings(corpus):
        for speaker in speakers:
            assert any(speaker in group for group in corpus.speaker_groups)
        for stem in file_stems():
            assert any(stem in group for group in corpus.groups)
        for stem in file_stems():
            assert any(stem in entry for entry in corpus.feat_mapping)

    # First pass: default number of jobs, starting from a clean output directory.
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon = Dictionary(large_dataset_dictionary, output_directory)
    lexicon.write()
    default_jobs_corpus = Corpus(corpus_root, output_directory)
    default_jobs_corpus.initialize_corpus(lexicon)
    feature_config = FeatureConfig()
    feature_config.generate_features(default_jobs_corpus)
    speakers = os.listdir(corpus_root)
    check_groupings(default_jobs_corpus)

    # Second pass: wipe the output, rewrite the dictionary and use two jobs.
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon.write()
    two_jobs_corpus = Corpus(corpus_root, output_directory, num_jobs=2)
    two_jobs_corpus.initialize_corpus(lexicon)
    feature_config.generate_features(two_jobs_corpus)
    check_groupings(two_jobs_corpus)
def align_corpus(model_path, corpus_dir, output_directory, temp_dir, args, debug = False):
    """Align a corpus with a pretrained model archive and export TextGrids.

    Args:
        model_path: path handed to ``Archive``.
        corpus_dir: corpus location; a trailing separator is tolerated.
        output_directory: where TextGrids and OOV reports are placed.
        temp_dir: working root; the empty string selects ``TEMP_DIR``.
        args: namespace providing ``clean``, ``speaker_characters``,
            ``num_jobs``, ``no_speaker_adaptation`` and ``verbose``.
        debug: when true, print the elapsed time of each stage.
    """
    all_begin = time.time()

    def report(template, started):
        # Print the time elapsed since ``started``, but only in debug mode.
        if debug:
            print(template.format(time.time() - started))

    temp_dir = TEMP_DIR if temp_dir == '' else os.path.expanduser(temp_dir)

    # A trailing path separator yields an empty basename; strip it and retry.
    corpus_name = os.path.basename(corpus_dir)
    if corpus_name == '':
        corpus_dir = os.path.dirname(corpus_dir)
        corpus_name = os.path.basename(corpus_dir)
    data_directory = os.path.join(temp_dir, corpus_name)

    if args.clean:
        for stale in (data_directory, output_directory):
            shutil.rmtree(stale, ignore_errors=True)
    for needed in (data_directory, output_directory):
        os.makedirs(needed, exist_ok=True)

    stage_start = time.time()
    corpus = Corpus(corpus_dir, data_directory, args.speaker_characters, num_jobs=args.num_jobs)
    print(corpus.speaker_utterance_info())
    corpus.write()
    report('Wrote corpus information in {} seconds', stage_start)

    stage_start = time.time()
    corpus.create_mfccs()
    report('Calculated mfccs in {} seconds', stage_start)

    archive = Archive(model_path)
    stage_start = time.time()
    aligner = PretrainedAligner(
        archive,
        corpus,
        output_directory,
        temp_directory=data_directory,
        num_jobs=args.num_jobs,
        speaker_independent=args.no_speaker_adaptation,
    )
    report('Setup pretrained aligner in {} seconds', stage_start)
    aligner.verbose = args.verbose

    stage_start = time.time()
    corpus.setup_splits(aligner.dictionary)
    report('Setup splits in {} seconds', stage_start)

    # Copy the out-of-vocabulary reports next to the output, when they exist.
    for report_name in ('utterance_oovs.txt', 'oovs_found.txt'):
        report_path = os.path.join(corpus.split_directory, report_name)
        if os.path.exists(report_path):
            shutil.copy(report_path, output_directory)

    stage_start = time.time()
    aligner.do_align()
    report('Performed alignment in {} seconds', stage_start)

    stage_start = time.time()
    aligner.export_textgrids()
    report('Exported textgrids in {} seconds', stage_start)

    print('Done! Everything took {} seconds'.format(time.time() - all_begin))
def test_short_segments(textgrid_directory, generated_dir):
    """Check the mapping sizes and the ignored-utterance count for the short-segments corpus."""
    work_dir = os.path.join(generated_dir, 'short_segments')
    corpus = Corpus(os.path.join(textgrid_directory, 'short_segments'), work_dir)
    corpus.write()
    corpus.create_mfccs()

    # (attribute name, expected number of keys)
    expected_key_counts = (
        ('feat_mapping', 2),
        ('utt_speak_mapping', 2),
        ('speak_utt_mapping', 1),
        ('text_mapping', 2),
        ('utt_wav_mapping', 1),
        ('segments', 2),
    )
    for attribute, expected in expected_key_counts:
        assert len(getattr(corpus, attribute).keys()) == expected

    assert len(corpus.ignored_utterances) == 1
def test_short_segments(basic_dict_path, shortsegments_corpus_dir, temp_dir):
    """Check the mapping sizes and the ignored-utterance count after initialising the corpus."""
    work_dir = os.path.join(temp_dir, 'short_segments')

    lexicon = Dictionary(basic_dict_path, work_dir)
    lexicon.write()

    corpus = Corpus(shortsegments_corpus_dir, work_dir)
    corpus.initialize_corpus(lexicon)

    # (attribute name, expected number of keys)
    expected_key_counts = (
        ('feat_mapping', 2),
        ('utt_speak_mapping', 2),
        ('speak_utt_mapping', 1),
        ('text_mapping', 2),
        ('utt_wav_mapping', 1),
        ('segments', 2),
    )
    for attribute, expected in expected_key_counts:
        assert len(getattr(corpus, attribute).keys()) == expected

    assert len(corpus.ignored_utterances) == 1
def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    """Check that the split directory and a subset directory for 10 exist on disk."""
    work_dir = os.path.join(temp_dir, 'large_subset')
    shutil.rmtree(work_dir, ignore_errors=True)

    lexicon = Dictionary(large_dataset_dictionary, work_dir)
    lexicon.write()

    corpus = Corpus(large_prosodylab_format_directory, work_dir)
    corpus.initialize_corpus(lexicon)
    # The split directory is requested before features are generated.
    split_path = corpus.split_directory()

    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)
    subset_path = corpus.subset_directory(10, feature_config)

    assert os.path.exists(split_path)
    assert os.path.exists(subset_path)
def generate_dict(args):
    """Generate a pronunciation dictionary for a corpus's word set with a G2P model.

    Reads ``temp_directory``, ``corpus_directory``, ``g2p_model_path`` and
    ``output_path`` from ``args``. The corpus is loaded into a ``corpus``
    subdirectory of the temporary directory.
    """
    temp_dir = os.path.expanduser(args.temp_directory) if args.temp_directory else TEMP_DIR

    corpus = Corpus(
        os.path.expanduser(args.corpus_directory),
        os.path.join(temp_dir, 'corpus'),
    )
    words = corpus.word_set

    generator = PhonetisaurusDictionaryGenerator(
        G2PModel(args.g2p_model_path),
        words,
        args.output_path,
        temp_directory=temp_dir,
    )
    generator.generate()
def generate_dict(args):
    """Generate a pronunciation dictionary for a corpus with a G2P model.

    Reads ``temp_directory``, ``corpus_directory``, ``g2p_model_path``,
    ``output_path`` and ``korean`` from ``args``. The corpus object itself,
    built with an empty output directory string, is passed to the generator.
    """
    temp_dir = os.path.expanduser(args.temp_directory) if args.temp_directory else TEMP_DIR

    corpus = Corpus(os.path.expanduser(args.corpus_directory), "")

    generator = PhonetisaurusDictionaryGenerator(
        G2PModel(args.g2p_model_path),
        corpus,
        args.output_path,
        temp_directory=temp_dir,
        korean=args.korean,
    )
    generator.generate()
def align_corpus_no_dict(args):
    """Train and align a corpus using a dictionary derived from the corpus itself.

    The dictionary comes from ``no_dictionary(corpus, data_directory)``.
    Monophone, triphone and fMLLR training run in turn, with TextGrids
    exported after each stage.

    Args:
        args: namespace providing ``temp_directory``, ``corpus_directory``,
            ``output_directory``, ``clean``, ``speaker_characters``, ``fast``,
            ``verbose`` and ``output_model_path``. ``num_jobs`` (default 3),
            ``debug``, ``ignore_exceptions`` and ``quiet`` (default False)
            are optional.
    """
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)

    # A trailing path separator yields an empty basename. Without this step
    # data_directory would be the temp root itself, and `clean` would delete
    # the whole temp root instead of this corpus's working directory.
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)

    if args.clean:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(args.output_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)

    # Read the optional settings once so the corpus and the aligner agree.
    num_jobs = getattr(args, 'num_jobs', 3)
    debug = getattr(args, 'debug', False)

    corpus = Corpus(args.corpus_directory, data_directory, args.speaker_characters,
                    num_jobs=num_jobs,
                    debug=debug,
                    ignore_exceptions=getattr(args, 'ignore_exceptions', False))
    print(corpus.speaker_utterance_info())
    dictionary = no_dictionary(corpus, data_directory)

    mono_params = {'align_often': not args.fast}
    tri_params = {'align_often': not args.fast}
    tri_fmllr_params = {'align_often': not args.fast}
    a = TrainableAligner(corpus, dictionary, args.output_directory,
                         temp_directory=data_directory,
                         mono_params=mono_params, tri_params=tri_params,
                         tri_fmllr_params=tri_fmllr_params,
                         num_jobs=num_jobs,
                         debug=debug,
                         skip_input=getattr(args, 'quiet', False))
    a.verbose = args.verbose

    a.train_mono()
    a.export_textgrids()
    a.train_tri()
    a.export_textgrids()
    a.train_tri_fmllr()
    a.export_textgrids()

    if args.output_model_path is not None:
        a.save(args.output_model_path)
def generate_dictionary(args):
    """Write a pronunciation dictionary, from a G2P model or from spelling.

    The word set comes from a corpus directory (via ``get_word_set``) or from
    a UTF-8 text file of whitespace-separated words. With a G2P model path
    the words go to ``PhonetisaurusDictionaryGenerator``; without one, each
    word is written to ``args.output_path`` followed by its characters.
    """
    print("Generating pronunciations from G2P model")

    if args.temp_directory:
        temp_dir = os.path.expanduser(args.temp_directory)
    else:
        temp_dir = os.path.join(TEMP_DIR, 'G2P')

    if os.path.isdir(args.input_path):
        input_dir = os.path.expanduser(args.input_path)
        # A trailing path separator yields an empty basename; strip it and retry.
        corpus_name = os.path.basename(args.input_path)
        if corpus_name == '':
            args.input_path = os.path.dirname(args.input_path)
            corpus_name = os.path.basename(args.input_path)
        corpus = Corpus(input_dir, os.path.join(temp_dir, corpus_name))
        word_set = get_word_set(corpus, args.include_bracketed)
    else:
        # Plain word list: collect every whitespace-separated token.
        with open(args.input_path, 'r', encoding='utf8') as source:
            word_set = {token for line in source for token in line.split()}
        if not args.include_bracketed:
            word_set = [word for word in word_set if not check_bracketed(word)]

    if args.g2p_model_path is None:
        # No model: the "pronunciation" is the word's characters, space-separated.
        with open(args.output_path, "w", encoding='utf8') as out:
            for word in sorted(word_set):
                out.write('{} {}\n'.format(word, ' '.join(list(word))))
        return

    generator = PhonetisaurusDictionaryGenerator(
        G2PModel(args.g2p_model_path),
        word_set,
        args.output_path,
        temp_directory=temp_dir,
    )
    generator.generate()
def align_corpus(args):
    """Train an aligner on a corpus from a training configuration and export TextGrids.

    A ``config.yml`` in the corpus's working directory records the run. If
    ``args.clean`` is set, or the stored config is dirty or differs from this
    run in type, corpus directory, version or dictionary path, the working
    directory is deleted first. The config is written back on exit, with
    ``dirty`` set to True if anything raised.

    Args:
        args: namespace providing ``temp_directory``, ``corpus_directory``,
            ``dictionary_path``, ``output_directory``, ``speaker_characters``,
            ``config_path``, ``verbose`` and ``output_model_path``.
            ``clean``, ``num_jobs`` (default 3), ``debug`` and
            ``ignore_exceptions`` are optional.
    """
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)

    # A trailing path separator yields an empty basename; strip it and retry.
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)

    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            # safe_load: the config holds only plain scalars, and yaml.load
            # without an explicit Loader is unsafe and rejected by PyYAML 6+.
            conf = yaml.safe_load(f)
    else:
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': 'train_and_align',
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    # NOTE(review): a config loaded from disk is written back below unchanged
    # apart from 'dirty', which is only ever set to True here - confirm that
    # never resetting a stale config is intended.
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'train_and_align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)

    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(args.corpus_directory, data_directory,
                        speaker_characters=args.speaker_characters,
                        num_jobs=getattr(args, 'num_jobs', 3),
                        debug=getattr(args, 'debug', False),
                        ignore_exceptions=getattr(args, 'ignore_exceptions', False))
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set)

        # Copy the out-of-vocabulary reports next to the output, when they exist.
        utt_oov_path = os.path.join(corpus.split_directory(), 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)

        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train()
        a = TrainableAligner(corpus, dictionary, train_config, align_config, args.output_directory,
                             temp_directory=data_directory)
        a.verbose = args.verbose
        a.train()
        a.export_textgrids()
        if args.output_model_path is not None:
            a.save(args.output_model_path)
    except BaseException:
        # Mark the working directory dirty on any failure, interrupts included.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def align_corpus(args, skip_input=False):
    """Train monophone, triphone and fMLLR models on a corpus, exporting TextGrids after each.

    A ``config.yml`` in the corpus's working directory records the run. If
    ``args.clean`` is set, or the stored config is dirty or differs from this
    run in type, corpus directory, version or dictionary path, both the
    working and the output directory are deleted first. The config is written
    back on exit, with ``dirty`` set to True if anything raised.

    Args:
        args: namespace providing ``temp_directory``, ``corpus_directory``,
            ``dictionary_path``, ``output_directory``, ``speaker_characters``,
            ``fast``, ``num_jobs``, ``verbose`` and ``output_model_path``.
            ``clean``, ``debug`` and ``ignore_exceptions`` are optional.
        skip_input: accepted but not used by this function.
    """
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)

    # A trailing path separator yields an empty basename; strip it and retry.
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)

    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            # safe_load: the config holds only plain scalars, and yaml.load
            # without an explicit Loader is unsafe and rejected by PyYAML 6+.
            conf = yaml.safe_load(f)
    else:
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': 'train_and_align',
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    # NOTE(review): a config loaded from disk is written back below unchanged
    # apart from 'dirty', which is only ever set to True here - confirm that
    # never resetting a stale config is intended.
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'train_and_align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(args.output_directory, ignore_errors=True)

    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(args.corpus_directory, data_directory,
                        speaker_characters=args.speaker_characters,
                        num_jobs=getattr(args, 'num_jobs', 3),
                        debug=getattr(args, 'debug', False),
                        ignore_exceptions=getattr(args, 'ignore_exceptions', False))
        dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set)

        # Copy the out-of-vocabulary reports next to the output, when they exist.
        utt_oov_path = os.path.join(corpus.split_directory, 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory, 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)

        mono_params = {'align_often': not args.fast}
        tri_params = {'align_often': not args.fast}
        tri_fmllr_params = {'align_often': not args.fast}
        a = TrainableAligner(corpus, dictionary, args.output_directory,
                             temp_directory=data_directory,
                             mono_params=mono_params, tri_params=tri_params,
                             tri_fmllr_params=tri_fmllr_params, num_jobs=args.num_jobs)
        a.verbose = args.verbose

        a.train_mono()
        a.export_textgrids()
        a.train_tri()
        a.export_textgrids()
        a.train_tri_fmllr()
        a.export_textgrids()

        if args.output_model_path is not None:
            a.save(args.output_model_path)
    except BaseException:
        # Mark the working directory dirty on any failure, interrupts included.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def align_corpus(args, skip_input=False):
    """Align a corpus with a pretrained acoustic model and export TextGrids.

    A ``config.yml`` in the corpus's working directory records the run. If
    ``args.clean`` is set, or the stored config is dirty or differs from this
    run in type, corpus directory, version or dictionary path, both the
    working and the output directory are deleted first. The config is written
    back on exit (including when the user aborts at a prompt), with ``dirty``
    set to True if anything raised.

    Args:
        args: namespace providing ``temp_directory``, ``corpus_directory``,
            ``dictionary_path``, ``output_directory``, ``acoustic_model_path``,
            ``speaker_characters``, ``num_jobs``, ``no_speaker_adaptation``,
            ``debug`` and ``verbose``. ``clean``, ``errors`` and
            ``ignore_exceptions`` are optional.
        skip_input: when true, never prompt the user; continue regardless of
            transcription issues or out-of-vocabulary words.
    """
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)

    # A trailing path separator yields an empty basename; strip it and retry.
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)

    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            # safe_load: the config holds only plain scalars, and yaml.load
            # without an explicit Loader is unsafe and rejected by PyYAML 6+.
            conf = yaml.safe_load(f)
    else:
        conf = {'dirty': False,
                'begin': time.time(),
                'version': __version__,
                'type': 'align',
                'corpus_directory': args.corpus_directory,
                'dictionary_path': args.dictionary_path}
    # NOTE(review): a config loaded from disk is written back below unchanged
    # apart from 'dirty', which is only ever set to True here - confirm that
    # never resetting a stale config is intended.
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(args.output_directory, ignore_errors=True)

    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)

    use_speaker_info = not args.no_speaker_adaptation
    try:
        corpus = Corpus(args.corpus_directory, data_directory,
                        speaker_characters=args.speaker_characters,
                        num_jobs=args.num_jobs,
                        use_speaker_information=use_speaker_info,
                        ignore_exceptions=getattr(args, 'ignore_exceptions', False))
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set)
        acoustic_model.validate(dictionary)

        begin = time.time()
        a = PretrainedAligner(corpus, dictionary, acoustic_model, args.output_directory,
                              temp_directory=data_directory,
                              num_jobs=getattr(args, 'num_jobs', 3),
                              speaker_independent=getattr(args, 'no_speaker_adaptation', False),
                              debug=getattr(args, 'debug', False))
        if getattr(args, 'errors', False):
            check = a.test_utterance_transcriptions()
            if not skip_input and not check:
                user_input = input('Would you like to abort to fix transcription issues? (Y/N)')
                if user_input.lower() == 'y':
                    return
        if args.debug:
            print('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        a.verbose = args.verbose

        # Copy the out-of-vocabulary reports next to the output, when they exist.
        utt_oov_path = os.path.join(corpus.split_directory, 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory, 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)

        if not skip_input and a.dictionary.oovs_found:
            user_input = input(
                'There were words not found in the dictionary. Would you like to abort to fix them? (Y/N)')
            if user_input.lower() == 'y':
                return

        begin = time.time()
        a.do_align()
        if args.debug:
            print('Performed alignment in {} seconds'.format(time.time() - begin))

        begin = time.time()
        a.export_textgrids()
        if args.debug:
            print('Exported TextGrids in {} seconds'.format(time.time() - begin))
        print('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except BaseException:
        # Mark the working directory dirty on any failure, interrupts included.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def sick_corpus(basic_corpus_dir, generated_dir):
    """Return a two-job corpus over ``basic_corpus_dir`` stored under 'sickcorpus'."""
    return Corpus(
        basic_corpus_dir,
        os.path.join(generated_dir, 'sickcorpus'),
        num_jobs=2,
    )