def test_generate_dict(basic_corpus_dir, sick_g2p_model_path, g2p_sick_output, temp_dir):
    args = G2PDummyArgs()
    args.g2p_model_path = sick_g2p_model_path
    args.input_path = basic_corpus_dir
    args.output_path = g2p_sick_output
    args.temp_directory = temp_dir
    run_g2p(args)
    assert os.path.exists(g2p_sick_output)
    d = Dictionary(g2p_sick_output, temp_dir)
    assert len(d.words) > 0

def test_generate_orthography_dict(basic_corpus_dir, orth_sick_output, temp_dir):
    args = G2PDummyArgs()
    args.g2p_model_path = None
    args.input_path = basic_corpus_dir
    args.output_path = orth_sick_output
    args.temp_directory = temp_dir
    generate_orthography_dict(args)
    assert os.path.exists(orth_sick_output)
    d = Dictionary(orth_sick_output, temp_dir)
    assert len(d.words) > 0

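# Both tests above assume a G2PDummyArgs helper: a plain attribute
# container standing in for parsed command-line arguments. A minimal
# sketch of what it might look like (hypothetical; the real test suite
# may define it with more fields):
class G2PDummyArgs:
    def __init__(self):
        self.g2p_model_path = None
        self.input_path = None
        self.output_path = None
        self.temp_directory = None
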
def train_g2p(args):
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    dictionary = Dictionary(args.dictionary_path, '')
    t = PhonetisaurusTrainer(dictionary, args.output_model_path,
                             temp_directory=temp_dir, korean=args.korean)
    t.train()

def test_short_segments(basic_dict_path, shortsegments_corpus_dir, temp_dir):
    temp = os.path.join(temp_dir, 'short_segments')
    dictionary = Dictionary(basic_dict_path, temp)
    dictionary.write()
    corpus = Corpus(shortsegments_corpus_dir, temp)
    corpus.initialize_corpus(dictionary)
    assert len(corpus.feat_mapping) == 2
    assert len(corpus.utt_speak_mapping) == 2
    assert len(corpus.speak_utt_mapping) == 1
    assert len(corpus.text_mapping) == 2
    assert len(corpus.utt_wav_mapping) == 1
    assert len(corpus.segments) == 2
    assert len(corpus.ignored_utterances) == 1

def train_g2p(args):
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    dictionary = Dictionary(args.dictionary_path, '')
    t = PhonetisaurusTrainer(dictionary, args.output_model_path,
                             temp_directory=temp_dir, window_size=args.window_size)
    if args.validate:
        t.validate()
    t.train()

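# A usage sketch for the second train_g2p variant: the function only
# reads attributes off args, so an argparse.Namespace works as a
# stand-in. The paths below are placeholders, not values from the source.
from argparse import Namespace

example_args = Namespace(
    dictionary_path='/path/to/dictionary.txt',  # placeholder
    output_model_path='/path/to/model.zip',     # placeholder
    temp_directory=None,                        # falls back to TEMP_DIR
    window_size=2,
    validate=True,
)
train_g2p(example_args)
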
def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    output_directory = os.path.join(temp_dir, 'large_subset')
    shutil.rmtree(output_directory, ignore_errors=True)
    d = Dictionary(large_dataset_dictionary, output_directory)
    d.write()
    c = Corpus(large_prosodylab_format_directory, output_directory)
    c.initialize_corpus(d)
    sd = c.split_directory()
    fc = FeatureConfig()
    fc.generate_features(c)
    s = c.subset_directory(10, fc)
    assert os.path.exists(sd)
    assert os.path.exists(s)

def test_frclitics(frclitics_dict_path, generated_dir):
    d = Dictionary(frclitics_dict_path, os.path.join(generated_dir, 'frclitics'))
    d.write()
    # Clitic and compound splitting behavior for French entries: entries
    # found whole in the dictionary stay intact; otherwise clitics and
    # some hyphenated compounds are split apart.
    assert d.separate_clitics('aujourd') == ['aujourd']
    assert d.separate_clitics("aujourd'hui") == ["aujourd'hui"]
    assert d.separate_clitics('vingt-six') == ['vingt', 'six']
    assert d.separate_clitics("m'appelle") == ["m'", 'appelle']
    assert d.separate_clitics("c'est") == ["c'est"]
    assert d.separate_clitics('purple-people-eater') == ['purple-people-eater']
    assert d.separate_clitics("m'appele") == ["m'", 'appele']
    assert d.separate_clitics("m'ving-sic") == ["m'", 'ving', 'sic']
    assert d.separate_clitics("flying'purple-people-eater") == ["flying'purple-people-eater"]

def test_speaker_groupings(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    output_directory = os.path.join(temp_dir, 'large')
    shutil.rmtree(output_directory, ignore_errors=True)
    d = Dictionary(large_dataset_dictionary, output_directory)
    d.write()
    c = Corpus(large_prosodylab_format_directory, output_directory)
    c.initialize_corpus(d)
    fc = FeatureConfig()
    fc.generate_features(c)
    speakers = os.listdir(large_prosodylab_format_directory)
    for s in speakers:
        assert any(s in x for x in c.speaker_groups)
    for root, dirs, files in os.walk(large_prosodylab_format_directory):
        for f in files:
            name, ext = os.path.splitext(f)
            assert any(name in x for x in c.groups)
    for root, dirs, files in os.walk(large_prosodylab_format_directory):
        for f in files:
            name, ext = os.path.splitext(f)
            assert any(name in x for x in c.feat_mapping)
    # Rebuild with multiple jobs to check that the groupings hold up
    # regardless of job count.
    shutil.rmtree(output_directory, ignore_errors=True)
    d.write()
    c = Corpus(large_prosodylab_format_directory, output_directory, num_jobs=2)
    c.initialize_corpus(d)
    fc.generate_features(c)
    for s in speakers:
        assert any(s in x for x in c.speaker_groups)
    for root, dirs, files in os.walk(large_prosodylab_format_directory):
        for f in files:
            name, ext = os.path.splitext(f)
            assert any(name in x for x in c.groups)
    for root, dirs, files in os.walk(large_prosodylab_format_directory):
        for f in files:
            name, ext = os.path.splitext(f)
            assert any(name in x for x in c.feat_mapping)

def test_basic_noposition(basic_dict_path, generated_dir):
    d = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'),
                   position_dependent_phones=False)
    d.write()
    assert set(d.phones) == {'sil', 'spn', 'phonea', 'phoneb', 'phonec'}

def sick_dict(sick_dict_path, generated_dir):
    output_directory = os.path.join(generated_dir, 'sickcorpus')
    dictionary = Dictionary(sick_dict_path, output_directory)
    dictionary.write()
    return dictionary

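# sick_dict reads like a pytest fixture whose decorator is not shown in
# this excerpt. If so, a test would receive the prepared Dictionary by
# naming the fixture as a parameter; a hypothetical consumer:
def test_sick_dict_loads(sick_dict):
    assert len(sick_dict.words) > 0
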
def align_corpus(args):
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == "":
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, "config.yml")
    if os.path.exists(conf_path):
        with open(conf_path, "r") as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            "dirty": False,
            "begin": time.time(),
            "version": __version__,
            "type": "train_and_align",
            "corpus_directory": args.corpus_directory,
            "dictionary_path": args.dictionary_path,
        }
    if (
        getattr(args, "clean", False)
        or conf["dirty"]
        or conf["type"] != "train_and_align"
        or conf["corpus_directory"] != args.corpus_directory
        or conf["version"] != __version__
        or conf["dictionary_path"] != args.dictionary_path
    ):
        shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(
            args.corpus_directory,
            data_directory,
            speaker_characters=args.speaker_characters,
            num_jobs=getattr(args, "num_jobs", 3),
            debug=getattr(args, "debug", False),
            ignore_exceptions=getattr(args, "ignore_exceptions", False),
        )
        if corpus.issues_check:
            print(
                "WARNING: Some issues parsing the corpus were detected. "
                "Please run the validator to get more information."
            )
        dictionary = Dictionary(
            args.dictionary_path, data_directory, word_set=corpus.word_set
        )
        utt_oov_path = os.path.join(corpus.split_directory(), "utterance_oovs.txt")
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), "oovs_found.txt")
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train()
        a = TrainableAligner(
            corpus,
            dictionary,
            train_config,
            align_config,
            args.output_directory,
            temp_directory=data_directory,
        )
        a.verbose = args.verbose
        a.train()
        a.export_textgrids()
        if args.output_model_path is not None:
            a.save(args.output_model_path)
    except:
        conf["dirty"] = True
        raise
    finally:
        with open(conf_path, "w") as f:
            yaml.dump(conf, f)

def g2p_gp(lang_code, full_name):
    temp_directory = '/data/mmcauliffe/temp/MFA'
    dictionary_path = '/media/share/corpora/GP_for_MFA/{0}/dict/{0}_dictionary.txt'.format(lang_code)
    if not os.path.exists(dictionary_path):
        print('Skipping {}, no dictionary!'.format(lang_code))
        return
    output_model_path = '/data/mmcauliffe/aligner-models/g2p/{}_g2p.zip'.format(full_name)
    if os.path.exists(output_model_path):
        print('Skipping {}, already a model!'.format(lang_code))
    else:
        dictionary = Dictionary(dictionary_path, '')
        # Grid search over window sizes, logging validation accuracy for each.
        best_acc = 0
        best_size = 0
        for s in [2, 3, 4]:
            begin = time.time()
            t = PhonetisaurusTrainer(dictionary, output_model_path,
                                     temp_directory=temp_directory, window_size=s)
            acc = t.validate()
            duration = time.time() - begin
            line_dict = {'Dictionary': dictionary_path, 'Language': lang_code,
                         'Total time': duration, 'Window size': s, 'Accuracy': acc}
            line_dict.update(dict_data)
            with open(csv_path, 'a') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
                writer.writerow(line_dict)
            if acc > best_acc:
                best_acc = acc
                best_size = s
        print('The best window size for {} was {} with accuracy of {}.'.format(
            lang_code, best_size, best_acc))
        # Retrain on the full dictionary with the best window size.
        t = PhonetisaurusTrainer(dictionary, output_model_path,
                                 temp_directory=temp_directory, window_size=best_size)
        t.train()
    # Some languages have a second dictionary variant that gets its own model.
    if lang_code in ['FR', 'GE', 'CH']:
        if lang_code == 'FR':
            dictionary_path = '/media/share/corpora/GP_for_MFA/{0}/dict/fr.dict'.format(lang_code)
            output_model_path = '/data/mmcauliffe/aligner-models/g2p/{}_prosodylab_g2p.zip'.format(full_name)
        elif lang_code == 'GE':
            dictionary_path = '/media/share/corpora/GP_for_MFA/{0}/dict/de.dict'.format(lang_code)
            output_model_path = '/data/mmcauliffe/aligner-models/g2p/{}_prosodylab_g2p.zip'.format(full_name)
        elif lang_code == 'CH':
            dictionary_path = '/media/share/corpora/GP_for_MFA/{0}/dict/char_dict.txt'.format(lang_code)
            output_model_path = '/data/mmcauliffe/aligner-models/g2p/{}_character_g2p.zip'.format(full_name)
        if not os.path.exists(dictionary_path):
            print('Skipping {}, no dictionary!'.format(lang_code))
            return
        if os.path.exists(output_model_path):
            print('Skipping {}, already a model!'.format(lang_code))
            return
        temp_directory = '/data/mmcauliffe/temp/MFA'
        dictionary = Dictionary(dictionary_path, '')
        best_acc = 0
        best_size = 0
        for s in [2, 3, 4]:
            begin = time.time()
            t = PhonetisaurusTrainer(dictionary, output_model_path,
                                     temp_directory=temp_directory, window_size=s)
            acc = t.validate()
            duration = time.time() - begin
            line_dict = {'Dictionary': dictionary_path, 'Language': lang_code,
                         'Total time': duration, 'Window size': s, 'Accuracy': acc}
            line_dict.update(dict_data)
            with open(csv_path, 'a') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
                writer.writerow(line_dict)
            if acc > best_acc:
                best_acc = acc
                best_size = s
        print('The best window size for {} was {} with accuracy of {}.'.format(
            lang_code, best_size, best_acc))
        t = PhonetisaurusTrainer(dictionary, output_model_path,
                                 temp_directory=temp_directory, window_size=best_size)
        t.train()

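# g2p_gp reads module-level csv_path, csv_columns, and dict_data for its
# results log. A minimal sketch of plausible definitions (values here are
# hypothetical; note that csv.DictWriter raises on keys missing from
# fieldnames, so any keys in dict_data must also appear in csv_columns):
csv_columns = ['Dictionary', 'Language', 'Total time', 'Window size', 'Accuracy']
csv_path = 'g2p_window_results.csv'  # placeholder path
dict_data = {}  # extra per-run metadata merged into every row
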
def test_extra_annotations(extra_annotations_path, generated_dir):
    d = Dictionary(extra_annotations_path, os.path.join(generated_dir, 'extra'))
    assert '{' in d.graphemes
    d.write()

def align_corpus(args):
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {'dirty': False,
                'begin': time.time(),
                'version': __version__,
                'type': 'align',
                'corpus_directory': args.corpus_directory,
                'dictionary_path': args.dictionary_path}
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(args.corpus_directory, data_directory,
                        speaker_characters=args.speaker_characters,
                        num_jobs=args.num_jobs,
                        ignore_exceptions=getattr(args, 'ignore_exceptions', False))
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set)
        acoustic_model.validate(dictionary)
        begin = time.time()
        if args.config_path:
            align_config = align_yaml_to_config(args.config_path)
        else:
            align_config = load_basic_align()
        a = PretrainedAligner(corpus, dictionary, acoustic_model, align_config,
                              args.output_directory, temp_directory=data_directory,
                              debug=getattr(args, 'debug', False))
        if args.debug:
            print('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        a.verbose = args.verbose
        begin = time.time()
        a.align()
        if args.debug:
            print('Performed alignment in {} seconds'.format(time.time() - begin))
        begin = time.time()
        a.export_textgrids()
        if args.debug:
            print('Exported TextGrids in {} seconds'.format(time.time() - begin))
        print('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except:
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)

def align_corpus(args):
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == "":
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, "config.yml")
    if os.path.exists(conf_path):
        with open(conf_path, "r") as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            "dirty": False,
            "begin": time.time(),
            "version": __version__,
            "type": "align",
            "corpus_directory": args.corpus_directory,
            "dictionary_path": args.dictionary_path,
        }
    if (
        getattr(args, "clean", False)
        or conf["dirty"]
        or conf["type"] != "align"
        or conf["corpus_directory"] != args.corpus_directory
        or conf["version"] != __version__
        or conf["dictionary_path"] != args.dictionary_path
    ):
        shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(
            args.corpus_directory,
            data_directory,
            speaker_characters=args.speaker_characters,
            num_jobs=args.num_jobs,
            ignore_exceptions=getattr(args, "ignore_exceptions", False),
        )
        if corpus.issues_check:
            print(
                "WARNING: Some issues parsing the corpus were detected. "
                "Please run the validator to get more information."
            )
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(
            args.dictionary_path, data_directory, word_set=corpus.word_set
        )
        acoustic_model.validate(dictionary)
        begin = time.time()
        if args.config_path:
            align_config = align_yaml_to_config(args.config_path)
        else:
            align_config = load_basic_align()
        a = PretrainedAligner(
            corpus,
            dictionary,
            acoustic_model,
            align_config,
            args.output_directory,
            temp_directory=data_directory,
            debug=getattr(args, "debug", False),
        )
        if getattr(args, "errors", False):
            check = a.test_utterance_transcriptions()
            if not getattr(args, "quiet", False) and not check:
                user_input = input(
                    "Would you like to abort to fix transcription issues? (Y/N)"
                )
                if user_input.lower() == "y":
                    return
        if args.debug:
            print("Setup pretrained aligner in {} seconds".format(time.time() - begin))
        a.verbose = args.verbose
        begin = time.time()
        a.align()
        if args.debug:
            print("Performed alignment in {} seconds".format(time.time() - begin))
        begin = time.time()
        a.export_textgrids()
        if args.debug:
            print("Exported TextGrids in {} seconds".format(time.time() - begin))
        print("Done! Everything took {} seconds".format(time.time() - all_begin))
    except:
        conf["dirty"] = True
        raise
    finally:
        with open(conf_path, "w") as f:
            yaml.dump(conf, f)

def align_corpus(args, skip_input=False):
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {'dirty': False,
                'begin': time.time(),
                'version': __version__,
                'type': 'align',
                'corpus_directory': args.corpus_directory,
                'dictionary_path': args.dictionary_path}
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(args.output_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    use_speaker_info = not args.no_speaker_adaptation
    try:
        corpus = Corpus(args.corpus_directory, data_directory,
                        speaker_characters=args.speaker_characters,
                        num_jobs=args.num_jobs,
                        use_speaker_information=use_speaker_info,
                        ignore_exceptions=getattr(args, 'ignore_exceptions', False))
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set)
        acoustic_model.validate(dictionary)
        begin = time.time()
        a = PretrainedAligner(corpus, dictionary, acoustic_model,
                              args.output_directory,
                              temp_directory=data_directory,
                              num_jobs=getattr(args, 'num_jobs', 3),
                              speaker_independent=getattr(args, 'no_speaker_adaptation', False),
                              debug=getattr(args, 'debug', False))
        if getattr(args, 'errors', False):
            check = a.test_utterance_transcriptions()
            if not skip_input and not check:
                user_input = input('Would you like to abort to fix transcription issues? (Y/N)')
                if user_input.lower() == 'y':
                    return
        if args.debug:
            print('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        a.verbose = args.verbose
        utt_oov_path = os.path.join(corpus.split_directory, 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory, 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        if not skip_input and a.dictionary.oovs_found:
            user_input = input('There were words not found in the dictionary. '
                               'Would you like to abort to fix them? (Y/N)')
            if user_input.lower() == 'y':
                return
        begin = time.time()
        a.do_align()
        if args.debug:
            print('Performed alignment in {} seconds'.format(time.time() - begin))
        begin = time.time()
        a.export_textgrids()
        if args.debug:
            print('Exported TextGrids in {} seconds'.format(time.time() - begin))
        print('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except:
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)

def align_corpus(args, skip_input=False):
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {'dirty': False,
                'begin': time.time(),
                'version': __version__,
                'type': 'train_and_align',
                'corpus_directory': args.corpus_directory,
                'dictionary_path': args.dictionary_path}
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'train_and_align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(args.output_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(args.corpus_directory, data_directory,
                        speaker_characters=args.speaker_characters,
                        num_jobs=getattr(args, 'num_jobs', 3),
                        debug=getattr(args, 'debug', False),
                        ignore_exceptions=getattr(args, 'ignore_exceptions', False))
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set)
        utt_oov_path = os.path.join(corpus.split_directory, 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory, 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        mono_params = {'align_often': not args.fast}
        tri_params = {'align_often': not args.fast}
        tri_fmllr_params = {'align_often': not args.fast}
        a = TrainableAligner(corpus, dictionary, args.output_directory,
                             temp_directory=data_directory,
                             mono_params=mono_params, tri_params=tri_params,
                             tri_fmllr_params=tri_fmllr_params,
                             num_jobs=args.num_jobs)
        a.verbose = args.verbose
        # Train and export after each pass: monophone, then triphone,
        # then speaker-adapted (fMLLR) triphone.
        a.train_mono()
        a.export_textgrids()
        a.train_tri()
        a.export_textgrids()
        a.train_tri_fmllr()
        a.export_textgrids()
        if args.output_model_path is not None:
            a.save(args.output_model_path)
    except:
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)

# Copy text file
shutil.copyfile(TEXT_PATH, os.path.join(corpus_dir_in, '1.lab'))
corpus = Corpus(corpus_dir_in, corpus_dir_out)
acoustic_model = AcousticModel('spanish.zip')
g2p_model = G2PModel('spanish_g2p.zip')
dict_dir = tempfile.mkdtemp()
with tempfile.NamedTemporaryFile() as g2pfh:
    # Generate pronunciations into the temporary file, then load them
    # before the file is cleaned up on exiting the with block.
    d_gen = PhonetisaurusDictionaryGenerator(g2p_model, WORDS, g2pfh.name)
    d_gen.generate()
    dictionary = Dictionary(g2pfh.name, dict_dir)
acoustic_model.validate(dictionary)
aligner = PretrainedAligner(corpus, dictionary, acoustic_model, outdir,
                            temp_directory=corpus_dir_tmp)
check = aligner.test_utterance_transcriptions()
aligner.do_align()
aligner.export_textgrids()
grid = TextGrid.fromFile(os.path.join(outdir, 'in', '1.TextGrid'))

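# A follow-up sketch for inspecting the exported TextGrid, assuming the
# `textgrid` package API implied by TextGrid.fromFile above; tier names
# vary by setup, with 'words' and 'phones' being typical aligner output.
for tier in grid.tiers:
    print(tier.name, len(tier.intervals))
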
def align_corpus(args):
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {'dirty': False,
                'begin': time.time(),
                'version': __version__,
                'type': 'train_and_align',
                'corpus_directory': args.corpus_directory,
                'dictionary_path': args.dictionary_path}
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'train_and_align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(args.corpus_directory, data_directory,
                        speaker_characters=args.speaker_characters,
                        num_jobs=getattr(args, 'num_jobs', 3),
                        debug=getattr(args, 'debug', False),
                        ignore_exceptions=getattr(args, 'ignore_exceptions', False))
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set)
        utt_oov_path = os.path.join(corpus.split_directory(), 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train()
        a = TrainableAligner(corpus, dictionary, train_config, align_config,
                             args.output_directory, temp_directory=data_directory)
        a.verbose = args.verbose
        a.train()
        a.export_textgrids()
        if args.output_model_path is not None:
            a.save(args.output_model_path)
    except:
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)

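# A sketch of invoking the train-and-align variant above directly; the
# attribute names mirror what the function reads off args, and all
# values are placeholders rather than values from the source.
from argparse import Namespace

example_args = Namespace(
    corpus_directory='/path/to/corpus',         # placeholder
    dictionary_path='/path/to/dictionary.txt',  # placeholder
    output_directory='/path/to/output',         # placeholder
    temp_directory=None,                        # falls back to TEMP_DIR
    speaker_characters=0,
    config_path=None,                           # use load_basic_train()
    output_model_path=None,
    verbose=False,
    clean=False,
)
align_corpus(example_args)
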
def align_corpus(args):
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {'dirty': False,
                'begin': time.time(),
                'version': __version__,
                'type': 'train_and_align',
                'corpus_directory': args.corpus_directory,
                'dictionary_path': args.dictionary_path}
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'train_and_align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
        shutil.rmtree(args.output_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(args.corpus_directory, data_directory,
                        speaker_characters=args.speaker_characters,
                        num_jobs=getattr(args, 'num_jobs', 3),
                        debug=getattr(args, 'debug', False),
                        ignore_exceptions=getattr(args, 'ignore_exceptions', False))
        dictionary = Dictionary(args.dictionary_path, data_directory,
                                word_set=corpus.word_set)
        utt_oov_path = os.path.join(corpus.split_directory, 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory, 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        mono_params = {'align_often': not args.fast}
        tri_params = {'align_often': not args.fast}
        tri_fmllr_params = {'align_often': not args.fast}
        a = TrainableAligner(corpus, dictionary, args.output_directory,
                             temp_directory=data_directory,
                             mono_params=mono_params, tri_params=tri_params,
                             tri_fmllr_params=tri_fmllr_params,
                             num_jobs=args.num_jobs,
                             skip_input=getattr(args, 'quiet', False),
                             nnet=getattr(args, 'artificial_neural_net', False))
        a.verbose = args.verbose
        # GMM training (needed either way, as a starter for nnet training)
        a.train_mono()
        a.export_textgrids()
        a.train_tri()
        a.export_textgrids()
        a.train_tri_fmllr()
        a.export_textgrids()
        # nnet training
        if args.artificial_neural_net:
            a.train_lda_mllt()
            # a.train_diag_ubm()     # Uncomment to train i-vector extractor
            # a.ivector_extractor()  # Uncomment to train i-vector extractor (integrate with argument eventually)
            a.train_nnet_basic()
            a.export_textgrids()
        if args.output_model_path is not None:
            a.save(args.output_model_path)
    except:
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)