def main():
    """Run the preprocessing pipeline for the output unit chosen on the CLI.

    Steps, in order:
      1. Parse the command-line options and log them.
      2. Call ``preprocess`` on ``dataset_path`` with ``preprocess_mode``.
      3. Depending on ``output_unit`` ('character', 'subword' or
         'grapheme'), call the matching label/script generation helpers.
      4. Call ``gather_files(dataset_path, new_path)``.

    Raises:
        ValueError: If ``output_unit`` is not one of the three supported
            values. Note that ``preprocess`` has already run by then.
    """
    args = _get_parser().parse_args()
    log_info(args)

    preprocess(args.dataset_path, args.preprocess_mode)

    unit = args.output_unit

    # Reject unknown units up front (still after preprocess, exactly where
    # the original chain would have fallen through to its else branch).
    if unit not in ('character', 'subword', 'grapheme'):
        raise ValueError("Unsupported preprocess method : {0}".format(unit))

    if unit == 'character':
        generate_character_labels(args.dataset_path, args.labels_dest)
        generate_character_script(
            args.dataset_path,
            args.new_path,
            args.script_prefix,
            args.labels_dest,
        )
    elif unit == 'subword':
        use_kobert = args.use_pretrain_kobert_tokenizer
        generate_sentencepiece_input(args.dataset_path)
        # A sentencepiece model is only trained when the pretrained
        # KoBERT tokenizer is not requested.
        if not use_kobert:
            train_sentencepiece(args.dataset_path, args.vocab_size)
        generate_subword_labels(
            'aihub_sentencepiece.vocab',
            args.labels_dest,
            use_kobert,
        )
        generate_subword_script(
            args.dataset_path,
            args.new_path,
            args.script_prefix,
        )
    else:
        # Only 'grapheme' can reach this point.
        grapheme_dir = args.grapheme_save_path
        character_to_grapheme(args.dataset_path, grapheme_dir)
        generate_grapheme_labels(grapheme_dir, args.labels_dest)
        generate_grapheme_script(
            grapheme_dir,
            args.new_path,
            args.script_prefix,
            args.labels_dest,
        )

    gather_files(args.dataset_path, args.new_path)
def main():
    """Preprocess the dataset and emit vocabulary/scripts for one output unit.

    Parses and logs the command-line options, obtains ``audio_paths`` and
    ``transcripts`` from ``preprocess``, then hands them to the helpers
    that match ``output_unit`` ('character', 'subword' or 'grapheme').

    Raises:
        ValueError: If ``output_unit`` is none of the supported values.
            ``preprocess`` has already been called at that point.
    """
    args = _get_parser().parse_args()
    log_info(args)

    audio_paths, transcripts = preprocess(
        args.dataset_path,
        args.preprocess_mode,
    )

    unit = args.output_unit

    if unit == 'character':
        generate_character_labels(transcripts, args.vocab_dest)
        generate_character_script(audio_paths, transcripts, args.vocab_dest)
        return

    if unit == 'subword':
        train_sentencepiece(transcripts, args.savepath, args.vocab_size)
        sentence_to_subwords(audio_paths, transcripts, args.savepath)
        return

    if unit == 'grapheme':
        sentence_to_grapheme(audio_paths, transcripts, args.vocab_dest)
        return

    # None of the supported units matched.
    raise ValueError("Unsupported preprocess method : {0}".format(unit))