def main():
    """Parse command-line options and run the dataset preparation pipeline.

    The options are logged, the dataset is preprocessed, and then label and
    script generation is dispatched on ``opt.output_unit`` ('character',
    'subword' or 'grapheme'). Finally ``gather_files`` is called with the
    dataset path and the new path.

    Raises:
        ValueError: If ``opt.output_unit`` is not one of the supported units.
            This is checked before any preprocessing work starts.
    """
    opt = _get_parser().parse_args()
    log_info(opt)

    output_unit = opt.output_unit

    # Fail fast: reject an unsupported unit before preprocessing the dataset,
    # instead of discovering the problem only after that work has been done.
    if output_unit not in ('character', 'subword', 'grapheme'):
        raise ValueError("Unsupported preprocess method : {0}".format(
            output_unit))

    preprocess(opt.dataset_path, opt.preprocess_mode)

    if output_unit == 'character':
        generate_character_labels(opt.dataset_path, opt.labels_dest)
        generate_character_script(opt.dataset_path, opt.new_path,
                                  opt.script_prefix, opt.labels_dest)

    elif output_unit == 'subword':
        generate_sentencepiece_input(opt.dataset_path)
        # A sentencepiece model is trained only when the pretrained KoBERT
        # tokenizer option is not set.
        if not opt.use_pretrain_kobert_tokenizer:
            train_sentencepiece(opt.dataset_path, opt.vocab_size)
        generate_subword_labels('aihub_sentencepiece.vocab', opt.labels_dest,
                                opt.use_pretrain_kobert_tokenizer)
        generate_subword_script(opt.dataset_path, opt.new_path,
                                opt.script_prefix)

    else:  # 'grapheme' -- the only remaining unit after the check above
        character_to_grapheme(opt.dataset_path, opt.grapheme_save_path)
        generate_grapheme_labels(opt.grapheme_save_path, opt.labels_dest)
        generate_grapheme_script(opt.grapheme_save_path, opt.new_path,
                                 opt.script_prefix, opt.labels_dest)

    gather_files(opt.dataset_path, opt.new_path)
# Example #2
# 0
def main():
    """Parse command-line options and build training targets for the dataset.

    The options are logged, ``preprocess`` is called to obtain the audio
    paths and transcripts, and the matching label/script generation routine
    is run according to ``opt.output_unit`` ('character', 'subword' or
    'grapheme').

    Raises:
        ValueError: If ``opt.output_unit`` is not one of the supported units.
            This is checked before any preprocessing work starts.
    """
    opt = _get_parser().parse_args()
    log_info(opt)

    output_unit = opt.output_unit

    # Fail fast: reject an unsupported unit before preprocessing the dataset,
    # instead of discovering the problem only after that work has been done.
    if output_unit not in ('character', 'subword', 'grapheme'):
        raise ValueError("Unsupported preprocess method : {0}".format(output_unit))

    audio_paths, transcripts = preprocess(opt.dataset_path, opt.preprocess_mode)

    if output_unit == 'character':
        generate_character_labels(transcripts, opt.vocab_dest)
        generate_character_script(audio_paths, transcripts, opt.vocab_dest)

    elif output_unit == 'subword':
        train_sentencepiece(transcripts, opt.savepath, opt.vocab_size)
        sentence_to_subwords(audio_paths, transcripts, opt.savepath)

    else:  # 'grapheme' -- the only remaining unit after the check above
        sentence_to_grapheme(audio_paths, transcripts, opt.vocab_dest)