def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    """Check that a corpus yields an existing split directory and subset directory.

    A dictionary and a corpus are built inside a freshly emptied
    ``large_subset`` folder under ``temp_dir``; the corpus is split, features
    are generated, and a subset is requested with ``10`` as its first
    argument.  Both returned paths must exist on disk.
    """
    work_dir = os.path.join(temp_dir, 'large_subset')
    # Drop anything left behind by a previous run; a missing folder is fine.
    shutil.rmtree(work_dir, ignore_errors=True)

    dictionary = Dictionary(large_dataset_dictionary, work_dir)
    dictionary.write()

    corpus = Corpus(large_prosodylab_format_directory, work_dir)
    corpus.initialize_corpus(dictionary)
    split_dir = corpus.split_directory()

    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)
    subset_dir = corpus.subset_directory(10, feature_config)

    # Same order as the creation calls: split directory first, then subset.
    for produced_path in (split_dir, subset_dir):
        assert os.path.exists(produced_path)
def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    """Verify that splitting and subsetting a corpus both produce real paths.

    Everything is written below ``<temp_dir>/large_subset``, which is removed
    first so the test never sees output from an earlier run.
    """
    scratch = os.path.join(temp_dir, 'large_subset')
    shutil.rmtree(scratch, ignore_errors=True)

    # The dictionary is written out before the corpus is initialized with it.
    pron_dict = Dictionary(large_dataset_dictionary, scratch)
    pron_dict.write()

    corp = Corpus(large_prosodylab_format_directory, scratch)
    corp.initialize_corpus(pron_dict)
    split_path = corp.split_directory()

    # Features are generated on the corpus before the subset is requested.
    features = FeatureConfig()
    features.generate_features(corp)
    subset_path = corp.subset_directory(10, features)

    split_exists = os.path.exists(split_path)
    assert split_exists
    subset_exists = os.path.exists(subset_path)
    assert subset_exists
def align_corpus(args):
    """Train an aligner on a corpus and export the resulting TextGrids.

    Intermediate files live in ``<temp dir>/<corpus name>`` next to a
    ``config.yml`` describing the run that produced them.  That directory is
    wiped before training when ``args.clean`` is set, or when the stored
    config is unreadable, marked dirty, or was written for a different run
    type, corpus directory, dictionary path or package version.

    Parameters
    ----------
    args
        Namespace-like object.  ``temp_directory``, ``corpus_directory``,
        ``dictionary_path``, ``output_directory``, ``speaker_characters``,
        ``config_path``, ``verbose`` and ``output_model_path`` are read
        directly; ``clean``, ``num_jobs``, ``debug`` and
        ``ignore_exceptions`` are optional.  ``corpus_directory`` is
        rewritten in place when its basename is empty (e.g. a trailing
        path separator).

    Raises
    ------
    BaseException
        Anything raised while loading the corpus, training or exporting is
        re-raised after the run has been recorded as dirty.
    """
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Empty basename: fall back to the parent component for the name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    fresh_conf = {
        'dirty': False,
        'begin': time.time(),
        'version': __version__,
        'type': 'train_and_align',
        'corpus_directory': args.corpus_directory,
        'dictionary_path': args.dictionary_path
    }
    if os.path.exists(conf_path):
        try:
            with open(conf_path, 'r') as f:
                # yaml.load() without a Loader is rejected by PyYAML >= 6;
                # the file is produced by yaml.dump() at the end of this
                # function, so the safe loader is the appropriate choice.
                conf = yaml.safe_load(f)
        except yaml.YAMLError:
            conf = None
        # An empty, malformed or incomplete config counts as not reusable
        # instead of crashing with TypeError/KeyError.
        reusable = isinstance(conf, dict) \
            and not conf.get('dirty', True) \
            and conf.get('type') == 'train_and_align' \
            and conf.get('corpus_directory') == args.corpus_directory \
            and conf.get('version') == __version__ \
            and conf.get('dictionary_path') == args.dictionary_path
    else:
        conf = fresh_conf
        reusable = True
    if getattr(args, 'clean', False) or not reusable:
        shutil.rmtree(data_directory, ignore_errors=True)
        # The old config describes files that no longer exist.  Writing it
        # back in ``finally`` would keep the stale/dirty values around and
        # force a wipe on every later run, so start from a fresh one.
        conf = fresh_conf

    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(args.corpus_directory,
                        data_directory,
                        speaker_characters=args.speaker_characters,
                        num_jobs=getattr(args, 'num_jobs', 3),
                        debug=getattr(args, 'debug', False),
                        ignore_exceptions=getattr(args, 'ignore_exceptions',
                                                  False))
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        dictionary = Dictionary(args.dictionary_path,
                                data_directory,
                                word_set=corpus.word_set)
        # Copy the OOV reports, when present, next to the alignment output.
        utt_oov_path = os.path.join(corpus.split_directory(),
                                    'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train()
        a = TrainableAligner(corpus,
                             dictionary,
                             train_config,
                             align_config,
                             args.output_directory,
                             temp_directory=data_directory)
        a.verbose = args.verbose
        a.train()
        a.export_textgrids()
        if args.output_model_path is not None:
            a.save(args.output_model_path)
    except BaseException:
        # Deliberately broad (covers KeyboardInterrupt too): any abort leaves
        # the cache in an unknown state, so flag it and let the error through.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
# Example #4
# 0
def align_corpus(args):
    """Train an aligner on a corpus and export the resulting TextGrids.

    Intermediate files live in ``<temp dir>/<corpus name>`` next to a
    ``config.yml`` describing the run that produced them.  That directory is
    wiped before training when ``args.clean`` is set, or when the stored
    config is unreadable, marked dirty, or was written for a different run
    type, corpus directory, dictionary path or package version.

    Parameters
    ----------
    args
        Namespace-like object.  ``temp_directory``, ``corpus_directory``,
        ``dictionary_path``, ``output_directory``, ``speaker_characters``,
        ``config_path``, ``verbose`` and ``output_model_path`` are read
        directly; ``clean``, ``num_jobs``, ``debug`` and
        ``ignore_exceptions`` are optional.  ``corpus_directory`` is
        rewritten in place when its basename is empty (e.g. a trailing
        path separator).

    Raises
    ------
    BaseException
        Anything raised while loading the corpus, training or exporting is
        re-raised after the run has been recorded as dirty.
    """
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == "":
        # Empty basename: fall back to the parent component for the name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, "config.yml")
    fresh_conf = {
        "dirty": False,
        "begin": time.time(),
        "version": __version__,
        "type": "train_and_align",
        "corpus_directory": args.corpus_directory,
        "dictionary_path": args.dictionary_path,
    }
    if os.path.exists(conf_path):
        try:
            with open(conf_path, "r") as f:
                # yaml.load() without a Loader is rejected by PyYAML >= 6;
                # the file is produced by yaml.dump() at the end of this
                # function, so the safe loader is the appropriate choice.
                conf = yaml.safe_load(f)
        except yaml.YAMLError:
            conf = None
        # An empty, malformed or incomplete config counts as not reusable
        # instead of crashing with TypeError/KeyError.
        reusable = (
            isinstance(conf, dict)
            and not conf.get("dirty", True)
            and conf.get("type") == "train_and_align"
            and conf.get("corpus_directory") == args.corpus_directory
            and conf.get("version") == __version__
            and conf.get("dictionary_path") == args.dictionary_path
        )
    else:
        conf = fresh_conf
        reusable = True
    if getattr(args, "clean", False) or not reusable:
        shutil.rmtree(data_directory, ignore_errors=True)
        # The old config describes files that no longer exist.  Writing it
        # back in ``finally`` would keep the stale/dirty values around and
        # force a wipe on every later run, so start from a fresh one.
        conf = fresh_conf

    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(
            args.corpus_directory,
            data_directory,
            speaker_characters=args.speaker_characters,
            num_jobs=getattr(args, "num_jobs", 3),
            debug=getattr(args, "debug", False),
            ignore_exceptions=getattr(args, "ignore_exceptions", False),
        )
        if corpus.issues_check:
            print(
                "WARNING: Some issues parsing the corpus were detected. "
                "Please run the validator to get more information."
            )
        dictionary = Dictionary(
            args.dictionary_path, data_directory, word_set=corpus.word_set
        )
        # Copy the OOV reports, when present, next to the alignment output.
        utt_oov_path = os.path.join(corpus.split_directory(), "utterance_oovs.txt")
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), "oovs_found.txt")
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train()
        a = TrainableAligner(
            corpus,
            dictionary,
            train_config,
            align_config,
            args.output_directory,
            temp_directory=data_directory,
        )
        a.verbose = args.verbose
        a.train()
        a.export_textgrids()
        if args.output_model_path is not None:
            a.save(args.output_model_path)
    except BaseException:
        # Deliberately broad (covers KeyboardInterrupt too): any abort leaves
        # the cache in an unknown state, so flag it and let the error through.
        conf["dirty"] = True
        raise
    finally:
        with open(conf_path, "w") as f:
            yaml.dump(conf, f)
def align_corpus(args):
    """Train an aligner on a corpus and export the resulting TextGrids.

    Intermediate files live in ``<temp dir>/<corpus name>`` next to a
    ``config.yml`` describing the run that produced them.  That directory is
    wiped before training when ``args.clean`` is set, or when the stored
    config is unreadable, marked dirty, or was written for a different run
    type, corpus directory, dictionary path or package version.

    Parameters
    ----------
    args
        Namespace-like object.  ``temp_directory``, ``corpus_directory``,
        ``dictionary_path``, ``output_directory``, ``speaker_characters``,
        ``config_path``, ``verbose`` and ``output_model_path`` are read
        directly; ``clean``, ``num_jobs``, ``debug`` and
        ``ignore_exceptions`` are optional.  ``corpus_directory`` is
        rewritten in place when its basename is empty (e.g. a trailing
        path separator).

    Raises
    ------
    BaseException
        Anything raised while loading the corpus, training or exporting is
        re-raised after the run has been recorded as dirty.
    """
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Empty basename: fall back to the parent component for the name.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    fresh_conf = {'dirty': False,
                  'begin': time.time(),
                  'version': __version__,
                  'type': 'train_and_align',
                  'corpus_directory': args.corpus_directory,
                  'dictionary_path': args.dictionary_path}
    if os.path.exists(conf_path):
        try:
            with open(conf_path, 'r') as f:
                # yaml.load() without a Loader is rejected by PyYAML >= 6;
                # the file is produced by yaml.dump() at the end of this
                # function, so the safe loader is the appropriate choice.
                conf = yaml.safe_load(f)
        except yaml.YAMLError:
            conf = None
        # An empty, malformed or incomplete config counts as not reusable
        # instead of crashing with TypeError/KeyError.
        reusable = isinstance(conf, dict) \
            and not conf.get('dirty', True) \
            and conf.get('type') == 'train_and_align' \
            and conf.get('corpus_directory') == args.corpus_directory \
            and conf.get('version') == __version__ \
            and conf.get('dictionary_path') == args.dictionary_path
    else:
        conf = fresh_conf
        reusable = True
    if getattr(args, 'clean', False) or not reusable:
        shutil.rmtree(data_directory, ignore_errors=True)
        # The old config describes files that no longer exist.  Writing it
        # back in ``finally`` would keep the stale/dirty values around and
        # force a wipe on every later run, so start from a fresh one.
        conf = fresh_conf

    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = Corpus(args.corpus_directory, data_directory, speaker_characters=args.speaker_characters,
                        num_jobs=getattr(args, 'num_jobs', 3),
                        debug=getattr(args, 'debug', False),
                        ignore_exceptions=getattr(args, 'ignore_exceptions', False))
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set)
        # Copy the OOV reports, when present, next to the alignment output.
        utt_oov_path = os.path.join(corpus.split_directory(), 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train()
        a = TrainableAligner(corpus, dictionary, train_config, align_config, args.output_directory,
                             temp_directory=data_directory)
        a.verbose = args.verbose
        a.train()
        a.export_textgrids()
        if args.output_model_path is not None:
            a.save(args.output_model_path)
    except BaseException:
        # Deliberately broad (covers KeyboardInterrupt too): any abort leaves
        # the cache in an unknown state, so flag it and let the error through.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)