Exemple #1
0
def test_load(config_directory):
    """Load the basic training YAML and verify the trainer sequence.

    Also checks that loading 'out_of_order_config.yaml' raises ConfigError.
    """
    basic_path = os.path.join(config_directory, 'basic_train_config.yaml')
    train, align = train_yaml_to_config(basic_path)
    trainers = train.training_configs
    assert len(trainers) == 4

    # Trainer class expected at the first, second and last positions.
    expected_classes = (
        (0, MonophoneTrainer),
        (1, TriphoneTrainer),
        (-1, SatTrainer),
    )
    for position, trainer_class in expected_classes:
        assert isinstance(trainers[position], trainer_class)

    bad_path = os.path.join(config_directory, 'out_of_order_config.yaml')
    with pytest.raises(ConfigError):
        train_yaml_to_config(bad_path)
Exemple #2
0
def test_load_mono_train(config_directory, mono_train_config_path):
    """Verify multiprocessing is off everywhere in the mono training config.

    Every trainer and the alignment config, as well as each of their feature
    configs, must have a falsy ``use_mp``.
    """
    train, align = train_yaml_to_config(mono_train_config_path)
    # Trainers are checked first, then the alignment config, each with the
    # same pair of assertions.
    for config in [*train.training_configs, align]:
        assert not config.use_mp
        assert not config.feature_config.use_mp
def test_load_ivector_train(config_directory, train_ivector_config):
    """Verify the ivector training config disables multiprocessing.

    Each trainer's feature config must additionally have ``use_energy`` set.
    """
    train, align = train_yaml_to_config(train_ivector_config)
    for trainer in train.training_configs:
        assert not trainer.use_mp
        features = trainer.feature_config
        assert not features.use_mp
        assert features.use_energy

    assert not align.use_mp
    align_features = align.feature_config
    assert not align_features.use_mp
def ivector_train_config(config_directory):
    """Return the configs parsed from 'ivector_train.yaml' in *config_directory*.

    The result is whatever ``train_yaml_to_config`` returns for that path.
    """
    yaml_path = os.path.join(config_directory, 'ivector_train.yaml')
    return train_yaml_to_config(yaml_path)
def lda_sat_train_config(config_directory):
    """Return the configs parsed from 'lda_sat_train.yaml' in *config_directory*.

    The result is whatever ``train_yaml_to_config`` returns for that path.
    """
    yaml_path = os.path.join(config_directory, 'lda_sat_train.yaml')
    return train_yaml_to_config(yaml_path)
def mono_train_config(mono_train_config_path):
    """Return the configs parsed from the YAML at *mono_train_config_path*."""
    parsed = train_yaml_to_config(mono_train_config_path)
    return parsed
Exemple #7
0
def nnet_ivectors_train_config(config_directory):
    """Return the configs parsed from 'nnet_ivectors_train.yaml'.

    The file is looked up inside *config_directory*; the result is whatever
    ``train_yaml_to_config`` returns for that path.
    """
    yaml_path = os.path.join(config_directory, 'nnet_ivectors_train.yaml')
    return train_yaml_to_config(yaml_path)
Exemple #8
0
def train_ivector(args):
    """Align a corpus with a pretrained model, then train the 'ivector' trainer.

    The working directory is ``<temp dir>/<corpus name>``. A ``config.yml``
    in that directory records the parameters of the run; when it differs from
    the current invocation (or is marked dirty) a warning is logged, but the
    directory is only wiped when ``args.clean`` is set.

    Parameters
    ----------
    args
        Namespace-like object. Attributes read here: ``temp_directory``,
        ``corpus_directory`` (rewritten in place when it ends with a path
        separator), ``config_path``, ``dictionary_path``,
        ``acoustic_model_path``, ``speaker_characters``, ``num_jobs``,
        ``verbose``, ``output_model_path`` and, optionally, ``clean`` and
        ``debug``.

    Raises
    ------
    Exception
        Any error raised while building the corpus, aligning or training is
        re-raised after the run has been marked dirty in ``config.yml``.
    """
    command = 'train_ivector'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing separator: basename is empty, so strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    if args.config_path:
        train_config, align_config = train_yaml_to_config(args.config_path)
    else:
        train_config, align_config = load_basic_train_ivector()
    conf_path = os.path.join(data_directory, 'config.yml')
    # Clean before the logger is created so the wipe cannot remove anything
    # the logger places in data_directory.
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        print('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    logger = setup_logger(command, data_directory)

    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            'dirty': False,
            'begin': all_begin,
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path,
            'acoustic_model_path': args.acoustic_model_path,
        }
    # A config.yml left behind by a different subcommand may not contain an
    # 'acoustic_model_path' entry; use .get() so that case is reported as a
    # mismatch instead of raising KeyError.
    previous_model_path = conf.get('acoustic_model_path')
    if conf['dirty'] or conf['type'] != command \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path \
            or previous_model_path != args.acoustic_model_path:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            logger.debug(
                'Previous run was a different subcommand than {} (was {})'.
                format(command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            logger.debug('Previous run used source directory '
                         'path {} (new run: {})'.format(
                             conf['corpus_directory'], args.corpus_directory))
        if conf['version'] != __version__:
            logger.debug('Previous run was on {} version (new run: {})'.format(
                conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            logger.debug('Previous run used dictionary path {} '
                         '(new run: {})'.format(conf['dictionary_path'],
                                                args.dictionary_path))
        if previous_model_path != args.acoustic_model_path:
            logger.debug('Previous run used acoustic model path {} '
                         '(new run: {})'.format(previous_model_path,
                                                args.acoustic_model_path))

    os.makedirs(data_directory, exist_ok=True)
    try:
        begin = time.time()
        corpus = AlignableCorpus(args.corpus_directory,
                                 data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=args.num_jobs,
                                 debug=getattr(args, 'debug', False),
                                 logger=logger,
                                 use_mp=align_config.use_mp)
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path,
                                data_directory,
                                word_set=corpus.word_set,
                                logger=logger)
        acoustic_model.validate(dictionary)
        a = PretrainedAligner(corpus,
                              dictionary,
                              acoustic_model,
                              align_config,
                              temp_directory=data_directory,
                              logger=logger)
        logger.debug(
            'Setup pretrained aligner in {} seconds'.format(time.time() -
                                                            begin))
        a.verbose = args.verbose
        begin = time.time()
        a.align()
        logger.debug('Performed alignment in {} seconds'.format(time.time() -
                                                                begin))
        for identifier, trainer in train_config.items():
            # Every trainer receives the logger, but only the 'ivector'
            # entry is actually trained and saved.
            trainer.logger = logger
            if identifier != 'ivector':
                continue
            begin = time.time()
            trainer.init_training(identifier, data_directory, corpus,
                                  dictionary, a)
            trainer.train(call_back=print)
            logger.debug('Training took {} seconds'.format(time.time() -
                                                           begin))
            trainer.save(args.output_model_path)

        logger.info('All done!')
        logger.debug('Done! Everything took {} seconds'.format(time.time() -
                                                               all_begin))
    except Exception:
        # Record the failure for the next run, then propagate unchanged.
        conf['dirty'] = True
        raise
    finally:
        # Detach and close the handlers; iterate over a copy because
        # removeHandler mutates logger.handlers.
        handlers = logger.handlers[:]
        for handler in handlers:
            handler.close()
            logger.removeHandler(handler)
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
Exemple #9
0
def train_ivector(args):
    """Train a model on a corpus with ``TrainableAligner`` and save it.

    The working directory is ``<temp dir>/<corpus name>``. A ``config.yml``
    in that directory records the parameters of the previous run; the
    directory is wiped when ``args.clean`` is set, when the previous run was
    marked dirty, or when any recorded parameter differs from this run.

    Parameters
    ----------
    args
        Namespace-like object. Attributes read here: ``temp_directory``,
        ``corpus_directory`` (rewritten in place when it ends with a path
        separator), ``dictionary_path``, ``speaker_characters``,
        ``config_path``, ``verbose``, ``output_model_path``,
        ``output_directory`` (only when OOV files exist) and, optionally,
        ``clean``, ``num_jobs`` (default 3) and ``debug``.

    Raises
    ------
    Exception
        Any error raised during corpus setup or training is re-raised after
        the run has been marked dirty in ``config.yml``.
    """
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing separator: basename is empty, so strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')

    def fresh_conf():
        # NOTE(review): 'type' is recorded as 'train_and_align' although this
        # function is train_ivector -- kept as-is; confirm whether intended.
        return {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': 'train_and_align',
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }

    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = fresh_conf()
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'train_and_align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
        # The directory now starts from scratch, so the stale record must not
        # be written back: otherwise the old dirty flag / version / paths
        # would persist and force a wipe on every subsequent run.
        conf = fresh_conf()

    os.makedirs(data_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory,
                                 data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=getattr(args, 'num_jobs', 3),
                                 debug=getattr(args, 'debug', False))
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        dictionary = Dictionary(args.dictionary_path,
                                data_directory,
                                word_set=corpus.word_set)
        # Copy the OOV reports, when present, next to the user's output.
        utt_oov_path = os.path.join(corpus.split_directory(),
                                    'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train_ivector()
        a = TrainableAligner(corpus,
                             dictionary,
                             train_config,
                             align_config,
                             temp_directory=data_directory)
        a.verbose = args.verbose
        a.train()
        a.save(args.output_model_path)
    except Exception:
        # Record the failure for the next run, then propagate unchanged.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def align_corpus(args, unknown_args=None):
    """Train an aligner on a corpus, export TextGrids and optionally the model.

    The working directory is ``<temp dir>/<corpus name>``. A ``config.yml``
    in that directory records the parameters of the run; when it differs from
    the current invocation (or is marked dirty) a warning is logged, but the
    directory is only wiped when ``args.clean`` is set.

    Parameters
    ----------
    args
        Namespace-like object. Attributes read here: ``temp_directory``,
        ``corpus_directory`` (rewritten in place when it ends with a path
        separator), ``config_path``, ``dictionary_path``,
        ``output_directory``, ``speaker_characters``, ``verbose``,
        ``output_model_path`` and, optionally, ``clean``, ``debug`` and
        ``num_jobs`` (default 3).
    unknown_args
        Extra arguments; when truthy they are passed to
        ``align_config.update_from_args``.

    Raises
    ------
    Exception
        Any error raised during corpus setup, training or export is
        re-raised after the run has been marked dirty in ``config.yml``.
    """
    command = 'train_and_align'
    all_begin = time.time()
    # 'debug' is treated as optional everywhere in this function.
    debug = getattr(args, 'debug', False)
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing separator: basename is empty, so strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    # NOTE(review): the logger is created before the --clean wipe below; if
    # setup_logger places its log file inside data_directory, the wipe would
    # remove it. Confirm against setup_logger before reordering.
    logger = setup_logger(command, data_directory)
    if args.config_path:
        train_config, align_config = train_yaml_to_config(args.config_path)
    else:
        train_config, align_config = load_basic_train()
    if unknown_args:
        align_config.update_from_args(unknown_args)
    conf_path = os.path.join(data_directory, 'config.yml')
    if debug:
        logger.warning(
            'Running in DEBUG mode, may have impact on performance and disk usage.'
        )
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        logger.info('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    if conf['dirty'] or conf['type'] != command \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            logger.debug(
                'Previous run was a different subcommand than {} (was {})'.
                format(command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            logger.debug('Previous run used source directory '
                         'path {} (new run: {})'.format(
                             conf['corpus_directory'], args.corpus_directory))
        if conf['version'] != __version__:
            logger.debug('Previous run was on {} version (new run: {})'.format(
                conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            logger.debug('Previous run used dictionary path {} '
                         '(new run: {})'.format(conf['dictionary_path'],
                                                args.dictionary_path))

    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory,
                                 data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=getattr(args, 'num_jobs', 3),
                                 debug=debug,
                                 logger=logger,
                                 use_mp=align_config.use_mp)
        if corpus.issues_check:
            logger.warning('Some issues parsing the corpus were detected. '
                           'Please run the validator to get more information.')
        logger.info(corpus.speaker_utterance_info())
        dictionary = Dictionary(args.dictionary_path,
                                data_directory,
                                word_set=corpus.word_set,
                                logger=logger)
        # Copy the OOV reports, when present, next to the user's output.
        utt_oov_path = os.path.join(corpus.split_directory(),
                                    'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        a = TrainableAligner(corpus,
                             dictionary,
                             train_config,
                             align_config,
                             temp_directory=data_directory,
                             logger=logger,
                             debug=debug)
        a.verbose = args.verbose
        begin = time.time()
        a.train()
        logger.debug('Training took {} seconds'.format(time.time() - begin))
        a.export_textgrids(args.output_directory)
        # Saving the trained model is optional.
        if args.output_model_path is not None:
            a.save(args.output_model_path)
        logger.info('All done!')
        logger.debug('Done! Everything took {} seconds'.format(time.time() -
                                                               all_begin))
    except Exception:
        # Record the failure for the next run, then propagate unchanged.
        conf['dirty'] = True
        raise
    finally:
        # Detach and close the handlers; iterate over a copy because
        # removeHandler mutates logger.handlers.
        handlers = logger.handlers[:]
        for handler in handlers:
            handler.close()
            logger.removeHandler(handler)
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)