def is_valid(self, njobs=utils.default_njobs()):
    """Tell whether the corpus passes validation.

    Runs `self.validate` with the given number of jobs and turns its
    outcome into a boolean.

    :param njobs: forwarded as is to `self.validate`. The default
      comes from `utils.default_njobs()`, which is evaluated once,
      when this method is defined.

    :return: False when `self.validate` raised an IOError, True
      otherwise. Any other exception propagates to the caller.

    """
    try:
        self.validate(njobs=njobs)
    except IOError:
        valid = False
    else:
        valid = True
    return valid
def add_parser(cls, subparsers, name=None):
    """Build the command parser and add the --recipe and --njobs options.

    :param subparsers: forwarded to the parent class `add_parser`.

    :param name: name given to the parent class `add_parser`, when
      None `cls.name` is used instead.

    :return: the pair (parser, dir_group) obtained from the parent
      class, where the parser has been extended with the '--recipe'
      flag and the '-j/--njobs' integer option.

    """
    parser_name = cls.name if name is None else name

    # the parent class is in charge of the basic parser setup
    parent = super(AbstractKaldiCommand, cls)
    parser, dir_group = parent.add_parser(subparsers, parser_name)

    # optionally keep the Kaldi recipe
    parser.add_argument(
        '--recipe', action='store_true',
        help=(' put the Kaldi recipe in <output_dir>/recipe, '
              'by default the recipe is deleted'))

    # number of parallel jobs
    parser.add_argument(
        '-j', '--njobs', type=int, metavar='<njobs>',
        default=utils.default_njobs(),
        help=(' number of jobs for parallel computation, because Kaldi '
              'used to run jobs per speakers, the number of jobs is '
              'min(<njobs>, corpus.nspeakers). Default is to launch '
              '%(default)s jobs.'))

    return parser, dir_group
def validate(self, njobs=utils.default_njobs()):
    """Check the speech corpus data for errors.

    The work is delegated to a `CorpusValidation` instance built from
    this corpus, `njobs` and `self.log`.

    Raise IOError on the first encountered error.

    :param njobs: passed to `CorpusValidation`. The default comes from
      `utils.default_njobs()`, which is evaluated once, when this
      method is defined.

    """
    validator = CorpusValidation(self, njobs=njobs, log=self.log)
    validator.validate()
def add_parser(cls, subparsers):
    """Return a default argument parser for corpus preparation

    A sub-parser named after `cls.preparator.name` is registered in
    `subparsers`, its description being the dedented
    `cls.long_description()`. It is populated with:

    - a 'directories' group holding the input directory (positional
      when the preparator has no default input directory, else the
      optional -i/--input-dir) and -o/--output-dir,
    - the -v/--verbose, -f/--force, -j/--njobs and --keep-short-utts
      options,
    - the --copy-wavs option, only when the preparator audio format
      is 'wav'.

    """
    preparator = cls.preparator

    parser = subparsers.add_parser(preparator.name)
    parser.formatter_class = argparse.RawDescriptionHelpFormatter
    parser.description = textwrap.dedent(cls.long_description())

    directories = parser.add_argument_group('directories')

    # the input directory is optional only if a default one is known
    default_input_dir = preparator.default_input_dir()
    if default_input_dir is not None:
        directories.add_argument(
            '-i', '--input-dir', metavar='<input-dir>',
            default=default_input_dir,
            help='root directory of the raw corpus distribution, '
            'default is %(default)s')
    else:
        directories.add_argument(
            'input_dir', metavar='<input-dir>',
            help='root directory of the raw corpus distribution')

    output_help = (
        'the prepared corpus is created in '
        '<output-dir>/data, if not specified use {}.'
    ).format(cls.default_output_dir())
    directories.add_argument(
        '-o', '--output-dir', metavar='<output-dir>', default=None,
        help=output_help)

    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='display more messages to stdout')

    force_help = (
        'if specified, overwrite the output directory '
        '<output-dir>/data. If not specified but the directory exists, '
        'the program detects desired wav files already present and '
        'do not convert them again.')
    parser.add_argument(
        '-f', '--force', action='store_true', help=force_help)

    njobs_help = (
        'number of jobs to launch when doing parallel '
        'computations (mainly for wav conversion). '
        'Default is to launch %(default)s jobs.')
    parser.add_argument(
        '-j', '--njobs', type=int, default=utils.default_njobs(),
        metavar='<njobs>', help=njobs_help)

    short_utts_help = (
        'utterances shorter than 0.1 second are removed by defaults, '
        "as they won't be accepted by Kaldi for feature extraction. "
        "Use this option to keep those short utterances in the corpus.")
    parser.add_argument(
        '--keep-short-utts', action='store_true', help=short_utts_help)

    # copying the wavs makes sense only when no conversion is needed
    if preparator.audio_format == 'wav':
        copy_wavs_help = (
            'the audio files of this corpus are already in wav. \n'
            'By default abkhazia will import them as symbolic links, '
            'use this option to force copy')
        parser.add_argument(
            '--copy-wavs', action='store_true', help=copy_wavs_help)

    return parser
def __init__(self, input_dir, log=utils.logger.null_logger()):
    """Bind the instance to `input_dir` and create its empty corpus.

    :param input_dir: an existing directory, stored as an absolute
      path in `self.input_dir`.

    :param log: stored in `self.log`. The default comes from
      `utils.logger.null_logger()`, which is evaluated once, when
      this method is defined.

    :raise IOError: if `input_dir` is not an existing directory.

    """
    self.njobs = utils.default_njobs(local=True)
    self.log = log

    # nothing can be done without an existing input directory
    if os.path.isdir(input_dir):
        self.input_dir = os.path.abspath(input_dir)
    else:
        raise IOError(
            'input directory does not exist:\n{}'.format(input_dir))

    # the output corpus starts empty, only its metadata are filled
    self.corpus = abkhazia.corpus.Corpus()
    meta = self.corpus.meta
    meta.source = self.input_dir
    meta.name = self.name
def __init__(self, corpus, output_dir, log=utils.logger.null_logger()):
    """Set up the recipe on `corpus`, writing under `output_dir`.

    `output_dir` and its 'recipe' subdirectory are created when they
    do not exist. An `Abkhazia2Kaldi` converter targeting the recipe
    directory is stored in `self.a2k`.

    :param corpus: stored in `self.corpus`, its metadata are used to
      fill the `source` and `name` fields of `self.meta`.

    :param output_dir: stored as an absolute path in
      `self.output_dir`.

    :param log: forwarded to the parent class constructor. The
      default comes from `utils.logger.null_logger()`, which is
      evaluated once, when this method is defined.

    """
    super(AbstractRecipe, self).__init__(log=log)

    self.njobs = utils.default_njobs()
    self.corpus = corpus

    # describe the recipe from the metadata of the corpus
    corpus_meta = self.corpus.meta
    self.meta.source = 'corpus = {}'.format(corpus_meta.source)
    self.meta.name = self.name + ' on corpus ' + corpus_meta.name

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    self.output_dir = os.path.abspath(output_dir)

    # the recipe lives in a subdirectory of the output directory
    recipe_dir = os.path.join(self.output_dir, 'recipe')
    self.recipe_dir = recipe_dir
    if not os.path.isdir(recipe_dir):
        os.makedirs(recipe_dir)

    # when True, the recipe directory is deleted on instance destruction
    self.delete_recipe = True

    # converter from the abkhazia corpus to the Kaldi recipe
    self.a2k = Abkhazia2Kaldi(
        self.corpus, self.recipe_dir, name=self.name, log=self.log)
def __init__(self, corpus, njobs=default_njobs(), log=logger.null_logger()):
    """Store the corpus, the number of jobs and the logger.

    :param corpus: stored in `self.corpus`.

    :param njobs: stored in `self.njobs`. The default comes from
      `default_njobs()`, which is evaluated once, when this method
      is defined.

    :param log: stored in `self.log`. The default comes from
      `logger.null_logger()`, which is also evaluated once at
      definition time.

    """
    self.corpus, self.njobs, self.log = corpus, njobs, log