def test_logger(capsys, level):
    """Check the logger only emits messages at or above ``level``

    Nothing must be written on stdout. On stderr, a message must be
    present if and only if its severity is greater or equal to the
    level the logger has been configured with.

    """
    log = utils.get_logger(level=level)
    log.debug('DEBUG')
    log.info('INFO')
    log.warning('WARNING')
    log.error('ERROR')

    captured = capsys.readouterr()
    assert not captured.out

    severities = {
        'DEBUG': logging.DEBUG,
        'INFO': logging.INFO,
        'WARNING': logging.WARNING,
        'ERROR': logging.ERROR}

    # only the four levels above are checked. Levels are plain integers
    # so they are compared by value, not by identity.
    if level not in severities.values():
        return

    for message, severity in severities.items():
        if severity >= level:
            assert message in captured.err
        else:
            assert message not in captured.err
def test_njobs(capsys, njobs, audio):
    """Check ``process_all`` with null, regular and oversized ``njobs``"""
    get_logger().setLevel(0)
    signals = {'1': audio}
    p = MfccProcessor(sample_rate=audio.sample_rate)

    if njobs == 0:
        with pytest.raises(ValueError) as err:
            p.process_all(signals, njobs=njobs)
        # match the message of the raised exception (err.value), not
        # the string form of the ExceptionInfo wrapper
        assert 'must be strictly positive' in str(err.value)
        return

    features = p.process_all(signals, njobs=njobs)

    # a message is expected on stderr when asking for more jobs than
    # available CPU cores
    if njobs > multiprocessing.cpu_count():
        assert 'CPU cores but reducing to' in capsys.readouterr().err

    assert signals.keys() == features.keys()
def test_process(capsys, audio, mfcc, weights):
    """Check the output and log messages of ``BottleneckProcessor``"""
    get_logger(level='debug')

    processor = BottleneckProcessor(weights=weights)
    bottleneck = processor.process(audio)

    # shape and timestamps of the extracted features
    assert bottleneck.shape == (140, 80)
    assert bottleneck.shape[1] == processor.ndims
    assert np.allclose(bottleneck.times, mfcc.times)

    # framing parameters exposed by the processor
    assert processor.frame_length == 0.025
    assert processor.frame_shift == 0.01
    assert processor.sample_rate == 8000

    # check the log messages
    stderr = capsys.readouterr().err
    assert 'resampling audio from 16000Hz@16b to 8000Hz@16b' in stderr

    # the number of detected speech frames depends on audio._sox_binary
    nspeech = '118' if audio._sox_binary else '121'
    expected = '{} frames of speech detected (on 140 total frames)'.format(
        nspeech)
    assert expected in stderr
def test_concatenate_tolerance(capsys):
    """Check ``Features.concatenate`` with various ``tolerance`` values

    The two features differ by 2 frames: concatenation must fail for
    a tolerance of 0 or 1, and succeed (with a warning, the result
    having 10 frames) for a tolerance of 2.

    """
    get_logger(level='info')
    f1 = Features(np.random.random((12, 2)), np.ones((12, )))
    f2 = Features(np.random.random((10, 2)), np.ones((10, )))

    # the messages are matched against the raised exceptions
    # (err.value), not the string form of the ExceptionInfo wrapper
    with pytest.raises(ValueError) as err:
        f1.concatenate(f2, tolerance=0)
    assert 'features have a different number of frames' in str(err.value)

    with pytest.raises(ValueError) as err:
        f1.concatenate(f2, tolerance=1)
    assert 'features differs number of frames, and greater than ' in str(
        err.value)

    f3 = f1.concatenate(f2, tolerance=2)
    assert f3.shape == (10, 4)
    assert 'WARNING' in capsys.readouterr().err

    f3 = f2.concatenate(f1, tolerance=2)
    assert f3.shape == (10, 4)
    assert 'WARNING' in capsys.readouterr().err
def _extract_pass_one(utt_name, manager, log=get_logger()):
    """First pass of the extraction on a single utterance

    Loads the audio of the utterance, extracts the main features,
    accumulates CMVN statistics and extracts pitch when they are
    present in the manager's configuration, and finally stores
    information on the speaker and the audio input in the features
    properties.

    Returns the tuple ``(utt_name, features, pitch)`` where ``pitch``
    is None when no pitch is configured.

    """
    # load audio signal of the utterance
    log.debug('%s: load audio', utt_name)
    audio = manager.get_audio(utt_name)

    # main features extraction
    log.debug('%s: extract %s', utt_name, manager.features)
    features = manager.get_features_processor(utt_name).process(audio)

    # cmvn accumulation
    if 'cmvn' in manager.config:
        log.debug('%s: accumulate cmvn', utt_name)

        # weight CMVN by voice activity detection (null weights on
        # non-voiced frames), no weights when VAD is disabled
        weights = None
        if manager.config['cmvn']['with_vad']:
            energy = manager.get_energy_processor(utt_name).process(audio)
            vad = manager.get_vad_processor(utt_name).process(energy)
            # reshape as 1d array
            weights = vad.data.reshape((vad.shape[0], ))

        manager.get_cmvn_processor(utt_name).accumulate(
            features, weights=weights)

    # pitch extraction (raw pitch followed by post-processing)
    pitch = None
    if 'pitch' in manager.config:
        log.debug('%s: extract pitch', utt_name)
        p1 = manager.get_pitch_processor(utt_name)
        p2 = manager.get_pitch_post_processor(utt_name)
        pitch = p2.process(p1.process(audio))

    # add info on speaker and audio input on the features properties
    utterance = manager.utterances[utt_name]
    if utterance.speaker:
        features.properties['speaker'] = utterance.speaker

    metadata = manager._wavs_metadata[utterance.file]
    audio_properties = {
        'file': os.path.abspath(utterance.file),
        'sample_rate': metadata.sample_rate}

    if utterance.tstart is None:
        # the whole wav file is used
        audio_properties['duration'] = metadata.duration
    else:
        # a segment of the wav is used, its duration is bounded by the
        # end of the file
        audio_properties['tstart'] = utterance.tstart
        audio_properties['tstop'] = utterance.tstop
        audio_properties['duration'] = min(
            utterance.tstop - utterance.tstart,
            metadata.duration - utterance.tstart)

    features.properties['audio'] = audio_properties

    return utt_name, features, pitch
def _check_environment(njobs, log=get_logger()):
    """Warns if implicit parallelism may compete with the parallel jobs

    Does nothing when ``njobs`` is 1. Otherwise reads the
    OMP_NUM_THREADS environment variable and logs a warning if it is
    not set, cannot be read as an integer, or is not equal to 1.

    Parameters
    ----------
    njobs : int
        The number of parallel jobs requested.
    log : logging.Logger, optional
        The logger where to send the warning.

    """
    if njobs == 1:
        return

    # OMP_NUM_THREADS can be undefined or, as allowed by OpenMP, be a
    # comma separated list of values (one per nesting level). Only the
    # first value is considered here, anything that is not an integer
    # is handled as an undefined variable instead of raising.
    value = os.environ.get('OMP_NUM_THREADS', '')
    try:
        nthreads = int(value.split(',')[0])
    except ValueError:
        nthreads = None

    if not nthreads or nthreads > 1:
        log.warning(
            'working on %s threads but implicit parallelism is active, '
            'this may slow down the processing. Set the environment variable '
            'OMP_NUM_THREADS=1 to disable this warning', njobs)
def command_extract(args):
    """Extracts features as specified by the command line arguments

    Setups the logger from ``args.quiet`` and ``args.verbose``, checks
    the output file ``args.output_file`` and the input files
    ``args.config`` and ``args.utts_index``, runs the extraction
    pipeline on ``args.njobs`` jobs and saves the features.

    Any error on the arguments is logged and makes the function
    return early, without extracting anything.

    """
    # setup the logger (level given by -q/-v arguments)
    if args.quiet:
        log = utils.null_logger()
    else:
        if args.verbose == 0:
            level = 'warning'
        elif args.verbose == 1:
            level = 'info'
        else:  # verbose >= 2
            level = 'debug'
        log = utils.get_logger(name='speech-features', level=level)

    # forward the initialized log to shennong
    utils._logger = log

    # make sure the output file is not already existing and have a
    # valid extension
    output_file = args.output_file
    if os.path.exists(output_file):
        log.error('output file already exist: %s', output_file)
        return

    output_ext = os.path.splitext(output_file)[1]
    if output_ext not in supported_extensions().keys():
        log.error(
            'output file has an unsupported extension "%s", must be in %s',
            output_ext, ", ".join(supported_extensions().keys()))
        return

    # make sure the input config and utterances index exist. All the
    # missing files are reported before returning, instead of going on
    # and failing later when reading them.
    missing = [
        filename for filename in (args.config, args.utts_index)
        if not os.path.exists(filename)]
    for filename in missing:
        log.error('input file not found: %s', filename)
    if missing:
        return

    # read the utterances file as a list of lists, ignore empty lines
    # in the file (the file is closed once read)
    with open(args.utts_index, 'r') as index:
        utterances = [
            utt.split(' ') for utt in (line.strip() for line in index)
            if utt]

    # run the pipeline
    features = pipeline.extract_features(
        args.config, utterances, njobs=args.njobs, log=log)

    # save the features
    log.info('saving the features to %s', output_file)
    features.save(output_file)
def test_config_format(utterances_index, capsys, tmpdir, kind):
    """Check ``_init_config`` on a dict, a YAML string or a file"""
    config = pipeline.get_default_config('mfcc', to_yaml=kind != 'dict')

    if kind == 'file':
        # write the YAML to a file and give the filename as config
        # (the file is explicitly closed before being read back)
        config_file = str(tmpdir.join('foo'))
        with open(config_file, 'w') as stream:
            stream.write(config)
        config = config_file

    if kind == 'str':
        # a badly formatted YAML string must be rejected
        config2 = 'a:\nb\n'
        with pytest.raises(ValueError) as err:
            pipeline._init_config(config2)
        # match against the raised exception, not the ExceptionInfo
        assert 'error in configuration' in str(err.value)

    parsed = pipeline._init_config(config, log=utils.get_logger(level='info'))
    output = capsys.readouterr().err
    for word in ('mfcc', 'pitch', 'cmvn', 'delta'):
        assert word in output
        assert word in parsed
def __init__(self, config, utterances, log=get_logger()):
    """Initializes the manager from a configuration and utterances

    Stores the ``config``, ``utterances`` and ``log``, builds and
    checks the set of speakers, scans the metadata of each wav file,
    retrieves the features type and the framing parameters and, when
    'cmvn' is in the configuration, instanciates the CMVN processors.

    """
    self._config = config
    self._utterances = utterances
    self.log = log

    # the set of speakers, or None if there is no speaker information
    speakers = {utt.speaker for utt in self.utterances.values()}
    self._speakers = None if speakers == {None} else speakers
    self._check_speakers()

    # store the metadata because we need to access the sample rate
    # for processors instanciation
    wav_files = {utt.file for utt in utterances.values()}
    self._wavs_metadata = {wav: Audio.scan(wav) for wav in wav_files}

    # make sure all the wavs are compatible with the pipeline
    log.info(f'scanning {len(self._utterances)} utterances...')
    self._check_wavs()

    # the features type to be extracted, the first entry of the
    # configuration being a valid features name
    candidates = [
        name for name in self.config.keys()
        if name in self._valid_features]
    self.features = candidates[0]

    # get some framing parameters constant for all processors
    # (retrieve them from a features processor instance)
    first_utterance = next(iter(self.utterances.keys()))
    processor = self.get_features_processor(first_utterance)
    self.frame_length = processor.frame_length
    self.frame_shift = processor.frame_shift

    # if CMVN by speaker, instanciate a CMVN processor by speaker
    # here, else instanciate a processor per utterance
    if 'cmvn' in self.config:
        if self.config['cmvn']['by_speaker']:
            keys = self.speakers
        else:
            keys = self.utterances

        self._cmvn_processors = {
            key: self.get_processor_class('cmvn')(processor.ndims)
            for key in keys}
def main():
    """Extracts features on a corpus, from command line arguments

    Reads the utterances from ``<data_directory>/<corpus>.utts``,
    extracts the features as specified by the configuration file and
    saves them in the ``<data_directory>/features`` directory. The
    output file is named after the corpus and the configuration file,
    '.yaml' being replaced by '.h5f'.

    Raises
    ------
    ValueError
        If the data directory or the configuration file do not exist.

    """
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('data_directory', help='input/output data directory')
    parser.add_argument('config_file', help='YAML configuration file')
    parser.add_argument(
        'corpus', choices=['english', 'xitsonga'], help='corpus to process')
    parser.add_argument(
        '-j', '--njobs', type=int, default=4, metavar='<int>',
        help='number of parallel jobs (default to %(default)s)')
    parser.add_argument(
        '-v', '--verbose', action='store_true', help='increase log level')
    args = parser.parse_args()

    # check and setup arguments
    data_directory = args.data_directory
    if not os.path.isdir(data_directory):
        raise ValueError(f'directory not found: {data_directory}')

    config = args.config_file
    if not os.path.isfile(config):
        raise ValueError(f'file not found: {config}')

    # create the output directory if it is not already existing
    os.makedirs(os.path.join(data_directory, 'features'), exist_ok=True)

    log = get_logger(level='debug' if args.verbose else 'info')

    # load input utterances, one per line. Lines are stripped so as the
    # last field does not include the end of line character, empty
    # lines are ignored and the file is closed once read.
    utts_file = os.path.join(data_directory, f'{args.corpus}.utts')
    with open(utts_file, 'r') as index:
        utterances = [
            utt.split(' ') for utt in (line.strip() for line in index)
            if utt]

    # extract the features
    features = pipeline.extract_features(
        config, utterances, njobs=args.njobs, log=log)

    # save them
    h5f_file = os.path.join(
        data_directory, 'features',
        f'{args.corpus}_{os.path.basename(config)}'
        .replace('.yaml', '.h5f'))
    features.save(h5f_file)
def test_check_speakers(utterances_index, capsys):
    """Check the pipeline messages related to speakers and CMVN"""
    log = utils.get_logger(level='info')

    # the default configuration is rejected on an index without
    # speaker information
    config = pipeline.get_default_config('mfcc')
    with pytest.raises(ValueError) as err:
        pipeline.extract_features(
            config, [(utterances_index[0][1], )], log=log)
    # match against the raised exception, not the ExceptionInfo
    assert 'no speaker information provided' in str(err.value)

    capsys.readouterr()  # clean the buffer

    # without CMVN
    config = pipeline.get_default_config('mfcc', with_cmvn=False)
    pipeline.extract_features(config, utterances_index, log=log)
    log_out = capsys.readouterr()
    assert 'cmvn' not in log_out.err
    assert '(CMVN disabled)' in log_out.err

    # with CMVN by utterance
    config = pipeline.get_default_config('mfcc', with_cmvn=True)
    config['cmvn']['by_speaker'] = False
    pipeline.extract_features(config, utterances_index, log=log)
    log_out = capsys.readouterr().err
    assert 'cmvn by utterance' in log_out
    assert '(CMVN by speaker disabled)' in log_out
def _extract_pass_two(utt_name, manager, features, pitch,
                      tolerance=2, log=get_logger()):
    """Second pass of the extraction on a single utterance

    Applies CMVN and delta to the ``features`` when they are present
    in the manager's configuration, then concatenates the ``pitch``
    to them (skipped when ``pitch`` is false, e.g. None).

    Returns the tuple ``(utt_name, features)``.

    """
    config = manager.config

    # apply cmvn
    if 'cmvn' in config:
        log.debug('%s: apply cmvn', utt_name)
        cmvn = manager.get_cmvn_processor(utt_name)
        features = cmvn.process(features)

    # apply delta
    if 'delta' in config:
        log.debug('%s: apply delta', utt_name)
        delta = manager.get_delta_processor(utt_name)
        features = delta.process(features)

    if not pitch:
        return utt_name, features

    # concatenate the pitch features to the main ones. because of
    # downsampling in pitch processing the resulting number of frames
    # can differ (the same tolerance is applied in Kaldi, see
    # the paste-feats binary)
    log.debug('%s: concatenate pitch', utt_name)
    features._log = log
    return utt_name, features.concatenate(pitch, tolerance=tolerance)
def _extract_features(config, utterances, njobs=1, log=get_logger()):
    """Runs the extraction on all the utterances, using ``njobs`` jobs

    When 'cmvn' is in ``config`` the extraction is done in two
    passes, else in a single one. Returns the extracted features as a
    ``FeaturesCollection`` indexed by utterances.

    """
    # the manager will instanciate the pipeline components
    manager = _Manager(config, utterances, log=log)

    # verbosity level for joblib (no joblib verbosity on debug level
    # (level <= 10) because each step is already detailed in inner
    # loops
    verbose = 0 if log.getEffectiveLevel() <= 10 else 8

    def parallel(name):
        # all the steps share the same parallel setup but their name
        return _Parallel(
            name, log, n_jobs=njobs, verbose=verbose, prefer='threads')

    # no cmvn: single pass
    if 'cmvn' not in config:
        single_pass = parallel('features extraction')(
            joblib.delayed(_extract_single_pass)(
                utterance, manager, log=log)
            for utterance in utterances)
        return FeaturesCollection(**dict(single_pass))

    # cmvn : two passes. 1st with features pitch and cmvn
    # accumulation, 2nd with cmvn application and delta

    # extract features and pitch, accumulate cmvn stats
    pass_one = parallel('features extraction, pass 1')(
        joblib.delayed(_extract_pass_one)(
            utterance, manager, log=log)
        for utterance in utterances)

    # apply cmvn and extract deltas
    pass_two = parallel('features extraction, pass 2')(
        joblib.delayed(_extract_pass_two)(
            utterance, manager, features, pitch, log=log)
        for utterance, features, pitch in pass_one)

    return FeaturesCollection(**dict(pass_two))
def test_extract_features_full(
        ext, wav_file, wav_file_8k, wav_file_float32, capsys, tmpdir):
    """Check the whole pipeline on a difficult case

    Checks the log messages, the properties, the shape and the CMVN
    normalization of the extracted features, then makes sure they are
    preserved by a save/load round trip.

    """
    # difficult case with parallel jobs, different sampling rates,
    # speakers and segments
    index = [
        ('u1', wav_file, 's1', 0, 1),
        ('u2', wav_file_float32, 's2', 1, 1.2),
        ('u3', wav_file_8k, 's1', 1, 3)]
    names = ('u1', 'u2', 'u3')

    config = pipeline.get_default_config('mfcc')
    # disable VAD because it can alter the cmvn result (far from (0,
    # 1) when the signal includes non-voiced frames)
    config['cmvn']['with_vad'] = False

    feats = pipeline.extract_features(
        config, index, njobs=2, log=utils.get_logger())

    # ensure we have the expected log messages
    messages = capsys.readouterr().err
    for expected in (
            'INFO - get 3 utterances from 2 speakers in 3 wavs',
            'WARNING - several sample rates found in wav files'):
        assert expected in messages

    for utt in names:
        assert utt in feats
        assert feats[utt].dtype == np.float32

    # check properies
    p1, p2, p3 = (feats[utt].properties for utt in names)

    assert p1['audio']['file'] == wav_file
    assert p1['audio']['duration'] == 1.0
    assert p2['audio']['file'] == wav_file_float32
    assert p2['audio']['duration'] == pytest.approx(0.2)
    assert p3['audio']['file'] == wav_file_8k
    assert p3['audio']['duration'] < 0.5  # ask 3s but get duration-tstart

    assert p1['mfcc'] == p2['mfcc']
    assert p1['mfcc']['sample_rate'] != p3['mfcc']['sample_rate']

    assert p1.keys() == {
        'audio', 'mfcc', 'cmvn', 'pitch', 'delta', 'speaker', 'pipeline'}
    assert p1.keys() == p2.keys() == p3.keys()
    assert p1['pipeline'] == p2['pipeline'] == p3['pipeline']

    # check shape. mfcc*delta + pitch = 13 * 3 + 3 = 42
    shapes = {'u1': (98, 42), 'u2': (18, 42), 'u3': (40, 42)}
    for utt, shape in shapes.items():
        assert feats[utt].shape == shape

    # check cmvn, on the first 13 columns only
    mfcc_u1 = feats['u1'].data[:, :13]
    mfcc_u2 = feats['u2'].data[:, :13]
    mfcc_u3 = feats['u3'].data[:, :13]

    # u2 is the only utterance of its speaker
    assert mfcc_u2.mean() == pytest.approx(0.0, abs=1e-6)
    assert mfcc_u2.std() == pytest.approx(1.0, abs=1e-6)

    # u1 and u3 share the same speaker and are normalized together
    data = np.vstack((mfcc_u1, mfcc_u3))
    assert data.mean() == pytest.approx(0.0, abs=1e-6)
    assert data.std() == pytest.approx(1.0, abs=1e-6)

    for single in (mfcc_u1, mfcc_u3):
        assert np.abs(data.mean()) <= np.abs(single.mean())
        assert np.abs(data.std() - 1.0) <= np.abs(single.std() - 1.0)

    # save / load the features
    filename = str(tmpdir.join('feats' + ext))
    feats.save(filename)
    feats2 = FeaturesCollection.load(filename)
    assert feats2 == feats
def test_check_environment(capsys):
    """Check a warning is issued when OMP_NUM_THREADS is not defined"""
    # make sure the variable is not defined
    os.environ.pop('OMP_NUM_THREADS', None)

    pipeline._check_environment(2, log=utils.get_logger())

    stderr = capsys.readouterr().err
    expected = 'working on 2 threads but implicit parallelism is active'
    assert expected in stderr
def _init_config(config, log=get_logger()):
    """Loads, validates and completes a pipeline configuration

    Parameters
    ----------
    config : dict or str
        The configuration as a dictionary, the path to an existing
        file, or a string in YAML format.
    log : logging.Logger, optional
        The logger where to send messages.

    Returns
    -------
    config : dict
        The validated configuration. Missing 'by_speaker' and
        'with_vad' entries in 'cmvn', and a missing 'postprocessing'
        entry in 'pitch', are added to it. A configuration given as a
        dictionary is completed in place.

    Raises
    ------
    ValueError
        If the YAML cannot be parsed, if the configuration has unknown
        keys, or if it does not define one and only one features
        extraction.

    """
    try:
        if os.path.isfile(config):
            log.debug('loading configuration from %s', config)
            # make sure the file is closed once read
            with open(config, 'r') as stream:
                config = stream.read()
    except TypeError:
        # the config is not a filename (raised by isfile on a dict)
        pass

    if isinstance(config, str):
        # the config is a string, try to load it as a YAML
        # NOTE(review): yaml.load is applied on user-provided content,
        # consider yaml.safe_load if only plain YAML is expected
        try:
            config = yaml.load(config, Loader=yaml.FullLoader)
        except yaml.YAMLError as err:
            # the YAML error is formatted into the message (it was
            # previously passed as a second argument to ValueError, the
            # placeholder being left unformatted)
            raise ValueError(
                'error in configuration: {}'.format(err)) from err

    # ensure all the keys in config are known
    unknown_keys = [
        k for k in config.keys() if k not in _Manager._valid_processors]
    if unknown_keys:
        raise ValueError(
            'invalid keys in configuration: {}'.format(
                ', '.join(unknown_keys)))

    # ensure one and only one features processor is defined in the
    # configuration
    features = [k for k in config.keys() if k in valid_features()]
    if not features:
        raise ValueError(
            'the configuration does not define any features extraction, '
            'only post-processing (must have one and only one entry of {})'
            .format(', '.join(valid_features())))
    if len(features) > 1:
        raise ValueError(
            'more than one features extraction processors are defined, '
            '(must have one and only one entry of {}): {}'
            .format(', '.join(valid_features()), ', '.join(features)))

    if 'cmvn' in config:
        # force by_speaker to False if not existing
        if 'by_speaker' not in config['cmvn']:
            log.warning(
                'by_speaker option not specified for cmvn, '
                'assuming it is false and doing cmvn by utterance')
            config['cmvn']['by_speaker'] = False
        # force with_vad to True if not existing
        if 'with_vad' not in config['cmvn']:
            config['cmvn']['with_vad'] = True

    # if pitch, make sure we have a 'postprocessing' entry
    if 'pitch' in config and 'postprocessing' not in config['pitch']:
        config['pitch']['postprocessing'] = {}

    # log message describing the pipeline configuration
    msg = []
    if 'pitch' in config:
        msg.append('pitch')
    if 'delta' in config:
        msg.append('delta')
    if 'cmvn' in config:
        by = 'speaker' if config['cmvn']['by_speaker'] else 'utterance'
        vad = ' with vad' if config['cmvn']['with_vad'] else ''
        msg.append('cmvn by {}{}'.format(by, vad))

    log.info(
        'pipeline configured for %s features extraction%s',
        features[0], ' with {}'.format(', '.join(msg)) if msg else '')

    return config
def _extract_single_pass(utt_name, manager, log=get_logger()):
    """Extracts the features of a single utterance in one go

    Chains the two extraction passes on the utterance and returns the
    result of the second one.

    """
    # the utterance name returned by the first pass is the one given
    # as argument, it is ignored
    _name, features, pitch = _extract_pass_one(utt_name, manager, log=log)

    return _extract_pass_two(
        utt_name, manager, features, pitch, log=log)
def _init_utterances(utts_index, log=get_logger()):
    """Returns a dict {utt_id: _Utterance(file, speaker, tstart, tstop)}

    Each entry of ``utts_index`` is a string (a wav file) or a
    sequence of 1 to 5 elements, all the entries having the same
    length. When not specified in the index, the speaker, tstart and
    tstop are set to None. With a single element per entry, the
    utterances are named 'utt_1', 'utt_2', etc.

    Raises a ValueError if the index is empty, not homogeneous, in an
    unknown format, has duplicated utterances or if a wav file is not
    found.

    """
    # an entry given as a string is a wav file alone
    utts = [(u,) if isinstance(u, str) else u for u in utts_index]

    # an empty index has no format to detect, report it explicitly
    # instead of failing on the homogeneity check below
    if not utts:
        raise ValueError('the utterances index is empty')

    # guess the format of the index and ensure it is homogeneous
    index_format = {len(u) for u in utts}
    if len(index_format) != 1:
        raise ValueError(
            'the wavs index is not homogeneous, entries have different '
            'lengths: {}'.format(', '.join(str(t) for t in index_format)))
    index_format = index_format.pop()

    # ensure the utterances index format is valid
    valid_formats = {
        1: '<wav-file>',
        2: '<utterance-id> <wav-file>',
        3: '<utterance-id> <wav-file> <speaker-id>',
        4: '<utterance-id> <wav-file> <tstart> <tstop>',
        5: '<utterance-id> <wav-file> <speaker-id> <tstart> <tstop>'}
    if index_format not in valid_formats:
        raise ValueError('unknown format for utterances index')
    log.info(
        'detected format for utterances index is: %s',
        valid_formats[index_format])

    # ensure 1st column has unique elements
    counts = collections.Counter(u[0] for u in utts)
    duplicates = [u for u, c in counts.items() if c > 1]
    if duplicates:
        raise ValueError(
            'duplicates found in utterances index: {}'.format(
                ', '.join(str(d) for d in duplicates)))

    # sort the utterances by wav_file (and then by utt_id), this
    # is a minor optimization to use the cache system of Audio.load(),
    # ie this avoids to reload several times the same wav when using
    # tstart/tstop segments.
    utts = sorted(
        utts, key=lambda u: u if index_format == 1 else (u[1], u[0]))

    # position of the optional fields in an entry: the speaker is in
    # third position, tstart and tstop are the two last fields
    has_speaker = index_format in (3, 5)
    has_times = index_format in (4, 5)

    # build the utterances collection as a dict
    # {utt_id: (wav_file, speaker_id, tstart, tstop)}
    utterances = {}
    for n, utt in enumerate(utts, start=1):
        if index_format == 1:
            utt_id = 'utt_{}'.format(str(n))
            wav_file = utt[0]
        else:
            utt_id = utt[0]
            wav_file = utt[1]

        utterances[utt_id] = _Utterance(
            file=wav_file,
            speaker=utt[2] if has_speaker else None,
            tstart=float(utt[-2]) if has_times else None,
            tstop=float(utt[-1]) if has_times else None)

    # ensure all the wavs are here
    wavs = [w.file for w in utterances.values()]
    not_found = [w for w in wavs if not os.path.isfile(w)]
    if not_found:
        raise ValueError(
            'the following wav files are not found: {}'
            .format(', '.join(not_found)))

    return utterances
def test_logger_bad_level():
    """Check an invalid logging level is rejected with a ValueError"""
    with pytest.raises(ValueError) as excinfo:
        utils.get_logger(level='bad')

    message = str(excinfo.value)
    assert 'invalid logging level' in message
'https://raw.githubusercontent.com/bootphon/ABXpy/' 'zerospeech2015/resources/english.item') XITSONGA_ITEM = ( 'https://raw.githubusercontent.com/bootphon/ABXpy/' 'zerospeech2015/resources/xitsonga.item') ENGLISH_FILES_LIST = ( 'https://raw.githubusercontent.com/bootphon/' 'Zerospeech2015/master/english_files.txt') XITSONGA_FILES_LIST = ( 'https://raw.githubusercontent.com/bootphon/' 'Zerospeech2015/master/xitsonga_files.txt') log = get_logger(level='info') def setup_data(data_directory, buckeye_directory, xitsonga_directory): """Setup a data directory with all input data required * creates the ``data_directory`` * make a symlink to ``buckeye_directory`` and ``xitsonga_directory`` in it * download the ABX item files for buckeye and xitsonga * create the list of utterances both corpora * create the configuration files for features extraction """ # basic checks if not os.path.isdir(buckeye_directory): raise ValueError(f'directory does not exists: {buckeye_directory}')
class FeaturesSerializer(metaclass=abc.ABCMeta):
    """Base class of a features file serializer

    This class must be specialized to handle a given file type. The
    child classes implement the :meth:`_save` and :meth:`_load`
    methods, which are called by :meth:`save` and :meth:`load` once
    the file and the features have been checked.

    Parameters
    ----------
    cls : class
        Must be :class:`shennong.features.FeaturesCollection`, this is
        a tweak to avoid circular imports
    filename : str
        The file to save/load features to/from

    """
    _log = get_logger(__name__)

    def __init__(self, cls, filename):
        self._features_collection = cls
        self._features = self._features_collection._value_type
        self._filename = filename

    @property
    def filename(self):
        """The file to save/load features to/from"""
        return self._filename

    # the abstract methods accept the supplementary arguments forwarded
    # by save() and load(), each serializer being free to declare the
    # ones it supports

    @abc.abstractmethod
    def _save(self, features, **kwargs):  # pragma: nocover
        pass

    @abc.abstractmethod
    def _load(self, **kwargs):  # pragma: nocover
        pass

    def load(self, **kwargs):
        """Returns a collection of features from the `filename`

        Parameters
        ----------
        kwargs : optional
            Optional supplementary arguments, specific to each
            serializer.

        Returns
        -------
        features : :class:`~shennong.features.FeaturesCollection`
            The features stored in the file.

        Raises
        ------
        IOError
            If the input file does not exist or cannot be read.
        ValueError
            If the features cannot be loaded from the file or are not
            in a valid state.

        """
        if not os.path.isfile(self.filename):
            raise IOError('file not found: {}'.format(self.filename))
        if not os.access(self.filename, os.R_OK):
            raise IOError('file not readable: {}'.format(self.filename))

        features = self._load(**kwargs)

        if not features.is_valid():
            raise ValueError('features not valid in file: {}'.format(
                self.filename))

        return features

    def save(self, features, **kwargs):
        """Saves a collection of `features` to a file

        Parameters
        ----------
        features : :class:`~shennong.features.FeaturesCollection`
            The features to store in the file.
        kwargs : optional
            Optional supplementary arguments, specific to each
            serializer.

        Raises
        ------
        IOError
            If the output file already exists.
        ValueError
            If the features cannot be saved to the file, are not in a
            valid state or are not an instance of
            :class:`~shennong.features.FeaturesCollection`.

        """
        if os.path.isfile(self.filename):
            raise IOError('file already exists: {}'.format(self.filename))

        if not isinstance(features, self._features_collection):
            raise ValueError('features must be {} but are {}'.format(
                self._features_collection.__name__,
                features.__class__.__name__))

        if not features.is_valid():
            raise ValueError('features are not valid')

        self._save(features, **kwargs)
def extract_features(configuration, utterances_index,
                     njobs=1, log=get_logger()):
    """Speech features extraction pipeline

    Applies the pipeline defined by ``configuration`` on the
    utterances listed in ``utterances_index``, using ``njobs``
    parallel jobs, and returns the extracted features as an instance
    of :class:`~shennong.features.features.FeaturesCollection`.

    An utterance of the ``utterances_index`` is defined in one of the
    following formats. The format must be homogeneous across the
    index, i.e. a single format is used for all the utterances:

    * 1-uple (or str): ``<wav-file>``
    * 2-uple: ``<utterance-id> <wav-file>``
    * 3-uple: ``<utterance-id> <wav-file> <speaker-id>``
    * 4-uple: ``<utterance-id> <wav-file> <tstart> <tstop>``
    * 5-uple: ``<utterance-id> <wav-file> <speaker-id> <tstart> <tstop>``

    Parameters
    ----------
    configuration : dict or str
        The pipeline configuration, can be a dictionary, a path to a
        YAML file or a string formatted in YAML. To get a
        configuration example, see :func:`get_default_config`
    utterances_index : sequence of tuples
        The list of utterances to extract the features on.
    njobs : int, optional
        The number of jobs to execute in parallel, use a single job
        by default.
    log : logging.Logger
        A logger to display messages during pipeline execution

    Returns
    -------
    features : :class:`~shennong.features.features.FeaturesCollection`
       The extracted speech features

    Raises
    ------
    ValueError
        If the ``configuration`` or the ``utterances_index`` are
        invalid, or if something goes wrong during features
        extraction.

    """
    # validate the number of jobs, then initialize the pipeline
    # configuration and the utterances to process, making all the
    # checks to ensure all is correct
    checked_njobs = get_njobs(njobs, log=log)
    pipeline_config = _init_config(configuration, log=log)
    pipeline_utterances = _init_utterances(utterances_index, log=log)

    # check the OMP_NUM_THREADS variable for parallel computations
    _check_environment(checked_njobs, log=log)

    # do all the computations
    extracted = _extract_features(
        pipeline_config, pipeline_utterances,
        njobs=checked_njobs, log=log)
    return extracted