def test_kaldiserializer_badfile(tmpdir, mfcc_col, missing):
    filename = str(tmpdir.join('foo.ark'))
    mfcc_col.save(filename)

    # delete one of the files required to load the collection
    os.remove(str(tmpdir.join(missing)))

    with pytest.raises(IOError) as err:
        FeaturesCollection.load(filename)
    assert 'file not found: {}'.format(
        str(tmpdir.join(missing))) in str(err.value)
@pytest.fixture
def mfcc_utf8(mfcc):
    # a collection with UTF-8 content in properties and item names, to
    # check the serializers handle non-ASCII data
    props = mfcc.properties
    props['comments'] = '使用人口について正確な統計はないが、日本国'

    feats = FeaturesCollection()
    feats['æðÐ'] = Features(mfcc.data, mfcc.times, props)
    return feats
def test_save_invalid(tmpdir, mfcc):
    f = str(tmpdir.join('foo.json'))
    h = serializers.get_serializer(FeaturesCollection, f, None)
    feats = FeaturesCollection(
        mfcc=Features(data=mfcc.data, times=0, validate=False))

    with pytest.raises(ValueError) as err:
        h.save(feats)
    assert 'features are not valid' in str(err.value)
def test_apply_baddim(features_collection):
    feats = FeaturesCollection(
        {k: v.copy() for k, v in features_collection.items()})

    # add an item with a dimension different from the others
    feats['new'] = Features(np.random.random((2, 1)), np.asarray([0, 1]))

    with pytest.raises(ValueError) as err:
        apply_cmvn(feats)
    assert 'must have consistent dimensions' in str(err.value)
@pytest.fixture
def features_collection():
    # build a collection of 3 random features with the same ndims but
    # various nframes
    dim = 10
    feats = FeaturesCollection()
    for n in range(3):
        nframes = np.random.randint(5, 15)
        feats[str(n)] = Features(
            np.random.random((nframes, dim)), np.arange(0, nframes))
    return feats
def test_collection_isclose():
    f1 = Features(np.random.random((10, 2)), np.ones((10,)))
    f2 = Features(np.random.random((10, 2)), np.ones((10,)))

    fc1 = FeaturesCollection(f1=f1, f2=f2)
    fc2 = FeaturesCollection(f1=f1, f2=Features(f2.data + 1, f2.times))
    fc3 = FeaturesCollection(f1=f1, f3=f2)

    assert fc1.is_close(fc1)
    assert not fc1.is_close(fc2)
    assert fc1.is_close(fc2, atol=1)
    assert not fc1.is_close(fc3)
def test_kaldiserializer(mfcc_col, tmpdir, scp):
    mfcc_col.save(str(tmpdir.join('foo.ark')), scp=scp)

    assert os.path.isfile(str(tmpdir.join('foo.ark')))
    assert os.path.isfile(str(tmpdir.join('foo.times.ark')))
    assert os.path.isfile(str(tmpdir.join('foo.properties.json')))
    if scp:
        assert os.path.isfile(str(tmpdir.join('foo.scp')))
        assert os.path.isfile(str(tmpdir.join('foo.times.scp')))

    mfcc_col2 = FeaturesCollection.load(str(tmpdir.join('foo.ark')))
    assert mfcc_col2 == mfcc_col
def test_times_1d(serializer, tmpdir):
    filename = ('feats.ark' if serializer is serializers.KaldiSerializer
                else 'feats')
    tmpfile = str(tmpdir.join(filename))

    p = MfccProcessor()
    times = p.times(10)[:, 1]
    assert times.shape == (10,)

    col = FeaturesCollection(mfcc=Features(np.random.random((10, 5)), times))
    serializer(col.__class__, tmpfile).save(col)
    col2 = serializer(col.__class__, tmpfile).load()
    assert col == col2
def _extract_features(config, utterances, njobs=1, log=get_logger()):
    # the manager will instantiate the pipeline components
    manager = _Manager(config, utterances, log=log)

    # verbosity level for joblib (no joblib verbosity on debug level
    # (level <= 10) because each step is already detailed in inner
    # loops)
    verbose = 8 if log.getEffectiveLevel() > 10 else 0

    # cmvn: two passes, the first with features and pitch extraction
    # and cmvn accumulation, the second with cmvn application and
    # delta extraction
    if 'cmvn' in config:
        # extract features and pitch, accumulate cmvn stats
        pass_one = _Parallel(
            'features extraction, pass 1', log,
            n_jobs=njobs, verbose=verbose, prefer='threads')(
                joblib.delayed(_extract_pass_one)(utterance, manager, log=log)
                for utterance in utterances)

        # apply cmvn and extract deltas
        features = FeaturesCollection(**{k: v for k, v in _Parallel(
            'features extraction, pass 2', log,
            n_jobs=njobs, verbose=verbose, prefer='threads')(
                joblib.delayed(_extract_pass_two)(
                    utterance, manager, features, pitch, log=log)
                for utterance, features, pitch in pass_one)})

    # no cmvn: single pass
    else:
        features = FeaturesCollection(**{k: v for k, v in _Parallel(
            'features extraction', log,
            n_jobs=njobs, verbose=verbose, prefer='threads')(
                joblib.delayed(_extract_single_pass)(
                    utterance, manager, log=log)
                for utterance in utterances)})

    return features
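# For reference, a hedged sketch of how this private helper is reached
# through the public pipeline API. The index format and config calls
# mirror test_extract_features_full below; the import path and the wav
# path are assumptions for illustration.
def _demo_extract_features():
    from shennong.features import pipeline

    # the default 'mfcc' config contains a 'cmvn' entry, so this goes
    # through the two-pass branch above
    config = pipeline.get_default_config('mfcc')
    index = [('utt1', '/path/to/utt1.wav', 'speaker1', 0, 1)]
    return pipeline.extract_features(config, index, njobs=2)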
def test_kaldiserializer_baditems(tmpdir, mfcc_col):
    mfcc_col2 = FeaturesCollection(
        one=mfcc_col['mfcc'], two=mfcc_col['mfcc'])
    mfcc_col.save(str(tmpdir.join('one.ark')))
    mfcc_col2.save(str(tmpdir.join('two.ark')))

    # overwrite the times file of the second collection with the one
    # from the first: items no longer match
    os.remove(str(tmpdir.join('two.times.ark')))
    shutil.copyfile(
        str(tmpdir.join('one.times.ark')),
        str(tmpdir.join('two.times.ark')))
    with pytest.raises(ValueError) as err:
        FeaturesCollection.load(str(tmpdir.join('two.ark')))
    assert 'items differ in data and times' in str(err.value)

    # same corruption with the properties file
    os.remove(str(tmpdir.join('one.properties.json')))
    shutil.copyfile(
        str(tmpdir.join('two.properties.json')),
        str(tmpdir.join('one.properties.json')))
    with pytest.raises(ValueError) as err:
        FeaturesCollection.load(str(tmpdir.join('one.ark')))
    assert 'items differ in data and properties' in str(err.value)
def test_heterogeneous(mfcc, serializer, tmpdir):
    mfcc_col = FeaturesCollection(
        mfcc32=mfcc, mfcc64=mfcc.copy(dtype=np.float64))
    filename = ('feats.ark' if serializer is serializers.KaldiSerializer
                else 'feats')
    h = serializer(mfcc_col.__class__, str(tmpdir.join(filename)))

    # h5features does not support heterogeneous data
    if serializer is serializers.H5featuresSerializer:
        with pytest.raises(IOError) as err:
            h.save(mfcc_col)
        assert 'data is not appendable to the group' in str(err.value)
    else:
        h.save(mfcc_col)
        mfcc2 = h.load()
        assert mfcc2 == mfcc_col
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('data_dir', help='input directory with wavs')
    parser.add_argument(
        'output_dir', default='/tmp', nargs='?',
        help='output directory (created files are deleted at exit)')
    args = parser.parse_args()

    # load audio data and compute total duration
    audio_data = {
        os.path.basename(f): Audio.load(f)
        for f in list_files_with_extension(args.data_dir, '.wav')}
    total_duration = datetime.timedelta(
        seconds=int(sum(a.duration for a in audio_data.values())))
    print('found {} wav files, total duration of {}'.format(
        len(audio_data), str(total_duration)))

    # compute the features (default MFCC)
    print('computing MFCC features...')
    t1 = datetime.datetime.now()
    processor = MfccProcessor()
    features = FeaturesCollection(
        **{k: processor.process(v) for k, v in audio_data.items()})
    t2 = datetime.datetime.now()
    print('took {}'.format(t2 - t1))

    # save the features in all the supported formats
    data = {
        'duration': total_duration,
        'data': {
            ext: analyze_serializer(features, ext, args.output_dir)
            for ext in supported_extensions().keys()}}

    print_results(data)
def process_all(self, signals, njobs=None):
    """Returns features processed from several input `signals`

    This function processes the features in parallel jobs.

    Parameters
    ----------
    signals : dict of :class:`~shennong.audio.Audio`
        A dictionary of input audio signals to process features on,
        where the keys are item names and the values are audio
        signals.
    njobs : int, optional
        The number of parallel jobs to run in background. Default to
        the number of CPU cores available on the machine.

    Returns
    -------
    features : :class:`~shennong.features.features.FeaturesCollection`
        The computed features on each input signal. The keys of the
        output `features` are the keys of the input `signals`.

    Raises
    ------
    ValueError
        If the `njobs` parameter is <= 0

    """
    # check the number of background jobs
    njobs = get_njobs(njobs, log=self._log)

    def _process_one(name, signal):
        return name, self.process(signal)

    return FeaturesCollection(**{k: v for k, v in joblib.Parallel(
        n_jobs=njobs, verbose=0, backend='threading')(
            joblib.delayed(_process_one)(name, signal)
            for name, signal in signals.items())})
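# A minimal usage sketch for process_all (hedged: the wav paths are
# placeholders and the import paths are assumptions that may differ
# across shennong versions).
def _demo_process_all():
    from shennong.audio import Audio
    from shennong.features.processor.mfcc import MfccProcessor

    # keys are item names, values are the audio signals to process
    signals = {
        'utt1': Audio.load('/path/to/utt1.wav'),
        'utt2': Audio.load('/path/to/utt2.wav')}

    # compute MFCCs in parallel; output keys match the input keys
    features = MfccProcessor().process_all(signals, njobs=2)
    assert sorted(features.keys()) == ['utt1', 'utt2']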
def analyze_serializer(features, ext, output_dir):
    with tempfile.TemporaryDirectory(dir=output_dir) as tmpdir:
        filename = os.path.join(tmpdir, 'features' + ext)

        print('writing {}...'.format(filename))
        t1 = datetime.datetime.now()
        features.save(filename)
        t2 = datetime.datetime.now()
        t_write = t2 - t1
        print('took {}'.format(t_write))

        f_size = os.path.getsize(filename)
        print('filesize: {}'.format(sizeof_fmt(f_size)))

        print('reading {}...'.format(filename))
        t1 = datetime.datetime.now()
        features2 = FeaturesCollection.load(filename)
        t2 = datetime.datetime.now()
        t_read = t2 - t1
        print('took {}'.format(t_read))
        print('rw equality: {}'.format(features2 == features))

    return (t_write, t_read, f_size)
def test_partition():
    f1 = Features(np.random.random((10, 2)), np.ones((10,)))
    f2 = Features(np.random.random((5, 2)), np.ones((5,)))
    f3 = Features(np.random.random((5, 2)), np.ones((5,)))
    fc = FeaturesCollection(f1=f1, f2=f2, f3=f3)

    with pytest.raises(ValueError) as err:
        fc.partition({'f1': 'p1', 'f2': 'p1'})
    assert ('following items are not defined in the partition index: f3'
            in str(err.value))

    fp = fc.partition({'f1': 'p1', 'f2': 'p1', 'f3': 'p2'})
    assert sorted(fp.keys()) == ['p1', 'p2']
    assert sorted(fp['p1'].keys()) == ['f1', 'f2']
    assert sorted(fp['p2'].keys()) == ['f3']

    assert fc.is_valid()
    for fc in fp.values():
        assert fc.is_valid()
def test_collection(mfcc):
    assert FeaturesCollection._value_type is Features
    assert FeaturesCollection().is_valid()
    assert FeaturesCollection(mfcc=mfcc).is_valid()
    assert not FeaturesCollection(
        mfcc=Features(np.asarray([0]), 0, validate=False)).is_valid()
def test_extract_features_full(ext, wav_file, wav_file_8k, wav_file_float32,
                               capsys, tmpdir):
    # difficult case with parallel jobs, different sampling rates,
    # speakers and segments
    index = [
        ('u1', wav_file, 's1', 0, 1),
        ('u2', wav_file_float32, 's2', 1, 1.2),
        ('u3', wav_file_8k, 's1', 1, 3)]
    config = pipeline.get_default_config('mfcc')

    # disable VAD because it can alter the cmvn result (far from
    # (0, 1) when the signal includes non-voiced frames)
    config['cmvn']['with_vad'] = False

    feats = pipeline.extract_features(
        config, index, njobs=2, log=utils.get_logger())

    # ensure we have the expected log messages
    messages = capsys.readouterr().err
    assert 'INFO - get 3 utterances from 2 speakers in 3 wavs' in messages
    assert 'WARNING - several sample rates found in wav files' in messages

    for utt in ('u1', 'u2', 'u3'):
        assert utt in feats
        assert feats[utt].dtype == np.float32

    # check properties
    p1 = feats['u1'].properties
    p2 = feats['u2'].properties
    p3 = feats['u3'].properties
    assert p1['audio']['file'] == wav_file
    assert p1['audio']['duration'] == 1.0
    assert p2['audio']['file'] == wav_file_float32
    assert p2['audio']['duration'] == pytest.approx(0.2)
    assert p3['audio']['file'] == wav_file_8k
    assert p3['audio']['duration'] < 0.5  # ask 3s but get duration - tstart
    assert p1['mfcc'] == p2['mfcc']
    assert p1['mfcc']['sample_rate'] != p3['mfcc']['sample_rate']
    assert p1.keys() == {
        'audio', 'mfcc', 'cmvn', 'pitch', 'delta', 'speaker', 'pipeline'}
    assert p1.keys() == p2.keys() == p3.keys()
    assert p1['pipeline'] == p2['pipeline'] == p3['pipeline']

    # check shapes: mfcc * delta + pitch = 13 * 3 + 3 = 42 columns
    assert feats['u1'].shape == (98, 42)
    assert feats['u2'].shape == (18, 42)
    assert feats['u3'].shape == (40, 42)

    # check cmvn
    assert feats['u2'].data[:, :13].mean() == pytest.approx(0.0, abs=1e-6)
    assert feats['u2'].data[:, :13].std() == pytest.approx(1.0, abs=1e-6)

    data = np.vstack((feats['u1'].data[:, :13], feats['u3'].data[:, :13]))
    assert data.mean() == pytest.approx(0.0, abs=1e-6)
    assert data.std() == pytest.approx(1.0, abs=1e-6)
    assert np.abs(data.mean()) <= np.abs(feats['u1'].data[:, :13].mean())
    assert np.abs(data.std() - 1.0) <= np.abs(
        feats['u1'].data[:, :13].std() - 1.0)
    assert np.abs(data.mean()) <= np.abs(feats['u3'].data[:, :13].mean())
    assert np.abs(data.std() - 1.0) <= np.abs(
        feats['u3'].data[:, :13].std() - 1.0)

    # save / load the features
    filename = str(tmpdir.join('feats' + ext))
    feats.save(filename)
    feats2 = FeaturesCollection.load(filename)
    assert feats2 == feats
@pytest.fixture
def mfcc_col(mfcc):
    return FeaturesCollection(mfcc=mfcc)
def apply_cmvn(feats_collection, by_collection=True, norm_vars=True,
               weights=None, skip_dims=None):
    """CMVN normalization of a collection of features

    This function is a simple wrapper on the class
    :class:`~shennong.features.CmvnPostProcessor` that accumulates and
    applies CMVN statistics over a whole collection of features.

    Warnings
    --------
    The features in the collection must have the same dimensionality.
    It is assumed they are all extracted from the same processor. If
    this is not the case, a ValueError is raised.

    Parameters
    ----------
    feats_collection : :class:`~shennong.FeaturesCollection`
        The collection of features on which to apply CMVN
        normalization. Each features in the collection is assumed to
        have consistent dimensions.
    by_collection : bool, optional
        When True, accumulate and apply CMVN over the entire
        collection. When False, do it independently for each features
        in the collection. Default to True.
    norm_vars : bool, optional
        If False, do not apply variance normalization (only mean),
        default to True.
    weights : dict of arrays, optional
        For each features in the collection, an array of weights to
        apply on the features frames. If specified we must have
        ``weights.keys() == feats_collection.keys()`` (see
        :func:`CmvnPostProcessor.accumulate`). Unweighted by default.
    skip_dims : list of integers, optional
        The dimensions for which to skip the normalization (see
        :func:`CmvnPostProcessor.process`). Default is to normalize
        all dimensions.

    Returns
    -------
    cmvn_feats_collection : :class:`~shennong.features.FeaturesCollection`

    Raises
    ------
    ValueError
        If something goes wrong during CMVN processing.

    """
    # extract the features dimension
    dim = set(f.ndims for f in feats_collection.values())
    if not len(dim) == 1:
        raise ValueError(
            'features in the collection must have consistent dimensions '
            'but dimensions are: {}'.format(sorted(dim)))
    dim = list(dim)[0]

    # check weights
    if weights is not None and weights.keys() != feats_collection.keys():
        raise ValueError('keys differ for weights and features collection')

    # check skip_dims
    if skip_dims is not None:
        sdmin, sdmax = min(skip_dims), max(skip_dims)
        if sdmin < 0 or sdmax >= dim:
            raise ValueError(
                'out of bounds dimensions in skip_dims, must be in [0, {}] '
                'but are in [{}, {}]'.format(dim - 1, sdmin, sdmax))

    if by_collection:
        # accumulate CMVN stats over the whole collection
        cmvn = CmvnPostProcessor(dim)
        for k, f in feats_collection.items():
            w = weights[k] if weights is not None else None
            cmvn.accumulate(f, weights=w)

        # apply CMVN stats
        return FeaturesCollection({
            k: cmvn.process(f, norm_vars=norm_vars, skip_dims=skip_dims)
            for k, f in feats_collection.items()})

    else:
        # independently for each features in the collection, accumulate
        # and apply CMVN stats
        cmvn_collection = FeaturesCollection()
        for k, f in feats_collection.items():
            cmvn = CmvnPostProcessor(f.ndims)
            cmvn.accumulate(
                f, weights=weights[k] if weights is not None else None)
            cmvn_collection[k] = cmvn.process(
                f, norm_vars=norm_vars, skip_dims=skip_dims)
        return cmvn_collection
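# A minimal usage sketch for apply_cmvn on a small random collection
# (hedged: the import path below is an assumption and may differ across
# shennong versions).
def _demo_apply_cmvn():
    import numpy as np
    from shennong.features import Features, FeaturesCollection

    # two items with different nframes but the same dimension, as
    # required by apply_cmvn
    feats = FeaturesCollection(
        u1=Features(np.random.random((10, 5)), np.arange(10)),
        u2=Features(np.random.random((20, 5)), np.arange(20)))

    # normalize over the whole collection: the stacked normalized data
    # has zero mean and unit variance on each dimension
    cmvn_feats = apply_cmvn(feats, by_collection=True)
    data = np.vstack([f.data for f in cmvn_feats.values()])
    assert np.allclose(data.mean(axis=0), 0, atol=1e-6)
    assert np.allclose(data.std(axis=0), 1, atol=1e-6)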