Example #1
def test_kaldiserializer_badfile(tmpdir, mfcc_col, missing):
    filename = str(tmpdir.join('foo.ark'))
    mfcc_col.save(filename)
    os.remove(str(tmpdir.join(missing)))
    with pytest.raises(IOError) as err:
        FeaturesCollection.load(filename)
    assert 'file not found: {}'.format(str(tmpdir.join(missing))) in str(err.value)
Example #2
def mfcc_utf8(mfcc):
    props = mfcc.properties
    props['comments'] = '使用人口について正確な統計はないが、日本国'

    feats = FeaturesCollection()
    feats['æðÐ'] = Features(mfcc.data, mfcc.times, props)
    return feats
Example #3
def test_save_invalid(tmpdir, mfcc):
    f = str(tmpdir.join('foo.json'))
    h = serializers.get_serializer(FeaturesCollection, f, None)
    feats = FeaturesCollection(
        mfcc=Features(data=mfcc.data, times=0, validate=False))
    with pytest.raises(ValueError) as err:
        h.save(feats)
    assert 'features are not valid' in str(err.value)
Example #4
def test_apply_baddim(features_collection):
    feats = FeaturesCollection(
        {k: v.copy() for k, v in features_collection.items()})
    feats['new'] = Features(
        np.random.random((2, 1)), np.asarray([0, 1]))

    with pytest.raises(ValueError) as err:
        apply_cmvn(feats)
    assert 'must have consistent dimensions' in str(err.value)
Example #5
def features_collection():
    # build a collection of 3 random features sharing the same ndims
    # but with various nframes
    dim = 10
    feats = FeaturesCollection()
    for n in range(3):
        nframes = np.random.randint(5, 15)
        feats[str(n)] = Features(np.random.random((nframes, dim)),
                                 np.arange(0, nframes))
    return feats
Example #6
def test_collection_isclose():
    f1 = Features(np.random.random((10, 2)), np.ones((10, )))
    f2 = Features(np.random.random((10, 2)), np.ones((10, )))

    fc1 = FeaturesCollection(f1=f1, f2=f2)
    fc2 = FeaturesCollection(f1=f1, f2=Features(f2.data + 1, f2.times))
    fc3 = FeaturesCollection(f1=f1, f3=f2)

    assert fc1.is_close(fc1)
    assert not fc1.is_close(fc2)
    assert fc1.is_close(fc2, atol=1)
    assert not fc1.is_close(fc3)
Example #7
def test_kaldiserializer(mfcc_col, tmpdir, scp):
    mfcc_col.save(str(tmpdir.join('foo.ark')), scp=scp)
    assert os.path.isfile(str(tmpdir.join('foo.ark')))
    assert os.path.isfile(str(tmpdir.join('foo.times.ark')))
    assert os.path.isfile(str(tmpdir.join('foo.properties.json')))
    if scp:
        assert os.path.isfile(str(tmpdir.join('foo.scp')))
        assert os.path.isfile(str(tmpdir.join('foo.times.scp')))

    mfcc_col2 = FeaturesCollection.load(str(tmpdir.join('foo.ark')))

    assert mfcc_col2 == mfcc_col
Example #8
def test_times_1d(serializer, tmpdir):
    filename = ('feats.ark'
                if serializer is serializers.KaldiSerializer else 'feats')
    tmpfile = str(tmpdir.join(filename))

    p = MfccProcessor()
    times = p.times(10)[:, 1]
    assert times.shape == (10, )

    col = FeaturesCollection(mfcc=Features(np.random.random((10, 5)), times))

    serializer(col.__class__, tmpfile).save(col)
    col2 = serializer(col.__class__, tmpfile).load()
    assert col == col2
Example #9
def _extract_features(config, utterances, njobs=1, log=get_logger()):
    # the manager will instantiate the pipeline components
    manager = _Manager(config, utterances, log=log)

    # verbosity level for joblib: no joblib verbosity on debug level
    # (level <= 10) because each step is already detailed in the inner
    # loops
    verbose = 8 if log.getEffectiveLevel() > 10 else 0

    # cmvn: two passes. The first extracts features and pitch and
    # accumulates CMVN statistics, the second applies CMVN and
    # extracts the deltas
    if 'cmvn' in config:
        # extract features and pitch, accumulate cmvn stats
        pass_one = _Parallel(
            'features extraction, pass 1', log,
            n_jobs=njobs, verbose=verbose, prefer='threads')(
                joblib.delayed(_extract_pass_one)(
                    utterance, manager, log=log) for utterance in utterances)

        # apply cmvn and extract deltas
        features = FeaturesCollection(**{k: v for k, v in _Parallel(
            'features extraction, pass 2', log,
            n_jobs=njobs, verbose=verbose, prefer='threads')(
                joblib.delayed(_extract_pass_two)(
                    utterance, manager, features, pitch, log=log)
                for utterance, features, pitch in pass_one)})

    # no cmvn: single pass
    else:
        features = FeaturesCollection(**{k: v for k, v in _Parallel(
            'features extraction', log,
            n_jobs=njobs, verbose=verbose, prefer='threads')(
                joblib.delayed(_extract_single_pass)(
                    utterance, manager, log=log) for utterance in utterances)})

    return features
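
The ``FeaturesCollection(**{k: v for k, v in _Parallel(...)})`` idiom above fans utterances out to parallel jobs and gathers the returned (name, features) pairs back into a collection. Here is a minimal, self-contained sketch of the same joblib pattern, with a purely hypothetical ``square`` job standing in for the extraction step:

# minimal sketch of the fan-out/gather pattern: run a per-item job in
# parallel threads and collect the (name, result) pairs into a dict;
# the 'square' helper is illustrative only, not part of shennong
import joblib

def square(name, value):
    return name, value ** 2

results = dict(
    joblib.Parallel(n_jobs=2, verbose=0, prefer='threads')(
        joblib.delayed(square)(name, value)
        for name, value in {'a': 2, 'b': 3}.items()))
assert results == {'a': 4, 'b': 9}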
Example #10
def test_kaldiserializer_baditems(tmpdir, mfcc_col):
    mfcc_col2 = FeaturesCollection(one=mfcc_col['mfcc'], two=mfcc_col['mfcc'])
    mfcc_col.save(str(tmpdir.join('one.ark')))
    mfcc_col2.save(str(tmpdir.join('two.ark')))

    os.remove(str(tmpdir.join('two.times.ark')))
    shutil.copyfile(str(tmpdir.join('one.times.ark')),
                    str(tmpdir.join('two.times.ark')))
    with pytest.raises(ValueError) as err:
        FeaturesCollection.load(str(tmpdir.join('two.ark')))
    assert 'items differ in data and times' in str(err.value)

    os.remove(str(tmpdir.join('one.properties.json')))
    shutil.copyfile(str(tmpdir.join('two.properties.json')),
                    str(tmpdir.join('one.properties.json')))
    with pytest.raises(ValueError) as err:
        FeaturesCollection.load(str(tmpdir.join('one.ark')))
    assert 'items differ in data and properties' in str(err.value)
Example #11
def test_heterogeneous(mfcc, serializer, tmpdir):
    mfcc_col = FeaturesCollection(mfcc32=mfcc,
                                  mfcc64=mfcc.copy(dtype=np.float64))

    filename = ('feats.ark'
                if serializer is serializers.KaldiSerializer else 'feats')
    h = serializer(mfcc_col.__class__, str(tmpdir.join(filename)))

    # h5features doesn't support heterogeneous data
    if serializer is serializers.H5featuresSerializer:
        with pytest.raises(IOError) as err:
            h.save(mfcc_col)
        assert 'data is not appendable to the group' in str(err.value)
    else:
        h.save(mfcc_col)
        mfcc2 = h.load()
        assert mfcc2 == mfcc_col
Example #12
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('data_dir', help='input directory with wavs')
    parser.add_argument(
        'output_dir',
        default='/tmp',
        nargs='?',
        help='output directory (created files are deleted at exit)')

    args = parser.parse_args()

    # load audio data and compute total duration
    audio_data = {
        os.path.basename(f): Audio.load(f)
        for f in list_files_with_extension(args.data_dir, '.wav')
    }
    total_duration = datetime.timedelta(
        seconds=int(sum(a.duration for a in audio_data.values())))
    print('found {} wav files, total duration of {}'.format(
        len(audio_data), str(total_duration)))

    # compute the features (default MFCC)
    print('computing MFCC features...')
    t1 = datetime.datetime.now()
    processor = MfccProcessor()
    features = FeaturesCollection(
        **{k: processor.process(v)
           for k, v in audio_data.items()})
    t2 = datetime.datetime.now()
    print('took {}'.format(t2 - t1))

    # save the features in all the supported formats
    data = {
        'duration': total_duration,
        'data': {
            ext: analyze_serializer(features, ext, args.output_dir)
            for ext in supported_extensions().keys()
        }
    }

    print_results(data)
Example #13
    def process_all(self, signals, njobs=None):
        """Returns features processed from several input `signals`

        This function processes the features in parallel jobs.

        Parameters
        ----------
        signals: dict of :class:`~shennong.audio.Audio`
            A dictionary of input audio signals to process features
            on, where the keys are item names and the values are audio
            signals.
        njobs: int, optional
            The number of parallel jobs to run in background. Defaults
            to the number of CPU cores available on the machine.

        Returns
        -------
        features: :class:`~shennong.features.features.FeaturesCollection`
            The computed features on each input signal. The keys of
            output `features` are the keys of the input `signals`.

        Raises
        ------
        ValueError
            If the `njobs` parameter is <= 0

        """
        # checks the number of background jobs
        njobs = get_njobs(njobs, log=self._log)

        def _process_one(name, signal):
            return name, self.process(signal)

        return FeaturesCollection(
            **{
                k: v
                for k, v in joblib.Parallel(
                    n_jobs=njobs, verbose=0, backend='threading')(
                        joblib.delayed(_process_one)(name, signal)
                        for name, signal in signals.items())
            })
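
A hedged usage sketch for ``process_all``: the wav paths and import locations below are assumptions (import paths vary across shennong versions), not part of the documented examples:

# usage sketch, assuming two wav files exist at these hypothetical
# paths; import locations may differ between shennong versions
from shennong.audio import Audio
from shennong.processor import MfccProcessor

signals = {
    'utt1': Audio.load('speech/utt1.wav'),  # hypothetical path
    'utt2': Audio.load('speech/utt2.wav'),  # hypothetical path
}

# one Features instance per input signal, computed in parallel
features = MfccProcessor().process_all(signals, njobs=2)
assert sorted(features.keys()) == ['utt1', 'utt2']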
Example #14
def analyze_serializer(features, ext, output_dir):
    with tempfile.TemporaryDirectory(dir=output_dir) as tmpdir:
        filename = os.path.join(tmpdir, 'features' + ext)

        print('writing {}...'.format(filename))
        t1 = datetime.datetime.now()
        features.save(filename)
        t2 = datetime.datetime.now()
        t_write = t2 - t1
        print('took {}'.format(t_write))

        f_size = os.path.getsize(filename)
        print('filesize: {}'.format(sizeof_fmt(f_size)))

        print('reading {}...'.format(filename))
        t1 = datetime.datetime.now()
        features2 = FeaturesCollection.load(filename)
        t2 = datetime.datetime.now()
        t_read = t2 - t1
        print('took {}'.format(t_read))
        print('rw equality: {}'.format(features2 == features))

        return (t_write, t_read, f_size)
Example #15
def test_partition():
    f1 = Features(np.random.random((10, 2)), np.ones((10, )))
    f2 = Features(np.random.random((5, 2)), np.ones((5, )))
    f3 = Features(np.random.random((5, 2)), np.ones((5, )))
    fc = FeaturesCollection(f1=f1, f2=f2, f3=f3)

    with pytest.raises(ValueError) as err:
        fc.partition({'f1': 'p1', 'f2': 'p1'})
    assert ('following items are not defined in the partition index: f3'
            in str(err.value))

    fp = fc.partition({'f1': 'p1', 'f2': 'p1', 'f3': 'p2'})
    assert sorted(fp.keys()) == ['p1', 'p2']
    assert sorted(fp['p1'].keys()) == ['f1', 'f2']
    assert sorted(fp['p2'].keys()) == ['f3']

    assert fc.is_valid()
    for fc in fp.values():
        assert fc.is_valid()
Example #16
def test_collection(mfcc):
    assert FeaturesCollection._value_type is Features
    assert FeaturesCollection().is_valid()
    assert FeaturesCollection(mfcc=mfcc).is_valid()
    assert not FeaturesCollection(
        mfcc=Features(np.asarray([0]), 0, validate=False)).is_valid()
Example #17
def test_extract_features_full(ext, wav_file, wav_file_8k, wav_file_float32,
                               capsys, tmpdir):
    # difficult case with parallel jobs, different sampling rates,
    # speakers and segments
    index = [('u1', wav_file, 's1', 0, 1),
             ('u2', wav_file_float32, 's2', 1, 1.2),
             ('u3', wav_file_8k, 's1', 1, 3)]
    config = pipeline.get_default_config('mfcc')

    # disable VAD because it can alter the cmvn result (far from (0,
    # 1) when the signal includes non-voiced frames)
    config['cmvn']['with_vad'] = False

    feats = pipeline.extract_features(config,
                                      index,
                                      njobs=2,
                                      log=utils.get_logger())

    # ensure we have the expected log messages
    messages = capsys.readouterr().err
    assert 'INFO - get 3 utterances from 2 speakers in 3 wavs' in messages
    assert 'WARNING - several sample rates found in wav files' in messages

    for utt in ('u1', 'u2', 'u3'):
        assert utt in feats
        assert feats[utt].dtype == np.float32

    # check properties
    p1 = feats['u1'].properties
    p2 = feats['u2'].properties
    p3 = feats['u3'].properties
    assert p1['audio']['file'] == wav_file
    assert p1['audio']['duration'] == 1.0
    assert p2['audio']['file'] == wav_file_float32
    assert p2['audio']['duration'] == pytest.approx(0.2)
    assert p3['audio']['file'] == wav_file_8k
    assert p3['audio']['duration'] < 0.5  # asked 3s but get duration - tstart
    assert p1['mfcc'] == p2['mfcc']
    assert p1['mfcc']['sample_rate'] != p3['mfcc']['sample_rate']
    assert p1.keys() == {
        'audio', 'mfcc', 'cmvn', 'pitch', 'delta', 'speaker', 'pipeline'
    }
    assert p1.keys() == p2.keys() == p3.keys()
    assert p1['pipeline'] == p2['pipeline'] == p3['pipeline']

    # check shape. mfcc*delta + pitch = 13 * 3 + 3 = 42
    assert feats['u1'].shape == (98, 42)
    assert feats['u2'].shape == (18, 42)
    assert feats['u3'].shape == (40, 42)

    # check cmvn
    assert feats['u2'].data[:, :13].mean() == pytest.approx(0.0, abs=1e-6)
    assert feats['u2'].data[:, :13].std() == pytest.approx(1.0, abs=1e-6)

    data = np.vstack((feats['u1'].data[:, :13], feats['u3'].data[:, :13]))
    assert data.mean() == pytest.approx(0.0, abs=1e-6)
    assert data.std() == pytest.approx(1.0, abs=1e-6)
    assert np.abs(data.mean()) <= np.abs(feats['u1'].data[:, :13].mean())
    assert np.abs(data.std() - 1.0) <= np.abs(feats['u1'].data[:, :13].std() -
                                              1.0)
    assert np.abs(data.mean()) <= np.abs(feats['u3'].data[:, :13].mean())
    assert np.abs(data.std() - 1.0) <= np.abs(feats['u3'].data[:, :13].std() -
                                              1.0)

    # save / load the features
    filename = str(tmpdir.join('feats' + ext))
    feats.save(filename)
    feats2 = FeaturesCollection.load(filename)
    assert feats2 == feats
Example #18
def mfcc_col(mfcc):
    return FeaturesCollection(mfcc=mfcc)
Example #19
def apply_cmvn(feats_collection,
               by_collection=True,
               norm_vars=True,
               weights=None,
               skip_dims=None):
    """CMVN normalization of a collection of features

    This function is a simple wrapper around the class
    :class:`~shennong.features.CmvnPostProcessor` that accumulates and
    applies CMVN statistics over a whole collection of features.

    Warnings
    --------
    The features in the collection must have the same
    dimensionality. It is assumed they are all extracted from the same
    processor. If this is not the case, a ValueError is raised.

    Parameters
    ----------
    feats_collection : :class:`~shennong.FeaturesCollection`
        The collection of features on which to apply CMVN
        normalization. All features in the collection are assumed to
        have consistent dimensions.

    by_collection : bool, optional
        When True, accumulate and apply CMVN over the entire
        collection. When False, do it independently for each item in
        the collection. Defaults to True.

    norm_vars : bool, optional
        If False, do not apply variance normalization (mean only).
        Defaults to True.

    weights : dict of arrays, optional
        For each item in the collection, an array of weights to apply
        on the features frames. If specified, we must have
        ``weights.keys() == feats_collection.keys()`` (see
        :func:`CmvnPostProcessor.accumulate`). Unweighted by default.

    skip_dims : list of int, optional
        The dimensions for which to skip the normalization (see
        :func:`CmvnPostProcessor.process`). Default is to normalize
        all dimensions.

    Returns
    -------
    cmvn_feats_collection : :class:`~shennong.features.FeaturesCollection`

    Raises
    ------
    ValueError
        If something goes wrong during CMVN processing.

    """
    # extract the features dimension
    dim = set(f.ndims for f in feats_collection.values())
    if not len(dim) == 1:
        raise ValueError(
            'features in the collection must have consistent dimensions '
            'but dimensions are: {}'.format(sorted(dim)))
    dim = list(dim)[0]

    # check weights
    if weights is not None and weights.keys() != feats_collection.keys():
        raise ValueError('keys differ for weights and features collection')

    # check skip_dims
    if skip_dims is not None:
        sdmin, sdmax = min(skip_dims), max(skip_dims)
        if sdmin < 0 or sdmax >= dim:
            raise ValueError(
                'out of bounds dimensions in skip_dims, must be in [0, {}] '
                'but are in [{}, {}]'.format(dim - 1, sdmin, sdmax))

    if by_collection:
        # accumulate CMVN stats over the whole collection
        cmvn = CmvnPostProcessor(dim)
        for k, f in feats_collection.items():
            w = weights[k] if weights is not None else None
            cmvn.accumulate(f, weights=w)

        # apply CMVN stats
        return FeaturesCollection({
            k: cmvn.process(f, norm_vars=norm_vars, skip_dims=skip_dims)
            for k, f in feats_collection.items()
        })
    else:
        # independently for each item in the collection, accumulate
        # and apply the CMVN stats
        cmvn_collection = FeaturesCollection()
        for k, f in feats_collection.items():
            cmvn = CmvnPostProcessor(f.ndims)
            cmvn.accumulate(
                f, weights=weights[k] if weights is not None else None)
            cmvn_collection[k] = cmvn.process(f,
                                              norm_vars=norm_vars,
                                              skip_dims=skip_dims)

        return cmvn_collection
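
A minimal usage sketch for ``apply_cmvn``, reusing the ``Features`` and ``FeaturesCollection`` classes from the examples above:

# build a small collection of random features sharing the same
# dimension, then normalize it two ways
import numpy as np

feats = FeaturesCollection(
    a=Features(np.random.random((10, 5)), np.arange(10)),
    b=Features(np.random.random((12, 5)), np.arange(12)))

# accumulate and apply CMVN statistics over the whole collection
normed = apply_cmvn(feats)

# or normalize each item independently, with mean normalization only
normed_alone = apply_cmvn(feats, by_collection=False, norm_vars=False)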