def test_energy(audio, mfcc):
    """VAD computed from energy equals VAD computed from MFCC.

    Only data and times are compared: both results are rebuilt as
    `Features` without their properties before the equality check.
    """
    energy = EnergyProcessor().process(audio)
    from_energy = VadPostProcessor().process(energy)
    from_mfcc = VadPostProcessor().process(mfcc)

    # rebuild the features from data and times only, so the properties
    # do not take part in the comparison
    bare_energy = Features(from_energy.data, from_energy.times)
    bare_mfcc = Features(from_mfcc.data, from_mfcc.times)
    assert bare_energy == bare_mfcc
def test_tofrom_dict(mfcc):
    """Round-trip features through a dict and reject incomplete dicts."""
    a = mfcc._to_dict()
    b = Features._from_dict(a)
    assert b == mfcc

    # a dict without the 'times' entry must be rejected
    with pytest.raises(ValueError) as err:
        Features._from_dict(
            {'data': a['data'], 'properties': a['properties']})
    # check the exception message itself, not the ExceptionInfo wrapper
    assert 'missing keys: times' in str(err.value)
def test_1d_times_sorted():
    """Features built on 1d timestamps taken from a processor are valid."""
    nframes, ndims = 10, 5
    data = np.random.random((nframes, ndims))

    # the processor gives 2 timestamps per frame
    times = MfccProcessor().times(nframes)
    assert times.shape == (nframes, 2)

    # keep only the second column to get 1d times, validation is
    # skipped at construction and requested explicitly afterwards
    feats = Features(data, times[:, 1], validate=False)
    assert feats.is_valid()
def test_collection_isclose():
    """Check `FeaturesCollection.is_close` on equal, shifted and renamed items."""
    first = Features(np.random.random((10, 2)), np.ones((10, )))
    second = Features(np.random.random((10, 2)), np.ones((10, )))
    shifted = Features(second.data + 1, second.times)

    reference = FeaturesCollection(f1=first, f2=second)
    offset_by_one = FeaturesCollection(f1=first, f2=shifted)
    other_names = FeaturesCollection(f1=first, f3=second)

    assert reference.is_close(reference)

    # one item has its data shifted by 1: close only with atol=1
    assert not reference.is_close(offset_by_one)
    assert reference.is_close(offset_by_one, atol=1)

    # same features but stored under a different name
    assert not reference.is_close(other_names)
def mfcc_utf8(mfcc):
    """Return a collection with a non-ASCII item name and properties.

    The collection holds a single item named 'æðÐ', built from the data
    and times of `mfcc`, with a non-ASCII 'comments' entry added to its
    properties.

    Parameters
    ----------
    mfcc : Features
        The features to take data, times and properties from. They are
        left unmodified.

    Returns
    -------
    feats : FeaturesCollection
        The collection with the single item 'æðÐ'.

    """
    # work on a copy of the properties: adding the comment directly to
    # `mfcc.properties` would alter the input features as well
    props = dict(mfcc.properties)
    props['comments'] = '使用人口について正確な統計はないが、日本国'

    feats = FeaturesCollection()
    feats['æðÐ'] = Features(mfcc.data, mfcc.times, props)
    return feats
def process(self, features):
    """Detects voiced frames (VAD) in the input `features`

    Parameters
    ----------
    features : :class:`~shennong.features.features.Features`, shape = [n,m]
        The speech features to run the voice activity detection on.
        Their first coefficient must be a log-energy (or
        equivalent). Works well with
        :class:`~shennong.features.processor.mfcc.MfccProcessor` and
        :class:`~shennong.features.processor.plp.PlpProcessor`.

    Returns
    -------
    vad : :class:`~shennong.features.features.Features`, shape = [n,1]
        The vad features, of dtype uint8, with 1 for voiced frames
        and 0 for unvoiced frames.

    """
    # run the Kaldi energy-based VAD on a view of the features data
    kaldi_input = kaldi.matrix.SubMatrix(features.data)
    decisions = kaldi.ivector.compute_vad_energy(self._options, kaldi_input)

    # back to numpy, as a single column of uint8
    voiced = kaldi.matrix.SubVector(decisions).numpy().astype(np.uint8)
    return Features(
        np.atleast_2d(voiced).T,
        features.times,
        properties=self.get_properties(features))
def _process(self, cls, signal, vtln_warp):
    """Inner process method common to all Kaldi Mel processors

    Parameters
    ----------
    cls : class
        The Kaldi features class, instantiated with the processor
        options and whose `compute` method is called on the signal.
    signal : Audio
        The input audio signal, must be mono and have the same
        sample rate as the processor.
    vtln_warp : float
        The VTLN warping factor forwarded to `compute`.

    Returns
    -------
    features : Features
        The computed features, one row per frame.

    Raises
    ------
    ValueError
        If the input `signal` has more than one channel or if
        `sample_rate` != `signal.sample_rate`.

    """
    # the signal must be mono...
    nchannels = signal.nchannels
    if nchannels != 1:
        raise ValueError(
            'signal must have one dimension, but it has {}'
            .format(nchannels))

    # ... and sampled at the rate the processor expects
    if self.sample_rate != signal.sample_rate:
        message = (
            'processor and signal mismatch in sample rates: '
            '{} != {}'.format(self.sample_rate, signal.sample_rate))
        raise ValueError(message)

    # the options are assigned by copy and not by reference, so they
    # must be forwarded to Kaldi here (if the user does 'p =
    # Processor(); p.dither = 0', it takes effect at this point)
    self._options.frame_opts = self._frame_options
    self._options.mel_opts = self._mel_options

    # force 16 bits integers
    samples = signal.astype(np.int16).data

    computer = cls(self._options)
    computed = computer.compute(kaldi.matrix.SubVector(samples), vtln_warp)
    data = kaldi.matrix.SubMatrix(computed).numpy()

    return Features(
        data,
        self.times(data.shape[0]),
        properties=self.get_properties())
def test_save_invalid(tmpdir, mfcc):
    """Saving a collection that holds invalid features raises ValueError."""
    target = str(tmpdir.join('foo.json'))

    # scalar times with validation disabled at construction
    invalid = Features(data=mfcc.data, times=0, validate=False)
    collection = FeaturesCollection(mfcc=invalid)

    handler = serializers.get_serializer(FeaturesCollection, target, None)
    with pytest.raises(ValueError) as err:
        handler.save(collection)
    assert 'features are not valid' in str(err.value)
def test_apply_baddim(features_collection):
    """CMVN on features with inconsistent dimensions raises ValueError."""
    # work on copies of the items of the input collection
    copies = {
        name: item.copy() for name, item in features_collection.items()}
    feats = FeaturesCollection(copies)

    # add an item made of 2 frames with a single column
    feats['new'] = Features(np.random.random((2, 1)), np.asarray([0, 1]))

    with pytest.raises(ValueError) as err:
        apply_cmvn(feats)
    assert 'must have consistent dimensions' in str(err.value)
def features_collection():
    """Build a collection of 3 random features.

    The items are named '0', '1' and '2'. They all have 10 dimensions
    and a number of frames randomly drawn in [5, 15). The times of an
    item are the integers from 0 to its number of frames (excluded).
    """
    ndims = 10
    collection = FeaturesCollection()
    for name in ('0', '1', '2'):
        nframes = np.random.randint(5, 15)
        collection[name] = Features(
            np.random.random((nframes, ndims)), np.arange(nframes))
    return collection
def test_partition():
    """Partition a collection and reject an incomplete partition index."""
    f1 = Features(np.random.random((10, 2)), np.ones((10, )))
    f2 = Features(np.random.random((5, 2)), np.ones((5, )))
    f3 = Features(np.random.random((5, 2)), np.ones((5, )))
    fc = FeaturesCollection(f1=f1, f2=f2, f3=f3)

    # every item of the collection must appear in the index
    with pytest.raises(ValueError) as err:
        fc.partition({'f1': 'p1', 'f2': 'p1'})
    # check the exception message itself, not the ExceptionInfo wrapper
    assert ('following items are not defined in the partition index: f3'
            in str(err.value))

    fp = fc.partition({'f1': 'p1', 'f2': 'p1', 'f3': 'p2'})
    assert sorted(fp.keys()) == ['p1', 'p2']
    assert sorted(fp['p1'].keys()) == ['f1', 'f2']
    assert sorted(fp['p2'].keys()) == ['f3']

    # both the original collection and each partition are valid (the
    # loop variable has its own name so `fc` is not rebound)
    assert fc.is_valid()
    for part in fp.values():
        assert part.is_valid()
def process(self, signal, vtln_warp=1.0):
    """Compute a spectrogram with the specified options

    An optional feature-level vocal tract length normalization
    (VTLN) is done when `vtln_warp` != 1.0.

    Parameters
    ----------
    signal : Audio, shape = [nsamples, 1]
        The input audio signal to compute the features on, must be
        mono
    vtln_warp : float, optional
        The VTLN warping factor to be applied when computing
        features. Default to 1.0, meaning no warping is done.

    Returns
    -------
    features : `Features`, shape = [nframes, `ndims`]
        The computed features, with as many rows as there are
        frames (depends on the specified options `frame_shift`
        and `frame_length`).

    Raises
    ------
    ValueError
        If the input `signal` has more than one channel (i.e. is
        not mono). If `sample_rate` != `signal.sample_rate`.

    """
    # the signal must be mono...
    nchannels = signal.nchannels
    if nchannels != 1:
        raise ValueError(
            'signal must have one dimension, but it has {}'
            .format(nchannels))

    # ... and sampled at the rate the processor expects
    if self.sample_rate != signal.sample_rate:
        message = (
            'processor and signal mismatch in sample rates: '
            '{} != {}'.format(self.sample_rate, signal.sample_rate))
        raise ValueError(message)

    # the options are assigned by copy and not by reference, so they
    # must be forwarded to Kaldi here (if the user does 'p =
    # Processor(); p.dither = 0', it takes effect at this point)
    self._options.frame_opts = self._frame_options

    # force 16 bits integers
    samples = signal.astype(np.int16).data

    computer = kaldi.feat.spectrogram.Spectrogram(self._options)
    computed = computer.compute(kaldi.matrix.SubVector(samples), vtln_warp)
    data = kaldi.matrix.SubMatrix(computed).numpy()

    return Features(
        data,
        self.times(data.shape[0]),
        properties=self.get_properties())
def test_post_pitch(raw_pitch):
    """Post-process a raw pitch and reject inputs without 2 columns."""
    post_processor = PitchPostProcessor()
    params = post_processor.get_params()

    data = post_processor.process(raw_pitch)
    assert data.shape[1] == 3
    assert raw_pitch.shape[0] == data.shape[0]
    assert np.array_equal(raw_pitch.times, data.times)
    # processing leaves the parameters untouched
    assert params == post_processor.get_params()

    # inputs with 1 or 3 columns instead of 2 are rejected
    cases = (
        (1, 'data shape must be (_, 2), but it is (_, 1)'),
        (3, 'data shape must be (_, 2), but it is (_, 3)'))
    for ncols, message in cases:
        bad_pitch = Features(
            np.random.random((raw_pitch.nframes, ncols)), raw_pitch.times)
        with pytest.raises(ValueError) as err:
            post_processor.process(bad_pitch)
        assert message in str(err.value)
def test_concatenate_tolerance(capsys):
    """Concatenate features whose number of frames differ by 2.

    The concatenation fails with a tolerance of 0 or 1. With a
    tolerance of 2 it succeeds, gives 10 frames and logs a warning.
    """
    get_logger(level='info')

    f1 = Features(np.random.random((12, 2)), np.ones((12, )))
    f2 = Features(np.random.random((10, 2)), np.ones((10, )))

    # the exception message itself is checked (err.value), not the
    # ExceptionInfo wrapper
    with pytest.raises(ValueError) as err:
        f1.concatenate(f2, tolerance=0)
    assert 'features have a different number of frames' in str(err.value)

    with pytest.raises(ValueError) as err:
        f1.concatenate(f2, tolerance=1)
    assert ('features differs number of frames, and greater than '
            in str(err.value))

    # within tolerance, in both directions
    f3 = f1.concatenate(f2, tolerance=2)
    assert f3.shape == (10, 4)
    assert 'WARNING' in capsys.readouterr().err

    f3 = f2.concatenate(f1, tolerance=2)
    assert f3.shape == (10, 4)
    assert 'WARNING' in capsys.readouterr().err
def test_concatenate(mfcc):
    """Concatenate features with themselves, reject mismatched times."""
    mfcc2 = mfcc.concatenate(mfcc)
    assert mfcc2.nframes == mfcc.nframes
    assert mfcc2.ndims == mfcc.ndims * 2
    # properties differ as a whole but the 'mfcc' entry is preserved
    assert mfcc2.properties != mfcc.properties
    assert mfcc2.properties['mfcc'] == mfcc.properties['mfcc']

    # features with shifted times cannot be concatenated
    mfcc2 = Features(mfcc.data, mfcc.times + 1)
    with pytest.raises(ValueError) as err:
        mfcc.concatenate(mfcc2)
    # check the exception message itself, not the ExceptionInfo wrapper
    assert 'times are not equal' in str(err.value)
def test_init_bad():
    """Invalid arguments to the `Features` constructor raise ValueError.

    Each case checks the message carried by the exception itself
    (err.value), not the ExceptionInfo wrapper.
    """
    with pytest.raises(ValueError) as err:
        Features(0, 0, properties=0)
    assert 'data must be a numpy array' in str(err.value)

    with pytest.raises(ValueError) as err:
        Features(np.asarray([0]), 0, properties=0)
    assert 'times must be a numpy array' in str(err.value)

    with pytest.raises(ValueError) as err:
        Features(np.asarray([0]), np.asarray([0]), properties=0)
    assert 'properties must be a dictionnary' in str(err.value)

    with pytest.raises(ValueError) as err:
        Features(np.asarray([0]), np.asarray([0]), properties={0: 0})
    assert 'data dimension must be 2' in str(err.value)

    with pytest.raises(ValueError) as err:
        Features(np.asarray([[0], [0]]), np.random.random((2, 2, 2)))
    assert 'times dimension must be 1 or 2' in str(err.value)

    # the data is prepared outside of the `raises` block so that only
    # the constructor is expected to raise
    data = np.random.random((12, 2))
    data[2, 1] = np.nan
    with pytest.raises(ValueError) as err:
        Features(data, np.ones((12, )))
    assert 'data contains non-finite numbers' in str(err.value)
def process(self, alignment):
    """Compute framed one-hot features from an `alignment`

    The alignment is sampled at the sample rate of `self.frame` and
    cut into frames. Each frame is encoded by the one-hot vector of a
    single token: the only token in the frame or, when the frame
    contains several tokens, the one with the biggest cumulated
    window weight.

    Parameters
    ----------
    alignment
        The alignment to encode. It must be accepted by
        `self._token2index` and provide an `at_sample_rate` method.

    Returns
    -------
    features : Features
        Boolean data with one row per frame and one column per token
        index. The times are the frame boundaries divided by the
        sample rate. The mapping 'token2index' is added to the
        processor entry of the properties.

    """
    # build a bijection token <-> onehot index
    token2index = self._token2index(alignment)

    # sample the alignment at the requested sample rate
    sampled = alignment.at_sample_rate(self.frame.sample_rate)

    # get the frames as pairs (istart:istop)
    nframes = self.frame.nframes(sampled.shape[0])
    frame_boundaries = self.frame.boundaries(nframes)

    # allocate the features data (builtin bool is used because the
    # `np.bool` alias is not available in all NumPy versions)
    data = np.zeros(
        (frame_boundaries.shape[0], len(token2index)), dtype=bool)

    # allocate the window function
    window = shennong.features.window.window(
        self.frame.samples_per_frame, type=self.window_type,
        blackman_coeff=self.blackman_coeff)

    for i, (onset, offset) in enumerate(frame_boundaries):
        framed = sampled[onset:offset]
        if np.all(framed[0] == framed[1:]):
            # the frame is made of a single token, no need to compute
            # a window function
            winner = framed[0]
        else:
            # several tokens in the frame, cumulate the window
            # weights of each of them
            weights = collections.defaultdict(int)
            for j, w in enumerate(window):
                weights[framed[j]] += w

            # the winner token has the biggest weight, on ties the
            # first token inserted wins (as with a stable sort)
            winner = max(weights.items(), key=operator.itemgetter(1))[0]

        data[i, token2index[winner]] = 1

    try:
        properties = self.get_properties()
    except ValueError:
        # tokens not defined: temporarily use those of the alignment
        # and make sure they are reset even if the call fails again
        self.tokens = token2index.keys()
        try:
            properties = self.get_properties()
        finally:
            self.tokens = None

    properties[self.name].update({'token2index': token2index})

    return Features(
        data, frame_boundaries / self.frame.sample_rate,
        properties=properties)
def test_times_1d(serializer, tmpdir):
    """Features with 1d times are unchanged by a save/load round trip."""
    # the Kaldi serializer is given a file with the '.ark' extension
    if serializer is serializers.KaldiSerializer:
        filename = 'feats.ark'
    else:
        filename = 'feats'
    tmpfile = str(tmpdir.join(filename))

    # keep a single column of the processor timestamps
    times = MfccProcessor().times(10)[:, 1]
    assert times.shape == (10, )

    feats = Features(np.random.random((10, 5)), times)
    col = FeaturesCollection(mfcc=feats)
    col_type = col.__class__

    serializer(col_type, tmpfile).save(col)
    col2 = serializer(col_type, tmpfile).load()
    assert col == col2
def test_equal(mfcc):
    """Exercise `==` and `is_close` on identical and altered features."""
    props = mfcc.properties

    # an object is equal and close to itself
    assert mfcc == mfcc
    assert mfcc.is_close(mfcc)

    # a copy holds the same content
    clone = mfcc.copy()
    assert mfcc == clone
    assert mfcc.is_close(clone)

    # a different shape is neither equal nor close
    doubled = mfcc.concatenate(mfcc)
    assert not mfcc == doubled
    assert not mfcc.is_close(doubled)

    # a different dtype is not equal but still close
    as_float64 = mfcc.copy(dtype=np.float64)
    assert not mfcc == as_float64
    assert mfcc.is_close(as_float64)

    # different properties are neither equal nor close
    other_props = Features(mfcc.data, mfcc.times, properties={'foo': 0})
    assert not mfcc == other_props
    assert not mfcc.is_close(other_props)

    # different times are never close, even with a tolerance of 1
    other_times = Features(mfcc.data, mfcc.times + 1, properties=props)
    assert not mfcc == other_times
    assert not mfcc.is_close(other_times)
    assert not mfcc.is_close(other_times, atol=1)

    # different data are close only with a tolerance of 1
    other_data = Features(mfcc.data + 1, mfcc.times, properties=props)
    assert not mfcc == other_data
    assert not mfcc.is_close(other_data)
    assert mfcc.is_close(other_data, atol=1)
def process(self, raw_pitch):
    """Post process a raw pitch as specified by the options

    Parameters
    ----------
    raw_pitch : Features, shape = [n, 2]
        The pitch as extracted by the `PitchProcessor.process`
        method

    Returns
    -------
    pitch : Features, shape = [n, 1 2 3 or 4]
        The post-processed pitch usable as speech features. The
        output columns are 'pov_feature', 'normalized_log_pitch',
        'delta_pitch' and 'raw_log_pitch', in that order, if their
        respective options are set to True.

    Raises
    ------
    ValueError
        If `raw_pitch` has not exactly two columns. If all the
        following options are False: 'add_pov_feature',
        'add_normalized_log_pitch', 'add_delta_pitch' and
        'add_raw_log_pitch' (at least one of them must be True).

    """
    # at least one of the output columns must be requested
    requested = (
        self.add_pov_feature,
        self.add_normalized_log_pitch,
        self.add_delta_pitch,
        self.add_raw_log_pitch)
    if not any(requested):
        raise ValueError(
            'at least one of the following options must be True: '
            'add_pov_feature, add_normalized_log_pitch, '
            'add_delta_pitch, add_raw_log_pitch')

    # the raw pitch must have exactly two columns
    ncols = raw_pitch.shape[1]
    if ncols != 2:
        raise ValueError(
            'data shape must be (_, 2), but it is (_, {})'.format(ncols))

    processed = kaldi.feat.pitch.process_pitch(
        self._options, kaldi.matrix.SubMatrix(raw_pitch.data))
    data = kaldi.matrix.SubMatrix(processed).numpy()

    return Features(
        data, raw_pitch.times, properties=self.get_properties(raw_pitch))
def process(self, signal):
    """Extracts the (NCCF, pitch) from a given speech `signal`

    Parameters
    ----------
    signal : Audio
        The speech signal on which to estimate the pitch. Its
        sample rate must match the sample rate specified in the
        `PitchProcessor` options.

    Returns
    -------
    raw_pitch_features : Features, shape = [nframes, 2]
        The output array has as many rows as there are frames
        (depends on the specified options `frame_shift` and
        `frame_length`), and two columns corresponding to (NCCF,
        pitch).

    Raises
    ------
    ValueError
        If the input `signal` has more than one channel (i.e. is
        not mono). If `sample_rate` != `signal.sample_rate`.

    """
    # the signal must be mono...
    nchannels = signal.nchannels
    if nchannels != 1:
        raise ValueError(
            'audio signal must have one channel, but it has {}'
            .format(nchannels))

    # ... and sampled at the rate the processor expects
    if self.sample_rate != signal.sample_rate:
        message = (
            'processor and signal mismatch in sample rates: '
            '{} != {}'.format(self.sample_rate, signal.sample_rate))
        raise ValueError(message)

    # force 16 bits integers
    samples = signal.astype(np.int16).data

    pitch = kaldi.feat.pitch.compute_kaldi_pitch(
        self._options, kaldi.matrix.SubVector(samples))
    data = kaldi.matrix.SubMatrix(pitch).numpy()

    return Features(
        data, self.times(data.shape[0]), properties=self.get_properties())
def test_copy(mfcc):
    """`copy` allocates new members, explicit construction shares them."""
    # a copy is equal to the original but shares nothing with it
    duplicate = mfcc.copy()
    assert duplicate == mfcc
    assert duplicate is not mfcc
    assert duplicate.data is not mfcc.data
    assert duplicate.times is not mfcc.times
    assert duplicate.properties is not mfcc.properties

    # features explicitly built from the members of the original are
    # a new object holding the very same data, times and properties
    shared = Features(
        mfcc.data, mfcc.times,
        properties=mfcc.properties, validate=False)
    assert shared == mfcc
    assert shared is not mfcc
    assert shared.data is mfcc.data
    assert shared.times is mfcc.times
    assert shared.properties is mfcc.properties
def process(self, signal):
    """Compute features on `signal` with the `_rastaplp` method

    Parameters
    ----------
    signal : Audio
        The input audio signal, must be mono and have the same
        sample rate as the processor.

    Returns
    -------
    features : Features
        The transposed output of `_rastaplp`, as float32.

    Raises
    ------
    ValueError
        If the input `signal` has more than one channel or if
        `sample_rate` != `signal.sample_rate`.

    """
    # the signal must be mono...
    nchannels = signal.nchannels
    if nchannels != 1:
        raise ValueError(
            'signal must have one dimension, but it has {}'
            .format(nchannels))

    # ... and sampled at the rate the processor expects
    if self.sample_rate != signal.sample_rate:
        message = (
            'processor and signal mismatch in sample rates: '
            '{} != {}'.format(self.sample_rate, signal.sample_rate))
        raise ValueError(message)

    # extract the features from the signal converted to int16
    extracted = self._rastaplp(signal.astype(np.int16))

    # the features are stored transposed, the number of frames is
    # read on the first axis of the transposed array
    transposed = extracted.T
    return Features(
        transposed.astype(np.float32),
        self.times(transposed.shape[0]),
        properties=self.get_properties())
def process(self, features):
    """Compute deltas on `features` with the specified options

    Parameters
    ----------
    features : Features, shape = [nframes, ncols]
        The input features on which to compute the deltas

    Returns
    -------
    deltas : Features, shape = [nframes, ncols * (`order` + 1)]
        The computed deltas with as many orders as specified. The
        output features are the concatenation of the input
        `features` and their time derivative at each order.

    """
    kaldi_input = kaldi.matrix.SubMatrix(features.data)
    deltas = kaldi.feat.functions.compute_deltas(self._options, kaldi_input)
    data = kaldi.matrix.SubMatrix(deltas).numpy()

    # the times are those of the input features
    return Features(data, features.times, self.get_properties(features))
def process(self, alignment):
    """Compute one-hot features from an `alignment`

    Each token of the alignment is encoded as a boolean vector with
    a single True value, at the index of that token.

    Parameters
    ----------
    alignment
        The alignment to encode. It must be accepted by
        `self._token2index` and provide `tokens` and `times`
        attributes.

    Returns
    -------
    features : Features
        Boolean data with one row per token of the alignment and one
        column per token index. The times are those of the
        alignment. The mapping 'token2index' is added to the
        processor entry of the properties.

    """
    # build a bijection token <-> onehot index
    token2index = self._token2index(alignment)

    # initialize the data matrix with zeros, TODO should data be a
    # scipy.sparse matrix? (builtin bool is used because the
    # `np.bool` alias is not available in all NumPy versions)
    data = np.zeros(
        (alignment.tokens.shape[0], len(token2index)), dtype=bool)

    # fill the data with onehot encoding of tokens
    for i, p in enumerate(alignment.tokens):
        data[i, token2index[p]] = 1

    try:
        properties = self.get_properties()
    except ValueError:
        # tokens not defined: temporarily use those of the alignment
        # and make sure they are reset even if the call fails again
        self.tokens = token2index.keys()
        try:
            properties = self.get_properties()
        finally:
            self.tokens = None

    properties[self.name].update({'token2index': token2index})

    return Features(data, alignment.times, properties=properties)
def process(self, signal):
    """Computes energy on the input `signal`

    Parameters
    ----------
    signal : :class:`~signal.audio.audioData`
        The input audio signal, must be mono and have the same
        sample rate as the processor.

    Returns
    -------
    energy : :class:`~shennong.features.features.Features`
        The computed - and compressed - energy, one value per frame

    Raises
    ------
    ValueError
        If the input `signal` has more than one channel (i.e. is
        not mono). If `sample_rate` != `signal.sample_rate`.

    """
    # ensure the signal is correct
    if signal.nchannels != 1:
        raise ValueError(
            'signal must have one dimension, but it has {}'.format(
                signal.nchannels))

    if self.sample_rate != signal.sample_rate:
        raise ValueError('processor and signal mismatch in sample rates: '
                         '{} != {}'.format(self.sample_rate,
                                           signal.sample_rate))

    # with raw energy the parameters are temporarily overridden (no
    # preemphasis, rectangular window). They are backed up here and
    # restored in the `finally` clause, so the processor gets its
    # original configuration back even if the computation fails.
    old_conf = self.get_params() if self.raw_energy else None
    try:
        if old_conf is not None:
            self.preemph_coeff = 0
            self.window_type = 'rectangular'

        # number of frames in the framed signal
        nframes = kaldi.feat.window.num_frames(
            signal.nsamples, self._frame_options, flush=True)

        # a kaldi view of the numpy signal
        signal = kaldi.matrix.SubVector(signal.data)

        # windowing function to compute frames
        window = kaldi.feat.window.FeatureWindowFunction.from_options(
            self._frame_options)

        # compression function to compress energy
        compression = self._compression_fun[self._compression]

        # pre-allocate the resulting energy
        energy = np.zeros((nframes, 1))

        # smallest value passed to the compression function
        floor = np.finfo(np.float64).tiny

        # pre-allocate a buffer for the frames, extract the frames and
        # compute the energy on them
        out_frame = kaldi.matrix.Vector(self._frame_options.window_size())
        for frame in range(nframes):
            kaldi.feat.window.extract_window(
                0, signal, frame, self._frame_options, window, out_frame)

            # square the signal, force float64 to avoid overflow
            square = np.square(out_frame.numpy(), dtype=np.float64)

            # avoid doing log on 0 (should be avoided already by
            # dithering, but who knows...)
            energy[frame] = compression(max(square.sum(), floor))
    finally:
        if old_conf is not None:
            self.set_params(**old_conf)

    return Features(energy, self.times(nframes), self.get_properties())
def process(self, signal):
    """Computes bottleneck features on an audio `signal`

    Use a pre-trained neural network to extract bottleneck
    features. Features have a frame shift of 10 ms and frame
    length of 25 ms.

    Parameters
    ----------
    signal : Audio, shape = [nsamples, 1]
        The input audio signal to compute the features on, must be
        mono. The signal is up/down-sampled at 8 kHz during
        processing.

    Returns
    -------
    features : Features, shape = [nframes, 80]
        The computed bottleneck features will have as many rows as
        there are frames (depends on the `signal` duration, expect
        about 100 frames per second), each frame with 80 dimensions.

    Raises
    ------
    RuntimeError
        If no speech is detected on the `signal` during the voice
        activity detection preprocessing step.

    """
    # force resampling to 8 kHz and 16 bits integers. The dtypes are
    # compared by value (!=) and not by identity: two equal dtype
    # objects are not guaranteed to be the same object
    need_resample = (
        signal.sample_rate != 8000 or
        signal.dtype != np.dtype(np.int16))

    if need_resample:
        self._log.debug(
            'resampling audio from %dHz@%db to %dHz@%db',
            signal.sample_rate, signal.dtype.itemsize * 8, 8000, 16)
        signal = signal.resample(8000).astype(np.int16)

    signal = signal.data

    # define parameters to extract mel filterbanks. Those
    # parameters cannot be tuned because the networks are trained
    # with them... frame_noverlap is the number of samples to
    # overlap in each frame, so the frame_shift is 200 - 120 = 80
    frame_length = 200
    frame_noverlap = 120
    frame_shift = frame_length - frame_noverlap

    # voice activity detection TODO implement user-provided VAD
    # (vad input format could be an instance of Alignment, or
    # simply an array of bool).
    vad = _compute_vad(
        signal, self._log,
        win_length=frame_length, win_overlap=frame_noverlap)

    # ensure we have some voiced frames in the signal
    voiced_frames = sum(vad)
    if not voiced_frames:
        raise RuntimeError(
            'no voice detected in signal, failed to extract features')
    self._log.debug(
        '%d frames of speech detected (on %d total frames)',
        voiced_frames, len(vad))

    # from audio signal to mel filterbank
    signal = _add_dither(signal, self.dither)
    window = np.hamming(frame_length)
    fbank_mx = _mel_fbank_mx(
        window.size, 8000, numchans=24, lofreq=64.0, hifreq=3800.0)
    fea = _fbank_htk(signal, window, frame_noverlap, fbank_mx)

    # center the mel features from voiced frames mean
    fea -= np.mean(fea[vad], axis=0)

    # add a global context to the mel features, by repeating the
    # first and the last frames
    left_ctx = right_ctx = 15
    fea = np.r_[
        np.repeat(fea[[0]], left_ctx, axis=0),
        fea,
        np.repeat(fea[[-1]], right_ctx, axis=0)]

    # compute the network output from mel features, the network
    # weights are requested only once
    weights = self._get_weights()
    left_ctx_bn1 = right_ctx_bn1 = weights['context']
    nn_input = _preprocess_nn_input(fea, left_ctx_bn1, right_ctx_bn1)
    nn_output = np.vstack(
        _create_nn_extract_st_BN(nn_input, weights, 2)[0])

    # compute the timestamps for each output frame, as (start, stop)
    # pairs in seconds
    starts = np.arange(nn_output.shape[0]) * frame_shift
    times = (1.0 / 8000) * np.vstack((starts, starts + frame_length)).T

    # return the final bottleneck features
    return Features(nn_output, times, self.get_properties())
def test_2d_times_unsorted():
    """Features built on random (unsorted) 2d times are rejected."""
    with pytest.raises(ValueError) as err:
        Features(np.random.random((10, 3)), np.random.random((10, 2)))
    # check the exception message itself, not the ExceptionInfo wrapper
    assert 'times is not sorted in increasing order' in str(err.value)
def test_collection(mfcc):
    """Check the value type and the validity of features collections."""
    assert FeaturesCollection._value_type is Features

    # an empty collection and a collection of valid features are valid
    empty = FeaturesCollection()
    assert empty.is_valid()
    filled = FeaturesCollection(mfcc=mfcc)
    assert filled.is_valid()

    # a collection holding features built without validation, from
    # 1d data and scalar times, is not valid
    broken = Features(np.asarray([0]), 0, validate=False)
    assert not FeaturesCollection(mfcc=broken).is_valid()
def test_2d_times_badshape():
    """Features built on 2d times with 3 columns are rejected."""
    with pytest.raises(ValueError) as err:
        Features(np.random.random((10, 3)), np.random.random((10, 3)))
    # check the exception message itself, not the ExceptionInfo wrapper
    assert 'times shape[1] must be 2, it is 3' in str(err.value)