def process(self, features):
    """Computes voice activity detection (VAD) on the input `features`

    Parameters
    ----------
    features : :class:`~shennong.features.Features`, shape = [n,m]
        The speech features on which to look for voiced frames. The
        first coefficient must be a log-energy (or equivalent). Works
        well with :class:`~shennong.processor.mfcc.MfccProcessor` and
        :class:`~shennong.processor.plp.PlpProcessor`.

    Returns
    -------
    vad : :class:`~shennong.features.Features`, shape = [n,1]
        The output vad features are of dtype uint8 and contain 1 for
        voiced frames or 0 for unvoiced frames.

    """
    data = kaldi.matrix.SubVector(
        kaldi.ivector.compute_vad_energy(
            self._options, kaldi.matrix.SubMatrix(features.data))).numpy()

    return Features(
        np.atleast_2d(data.astype(np.uint8)).T,
        features.times,
        properties=self.get_properties(features))
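# A minimal, hedged usage sketch of the VAD computation above (import
# paths are assumptions based on the docstring): MFCC features, whose
# first coefficient is a log-energy, are fed to the VAD post-processor.
from shennong.audio import Audio
from shennong.processor.mfcc import MfccProcessor
from shennong.postprocessor.vad import VadPostProcessor

audio = Audio.load('utterance.wav')  # hypothetical mono wav file
mfcc = MfccProcessor(sample_rate=audio.sample_rate).process(audio)
vad = VadPostProcessor().process(mfcc)  # shape = [nframes, 1], dtype uint8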
def trim(self, vad):
    """Returns a new instance of FeaturesCollection where each features
    has been trimmed with the corresponding VAD.

    Parameters
    ----------
    vad : dict of boolean ndarrays
        A dictionary of arrays indicating which frames to keep.

    Returns
    -------
    features : FeaturesCollection
        A new FeaturesCollection trimmed with the input VAD

    Raises
    ------
    ValueError
        If the VAD keys do not match the collection keys. If the VAD
        arrays are not boolean arrays, or if their length does not
        match the number of frames.

    """
    if vad.keys() != self.keys():
        raise ValueError('VAD keys are different from the collection keys.')

    for key in vad.keys():
        if vad[key].dtype != np.dtype('bool'):
            raise ValueError('VAD arrays must be arrays of bool.')
        if vad[key].shape[0] != self[key].nframes:
            raise ValueError(
                'VAD arrays length must be equal to the number of frames.')

    return FeaturesCollection({
        k: Features(
            self[k].data[vad[k]],
            self[k].times[vad[k]],
            properties=self[k].properties)
        for k in self.keys()})
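# Hedged sketch connecting the VAD output to trim(): the VAD features
# are uint8 with shape [n, 1] while trim() expects 1D boolean arrays,
# so reshape and cast first ('vads' is a hypothetical dict mapping
# utterance names to VAD Features, 'collection' a FeaturesCollection).
vad_masks = {
    name: v.data.reshape(-1).astype(bool) for name, v in vads.items()}
voiced_only = collection.trim(vad_masks)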
def _process(self, cls, signal, vtln_warp):
    """Inner process method common to all Kaldi Mel processors"""
    # ensure the signal is correct
    if signal.nchannels != 1:
        raise ValueError(
            'signal must have one channel, but it has {}'.format(
                signal.nchannels))

    if self.sample_rate != signal.sample_rate:
        raise ValueError(
            'processor and signal mismatch in sample rates: '
            '{} != {}'.format(self.sample_rate, signal.sample_rate))

    # we need to forward the options because the assignment here is
    # done by copy, not by reference: if the user does 'p = Processor();
    # p.dither = 0', this is forwarded to Kaldi here
    self._options.frame_opts = self._frame_options
    self._options.mel_opts = self._mel_options

    # force 16 bits integers
    signal = signal.astype(np.int16).data
    data = kaldi.matrix.SubMatrix(
        cls(self._options).compute(
            kaldi.matrix.SubVector(signal), vtln_warp)).numpy()

    return Features(
        data, self.times(data.shape[0]),
        properties=self.get_properties(vtln_warp=vtln_warp))
def process_one(self, wav):
    # frame length of 25.6 ms and frame shift of 10 ms, to be in line
    # with the ground truth
    audio = amfm_decompy.basic_tools.SignalObj(wav)
    pitch = amfm_decompy.pYAAPT.yaapt(
        audio, frame_length=25.6, frame_space=10)

    return Features(
        np.atleast_2d(pitch.samp_values).T,
        pitch.frames_pos / audio.fs)
def _load(self):
    self._log.info('loading %s', self.filename)
    data = np.load(
        open(self.filename, 'rb'), allow_pickle=True)['features'].tolist()

    features = self._features_collection()
    for k, v in data.items():
        features[k] = Features._from_dict(v, validate=False)
    return features
def _load(self):
    # loading properties
    filename = self._fileroot + '.properties.json'
    self._log.info('loading %s', filename)

    if not os.path.isfile(filename):
        raise IOError('file not found: {}'.format(filename))
    properties = json_tricks.loads(open(filename, 'r').read())

    # loading times
    ark = self._fileroot + '.times.ark'
    self._log.info('loading %s', ark)

    if not os.path.isfile(ark):
        raise IOError('file not found: {}'.format(ark))

    rspecifier = 'ark:' + ark
    with kaldi.util.table.SequentialDoubleMatrixReader(
            rspecifier) as reader:
        times = {k: v.numpy() for k, v in reader}

    # postprocess times: do 2d->1d if they are 1d vectors
    for key, value in times.items():
        if value.shape[0] == 1:
            times[key] = value.reshape((value.shape[1]))

    # loading features
    ark = self._fileroot + '.ark'
    self._log.info('loading %s', ark)

    # rspecifier = 'ark,scp:' + ark + ',' + scp
    rspecifier = 'ark:' + ark
    with kaldi.util.table.SequentialDoubleMatrixReader(
            rspecifier) as reader:
        data = {k: v.numpy() for k, v in reader}

    if properties.keys() != data.keys():
        raise ValueError(
            'invalid features: items differ in data and properties')
    if times.keys() != data.keys():
        raise ValueError(
            'invalid features: items differ in data and times')

    return self._features_collection(**{
        k: Features(
            data[k].astype(properties[k]['__dtype_data__']),
            times[k].astype(properties[k]['__dtype_times__']),
            # strip the dtype entries from the stored properties (the
            # inner variable is named 'name' to avoid shadowing the
            # outer 'k')
            properties={
                name: prop for name, prop in properties[k].items()
                if '__dtype_' not in name},
            validate=False)
        for k in data.keys()})
def process(self, signal):
    """Compute spectrogram with the specified options

    Parameters
    ----------
    signal : Audio, shape = [nsamples, 1]
        The input audio signal to compute the features on, must be mono

    Returns
    -------
    features : `Features`, shape = [nframes, `ndims`]
        The computed features, output will have as many rows as there
        are frames (depends on the specified options `frame_shift`
        and `frame_length`).

    Raises
    ------
    ValueError
        If the input `signal` has more than one channel (i.e. is not
        mono). If `sample_rate` != `signal.sample_rate`.

    """
    # whereas Kaldi (and so pykaldi) exposes a vtln_warp parameter for
    # spectrograms, it is only present for compatibility and has no
    # effect, see https://github.com/kaldi-asr/kaldi/blob
    # /598ad3a400a70b934485f577354b19ee04dd8636/src/feat/feature-spectrogram.h#L97.
    # So this parameter is not exposed in shennong and we forward a
    # "neutral" (1.0) VTLN warp to pykaldi.

    # ensure the signal is correct
    if signal.nchannels != 1:
        raise ValueError(
            'signal must have one channel, but it has {}'.format(
                signal.nchannels))

    if self.sample_rate != signal.sample_rate:
        raise ValueError(
            'processor and signal mismatch in sample rates: '
            '{} != {}'.format(self.sample_rate, signal.sample_rate))

    # we need to forward the options because the assignment here is
    # done by copy, not by reference: if the user does 'p = Processor();
    # p.dither = 0', this is forwarded to Kaldi here
    self._options.frame_opts = self._frame_options

    # force 16 bits integers
    signal = signal.astype(np.int16).data
    data = kaldi.matrix.SubMatrix(
        kaldi.feat.spectrogram.Spectrogram(self._options).compute(
            kaldi.matrix.SubVector(signal), 1.0)).numpy()

    return Features(
        data, self.times(data.shape[0]),
        properties=self.get_properties())
def process_one(self, wav):
    audio = Audio.load(wav)
    raw = parselmouth.Sound(
        audio.data, sampling_frequency=audio.sample_rate).to_pitch()
    times = self.frames.times(audio.nsamples)

    # linear interpolation of the Praat pitch, to be on the same
    # timestamps as the ground truth and other models
    pitch = np.atleast_2d(np.nan_to_num(np.asarray(
        [raw.get_value_at_time(t) for t in times.mean(axis=1)]))).T

    return Features(pitch, times)
def process(self, signal, vtln_warp=1.0):
    """Compute Rasta-PLP features with the specified options

    Do an optional feature-level vocal tract length normalization
    (VTLN) when `vtln_warp` != 1.0.

    Parameters
    ----------
    signal : Audio, shape = [nsamples, 1]
        The input audio signal to compute the features on, must be mono
    vtln_warp : float, optional
        The VTLN warping factor to be applied when computing features.
        Default to 1.0, meaning no warping is done.

    Returns
    -------
    features : `Features`, shape = [nframes, `ndims`]
        The computed features, output will have as many rows as there
        are frames (depends on the specified options `frame_shift`
        and `frame_length`).

    Raises
    ------
    ValueError
        If the input `signal` has more than one channel (i.e. is not
        mono). If `sample_rate` != `signal.sample_rate`.

    """
    # ensure the signal is correct
    if signal.nchannels != 1:
        raise ValueError(
            'signal must have one channel, but it has {}'.format(
                signal.nchannels))

    if self.sample_rate != signal.sample_rate:
        raise ValueError(
            'processor and signal mismatch in sample rates: '
            '{} != {}'.format(self.sample_rate, signal.sample_rate))

    # extract the PLP features
    self._reset_buffers()
    data = self._compute(signal, vtln_warp)

    return Features(
        data, self.times(data.shape[0]),
        properties=self.get_properties(vtln_warp=vtln_warp),
        validate=False)
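# A hedged sketch of feature-level VTLN with this processor (names and
# attributes are assumptions): warps estimated per utterance by the
# VtlnProcessor (see its process() method further below) are forwarded
# to this method through the vtln_warp parameter.
warps = vtln_processor.process(utterances)  # utterance name -> warp factor
plp = FeaturesCollection({
    utt.name: plp_processor.process(
        Audio.load(utt.audio_file), vtln_warp=warps[utt.name])
    for utt in utterances})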
def _load(self):
    self._log.info('loading %s', self.filename)
    data = h5features.Reader(self.filename, groupname='features').read()

    features = self._features_collection()
    for n, name in enumerate(data.items()):
        features[name] = Features(
            data.features()[n],
            data.labels()[n],
            properties=(
                data.properties()[n] if data.has_properties() else {}),
            validate=False)
    return features
def _load(self):
    self._log.info('loading %s', self.filename)
    data = self._check_keys(scipy.io.loadmat(
        self.filename, appendmat=False, squeeze_me=True,
        mat_dtype=True, struct_as_record=False))

    features = self._features_collection()
    for k, v in data.items():
        if k not in ('__header__', '__version__', '__globals__'):
            if 'properties' in v:
                features[k] = Features(
                    v['data'], v['times'],
                    self._make_list(self._check_keys(v['properties'])),
                    validate=False)
            else:
                features[k] = Features(
                    v['data'], v['times'], validate=False)
    return features
def process(self, raw_pitch):
    """Post process a raw pitch data as specified by the options

    Parameters
    ----------
    raw_pitch : Features, shape = [n, 2]
        The pitch as extracted by the `KaldiPitchProcessor.process`
        method

    Returns
    -------
    pitch : Features, shape = [n, 1, 2, 3 or 4]
        The post-processed pitch usable as speech features. The
        output columns are 'pov_feature', 'normalized_log_pitch',
        'delta_pitch' and 'raw_log_pitch', in that order, if their
        respective options are set to True.

    Raises
    ------
    ValueError
        If `raw_pitch` has not exactly two columns. If all the
        following options are False: 'add_pov_feature',
        'add_normalized_log_pitch', 'add_delta_pitch' and
        'add_raw_log_pitch' (at least one of them must be True).

    """
    # check at least one required option is True
    if not (self.add_pov_feature or self.add_normalized_log_pitch
            or self.add_delta_pitch or self.add_raw_log_pitch):
        raise ValueError(
            'at least one of the following options must be True: '
            'add_pov_feature, add_normalized_log_pitch, '
            'add_delta_pitch, add_raw_log_pitch')

    if raw_pitch.shape[1] != 2:
        raise ValueError(
            'data shape must be (_, 2), but it is (_, {})'.format(
                raw_pitch.shape[1]))

    data = kaldi.matrix.SubMatrix(
        kaldi.feat.pitch.process_pitch(
            self._options, kaldi.matrix.SubMatrix(raw_pitch.data))).numpy()

    return Features(
        data, raw_pitch.times,
        properties=self.get_properties(raw_pitch))
def process(self, signal):
    """Extracts the (NCCF, pitch) from a given speech `signal`

    Parameters
    ----------
    signal : Audio
        The speech signal on which to estimate the pitch. The
        signal's sample rate must match the sample rate specified in
        the `PitchProcessor` options.

    Returns
    -------
    raw_pitch_features : Features, shape = [nframes, 2]
        The output array has as many rows as there are frames
        (depends on the specified options `frame_shift` and
        `frame_length`), and two columns corresponding to (NCCF,
        pitch).

    Raises
    ------
    ValueError
        If the input `signal` has more than one channel (i.e. is not
        mono). If `sample_rate` != `signal.sample_rate`.

    """
    if signal.nchannels != 1:
        raise ValueError(
            'audio signal must have one channel, but it has {}'.format(
                signal.nchannels))

    if self.sample_rate != signal.sample_rate:
        raise ValueError(
            'processor and signal mismatch in sample rates: '
            '{} != {}'.format(self.sample_rate, signal.sample_rate))

    # force 16 bits integers
    signal = signal.astype(np.int16).data
    data = kaldi.matrix.SubMatrix(
        kaldi.feat.pitch.compute_kaldi_pitch(
            self._options, kaldi.matrix.SubVector(signal))).numpy()

    return Features(
        data, self.times(data.shape[0]),
        properties=self.get_properties())
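# A hedged end-to-end sketch of the Kaldi pitch pipeline (import paths
# are assumptions): the raw (NCCF, pitch) features computed above feed
# the post-processor defined earlier in this section.
from shennong.audio import Audio
from shennong.processor.pitch import (
    KaldiPitchProcessor, KaldiPitchPostProcessor)

audio = Audio.load('utterance.wav')  # hypothetical mono wav file
raw = KaldiPitchProcessor(sample_rate=audio.sample_rate).process(audio)
pitch = KaldiPitchPostProcessor().process(raw)  # 1 to 4 columns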
def prepare_ground_truth(data_directory):
    """Retrieves pitch ground truth for evaluations"""
    output_file = data_directory / 'pitch' / 'ground_truth.h5f'
    output_file.parent.mkdir(parents=True, exist_ok=True)
    if output_file.is_file():
        return

    print('retrieving pitch ground truth...')
    truth = FeaturesCollection()
    for pitch in data_directory.glob('raw/KEELE/**/pitch.npy'):
        data = np.load(pitch)
        truth[pitch.parent.stem[:2]] = Features(
            np.atleast_2d(data['pitch']).T,
            # from https://lost-contact.mit.edu/afs/nada.kth.se/dept/tmh/
            # corpora/KeelePitchDB/Speech/keele_pitch_database.htm we have
            # pitch computed on 10ms steps over 25.6ms windows. Here we
            # shift time from frame beginning to middle time.
            data['time'] + 0.0128)

    truth.save(output_file)
def process(self, features):
    """Applies sliding-window cepstral mean and/or variance
    normalization on `features` with the specified options

    Parameters
    ----------
    features : :class:`~shennong.features.Features`
        The input features.

    Returns
    -------
    slid_window_cmvn_feats : :class:`~shennong.features.Features`
        The normalized features.

    """
    data = kaldi.matrix.Matrix(*features.data.shape)
    kaldi.feat.functions.sliding_window_cmn(
        self._options, kaldi.matrix.SubMatrix(features.data), data)

    return Features(
        data.numpy(), features.times, self.get_properties(features))
def _load(self):
    self._log.info('loading directory "%s"', self.filename)

    # list all the csv and json files
    csv_files = list_files_with_extension(
        self.filename, '.csv', recursive=False)
    json_files = list_files_with_extension(
        self.filename, '.json', recursive=False)

    features = self._features_collection()

    # load the features one by one
    for csv in csv_files:
        self._log.debug('loading %s', csv)
        data_dtype, times_dtype, ndims = self._parse_header(csv)

        # read times and features
        data = np.loadtxt(csv)
        times = data[:, :data.shape[1] - ndims].astype(times_dtype)
        if times.shape[1] == 1:
            times = times.flatten()
        data = data[:, data.shape[1] - ndims:].astype(data_dtype)

        # read properties
        properties = {}
        json = csv.replace('.csv', '.json')
        if json in json_files:
            self._log.debug('loading %s', json)
            properties = dict(json_tricks.loads(open(json, 'r').read()))

        # build the features
        name = os.path.basename(csv).replace('.csv', '')
        features[name] = Features(
            data, times, properties=properties, validate=False)

    return features
def process(self, features):
    """Compute deltas on `features` with the specified options

    Parameters
    ----------
    features : Features, shape = [nframes, ncols]
        The input features on which to compute the deltas

    Returns
    -------
    deltas : Features, shape = [nframes, ncols * (`order` + 1)]
        The computed deltas with as many orders as specified. The
        output features are the concatenation of the input `features`
        and its time derivatives at each order.

    """
    data = kaldi.matrix.SubMatrix(
        kaldi.feat.functions.compute_deltas(
            self._options, kaldi.matrix.SubMatrix(features.data))).numpy()

    return Features(
        data, features.times, self.get_properties(features))
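# Hedged usage sketch (import path assumed): with order=2 the output
# concatenates the input features with their first and second order
# derivatives, so the dimension is tripled ('mfcc' is any Features
# instance, e.g. from MfccProcessor).
from shennong.postprocessor.delta import DeltaPostProcessor

deltas = DeltaPostProcessor(order=2).process(mfcc)
assert deltas.ndims == 3 * mfcc.ndims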
def process(self, signal):
    """Computes energy on the input `signal`

    Parameters
    ----------
    signal : :class:`~shennong.audio.Audio`
        The input audio signal, must be mono.

    Returns
    -------
    energy : :class:`~shennong.features.Features`
        The computed, and compressed, energy

    Raises
    ------
    ValueError
        If the input `signal` has more than one channel (i.e. is not
        mono). If `sample_rate` != `signal.sample_rate`.

    """
    # ensure the signal is correct
    if signal.nchannels != 1:
        raise ValueError(
            'signal must have one channel, but it has {}'.format(
                signal.nchannels))

    if self.sample_rate != signal.sample_rate:
        raise ValueError(
            'processor and signal mismatch in sample rates: '
            '{} != {}'.format(self.sample_rate, signal.sample_rate))

    if self.raw_energy:
        # disable pre-emphasis and windowing, restored at the end
        old_conf = self.get_params()
        self.preemph_coeff = 0
        self.window_type = 'rectangular'

    # number of frames in the framed signal
    nframes = kaldi.feat.window.num_frames(
        signal.nsamples, self._frame_options, flush=True)

    # a kaldi view of the numpy signal
    signal = kaldi.matrix.SubVector(signal.data)

    # windowing function to compute frames
    window = kaldi.feat.window.FeatureWindowFunction.from_options(
        self._frame_options)

    # compression function to compress energy
    compression = self._compression_fun[self._compression]

    # pre-allocate the resulting energy
    energy = np.zeros((nframes, 1))

    # pre-allocate a buffer for the frames, extract the frames and
    # compute the energy on them
    out_frame = kaldi.matrix.Vector(self._frame_options.window_size())
    for frame in range(nframes):
        kaldi.feat.window.extract_window(
            0, signal, frame, self._frame_options, window, out_frame)

        # square the signal, force float64 to avoid overflow
        square = np.square(out_frame.numpy(), dtype=np.float64)

        # avoid doing log on 0 (should be avoided already by
        # dithering, but who knows...)
        energy[frame] = compression(
            max(square.sum(), np.finfo(np.float64).tiny))

    if self.raw_energy:
        self.set_params(**old_conf)

    return Features(energy, self.times(nframes), self.get_properties())
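# A small self-contained numpy illustration of the flooring guard used
# above: an all-zero frame would give log(0) = -inf without the floor
# at the smallest positive float64.
import numpy as np

frame = np.zeros(400)
square = np.square(frame, dtype=np.float64)
energy = np.log(max(square.sum(), np.finfo(np.float64).tiny))
assert np.isfinite(energy)  # about -708.4 instead of -inf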
def process(self, signal):
    """Computes bottleneck features on an audio `signal`

    Use a pre-trained neural network to extract bottleneck
    features. Features have a frame shift of 10 ms and frame length
    of 25 ms.

    Parameters
    ----------
    signal : Audio, shape = [nsamples, 1]
        The input audio signal to compute the features on, must be
        mono. The signal is up/down-sampled at 8 kHz during
        processing.

    Returns
    -------
    features : Features, shape = [nframes, 80]
        The computed bottleneck features will have as many rows as
        there are frames (depends on the `signal` duration, expect
        about 100 frames per second), each frame with 80 dimensions.

    Raises
    ------
    RuntimeError
        If no speech is detected on the `signal` during the voice
        activity detection preprocessing step.

    """
    # force resampling to 8 kHz and 16 bits integers
    need_resample = (
        signal.sample_rate != 8000
        or signal.dtype is not np.dtype(np.int16))

    if need_resample:
        self.log.debug(
            'resampling audio from %dHz@%db to %dHz@%db',
            signal.sample_rate, signal.dtype.itemsize * 8, 8000, 16)
        signal = signal.resample(8000).astype(np.int16)

    signal = signal.data

    # define parameters to extract mel filterbanks. Those parameters
    # cannot be tuned because the networks are trained with
    # them... frame_noverlap is the number of samples to overlap in
    # each frame, so the frame_shift is 200 - 120 = 80
    frame_length = 200
    frame_noverlap = 120
    frame_shift = frame_length - frame_noverlap

    # voice activity detection TODO implement user-provided VAD (vad
    # input format could be an instance of Alignment, or simply an
    # array of bool).
    vad = _compute_vad(
        signal, self.log,
        win_length=frame_length, win_overlap=frame_noverlap)

    # ensure we have some voiced frames in the signal
    voiced_frames = sum(vad)
    if not voiced_frames:
        raise RuntimeError(
            'no voice detected in signal, failed to extract features')
    self.log.debug(
        '%d frames of speech detected (on %d total frames)',
        voiced_frames, len(vad))

    # from audio signal to mel filterbank
    signal = _add_dither(signal, self.dither)
    window = np.hamming(frame_length)
    fbank_mx = _mel_fbank_mx(
        window.size, 8000, numchans=24, lofreq=64.0, hifreq=3800.0)
    fea = _fbank_htk(signal, window, frame_noverlap, fbank_mx)

    # center the mel features from voiced frames mean
    fea -= np.mean(fea[vad], axis=0)

    # add a global context to the mel features
    left_ctx = right_ctx = 15
    fea = np.r_[
        np.repeat(fea[[0]], left_ctx, axis=0),
        fea,
        np.repeat(fea[[-1]], right_ctx, axis=0)]

    # compute the network output from mel features
    left_ctx_bn1 = right_ctx_bn1 = self._get_weights()['context']
    nn_input = _preprocess_nn_input(fea, left_ctx_bn1, right_ctx_bn1)
    nn_output = np.vstack(
        _create_nn_extract_st_BN(nn_input, self._get_weights(), 2)[0])

    # compute the timestamps for each output frame
    times = (1.0 / 8000) * np.vstack((
        np.arange(nn_output.shape[0]) * frame_shift,
        np.arange(nn_output.shape[0]) * frame_shift + frame_length)).T

    # return the final bottleneck features
    return Features(nn_output, times, self.get_properties())
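# Hedged usage sketch (import path and weights name are assumptions):
# whatever the input sample rate, the processor resamples to 8 kHz and
# outputs 80-dimensional features at a 10 ms frame shift.
from shennong.audio import Audio
from shennong.processor.bottleneck import BottleneckProcessor

audio = Audio.load('utterance.wav')  # hypothetical mono wav file
bottleneck = BottleneckProcessor(weights='BabelMulti').process(audio)
assert bottleneck.ndims == 80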
def process(self, features, norm_vars=True, skip_dims=None, reverse=False):
    """Applies the accumulated CMVN statistics to the given ``features``

    Parameters
    ----------
    features : :class:`~shennong.features.features.Features`
        The input features on which to apply CMVN statistics.
    norm_vars : bool, optional
        If False, do not apply variance normalization (only mean),
        default to True.
    skip_dims : list of positive integers, optional
        Dimensions for which to skip normalization. Default is to not
        skip any dimension.
    reverse : bool, optional
        Whether to apply CMVN in a reverse sense, so as to transform
        zero-mean, unit-variance features into features with the
        desired mean and variance.

    Returns
    -------
    cmvn_features : :class:`~shennong.features.features.Features`
        The normalized features

    Raises
    ------
    ValueError
        If no stats have been accumulated

    """
    # make sure we have accumulated some stats
    if self.count < 1.0:
        raise ValueError(
            'insufficient accumulation of stats for CMVN, '
            'must be >= 1.0 but is {}'.format(self.count))

    # skip_dims in pykaldi is a destructive operation (alteration of
    # self.stats), so we work by copy here, to avoid modifying the
    # statistics.
    if not skip_dims:
        cmvn = self._cmvn
    else:
        # make sure all skipped dims are valid dims
        dmin, dmax = min(skip_dims), max(skip_dims)
        if dmin < 0 or dmax >= features.ndims:
            raise ValueError(
                'skipped dimensions must be in [0, {}[ but are in '
                '[{}, {}]'.format(features.ndims, dmin, dmax))

        # work by copy to not alter self.stats
        cmvn = kaldi.transform.cmvn.Cmvn(dim=self.dim)
        cmvn.stats = kaldi.matrix.DoubleMatrix(self.stats)
        cmvn.skip_dims(skip_dims)

    data = kaldi.matrix.SubMatrix(features.data)
    cmvn.apply(data, norm_vars=norm_vars, reverse=reverse)

    return Features(
        data.numpy(), features.times,
        properties=self.get_properties(features))
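# Hedged usage sketch (import path assumed): accumulate statistics over
# one or more utterances, then apply them; skip_dims=[0] leaves the
# first coefficient (e.g. a log-energy) unnormalized ('mfcc' is any
# Features instance).
from shennong.postprocessor.cmvn import CmvnPostProcessor

cmvn = CmvnPostProcessor(mfcc.ndims)
cmvn.accumulate(mfcc)
normalized = cmvn.process(mfcc, norm_vars=True, skip_dims=[0])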
def process(self, audio):
    """Extracts the (POV, pitch) from a given speech ``audio`` using
    CREPE.

    Parameters
    ----------
    audio : Audio
        The speech signal on which to estimate the pitch. Will be
        transparently resampled at 16kHz if needed.

    Returns
    -------
    raw_pitch_features : Features, shape = [nframes, 2]
        The output array has two columns corresponding to (POV,
        pitch). The output from the `crepe` module is reshaped to
        match the specified options `frame_shift` and `frame_length`.

    Raises
    ------
    ValueError
        If the input `audio` has more than one channel (i.e. is not
        mono).

    """
    if audio.nchannels != 1:
        raise ValueError(
            f'audio must have one channel but has {audio.nchannels}')

    if audio.sample_rate != self.sample_rate:
        self.log.debug('resampling audio to 16 kHz')
        audio = audio.resample(self.sample_rate)

    # raw activation matrix, shape=(T, 360)
    activation = self._get_activation(audio.data)

    # confidence is the confidence of voice activity, in [0, 1],
    # shape=(T,)
    confidence = activation.max(axis=1)

    if self.viterbi:
        cents = _to_viterbi_cents(activation)
    else:
        cents = _to_local_average_cents(activation)

    # frequency is the predicted pitch value in Hz, shape=(T,)
    frequency = 10 * 2 ** (cents / 1200)
    frequency[np.isnan(frequency)] = 0

    # number of samples in the resampled signal
    hop_length = np.round(self.sample_rate * self.frame_shift).astype(int)
    nsamples = 1 + int(
        (audio.shape[0] - self.frame_length * self.sample_rate)
        / hop_length)

    # scipy method issues warnings we want to inhibit
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=FutureWarning)
        data = scipy.signal.resample(
            np.array([confidence, frequency]).T, nsamples)

    # hack needed because resampling the confidence can overshoot
    # outside [0, 1]
    data[data[:, 0] < 1e-2, 0] = 0
    data[data[:, 0] > 1, 0] = 1

    return Features(
        data, self.times(data.shape[0]),
        properties=self.get_properties())
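# A numeric check of the frame count computed above, assuming the
# default frame_shift=0.01 s and frame_length=0.025 s at 16 kHz: one
# second of audio yields 98 output frames.
import numpy as np

sample_rate, frame_shift, frame_length = 16000, 0.01, 0.025
hop_length = np.round(sample_rate * frame_shift).astype(int)  # 160
nframes = 1 + int(
    (sample_rate * 1.0 - frame_length * sample_rate) / hop_length)
assert nframes == 98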
def process(self, crepe_pitch):
    """Post process a raw pitch data as specified by the options

    Parameters
    ----------
    crepe_pitch : Features, shape = [n, 2]
        The pitch as extracted by the `CrepePitchProcessor.process`
        method

    Returns
    -------
    pitch : Features, shape = [n, 1, 2, 3 or 4]
        The post-processed pitch usable as speech features. The
        output columns are 'pov_feature', 'normalized_log_pitch',
        'delta_pitch' and 'raw_log_pitch', in that order, if their
        respective options are set to True.

    Raises
    ------
    ValueError
        If after interpolation some pitch values are not positive.
        If `crepe_pitch` has not exactly two columns. If all the
        following options are False: 'add_pov_feature',
        'add_normalized_log_pitch', 'add_delta_pitch' and
        'add_raw_log_pitch' (at least one of them must be True).

    """
    # check at least one required option is True
    if not (self.add_pov_feature or self.add_normalized_log_pitch
            or self.add_delta_pitch or self.add_raw_log_pitch):
        raise ValueError(
            'at least one of the following options must be True: '
            'add_pov_feature, add_normalized_log_pitch, '
            'add_delta_pitch, add_raw_log_pitch')

    if crepe_pitch.shape[1] != 2:
        raise ValueError(
            'data shape must be (_, 2), but it is (_, {})'.format(
                crepe_pitch.shape[1]))

    # interpolate pitch values for unvoiced frames
    to_remove = predict_voicing(crepe_pitch.data[:, 0]) == 0
    if np.all(to_remove):
        raise ValueError('No voiced frames')

    data = crepe_pitch.data[:, 1].copy()
    indexes_to_keep = np.where(~to_remove)[0]
    first, last = indexes_to_keep[0], indexes_to_keep[-1]
    first_value, last_value = data[first], data[last]

    interp = scipy.interpolate.interp1d(
        indexes_to_keep, data[indexes_to_keep], fill_value='extrapolate')
    data[to_remove] = interp(np.where(to_remove)[0])
    data[:first] = first_value
    data[last:] = last_value

    if not np.all(data > 0):
        raise ValueError(
            'Not all pitch values are positive: issue with '
            'extracted pitch or interpolation')

    # converts POV into NCCF by inverting _nccf_to_pov with bisection
    nccf = []
    for sample in crepe_pitch.data[:, 0]:
        if sample in [0, 1]:
            nccf.append(sample)
        else:
            nccf.append(scipy.optimize.bisect(
                functools.partial(
                    lambda x, y: _nccf_to_pov(x) - y, y=sample),
                0, 1))

    return super(CrepePitchPostProcessor, self).process(
        Features(
            np.vstack((nccf, data)).T,
            crepe_pitch.times,
            crepe_pitch.properties))
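# A hedged end-to-end sketch of the CREPE pitch pipeline (import paths
# are assumptions): raw (POV, pitch) features from CrepePitchProcessor
# are post-processed by the method above into Kaldi-like pitch features.
from shennong.audio import Audio
from shennong.processor.pitch_crepe import (
    CrepePitchProcessor, CrepePitchPostProcessor)

audio = Audio.load('utterance.wav')  # hypothetical mono wav file
raw = CrepePitchProcessor().process(audio)
pitch = CrepePitchPostProcessor().process(raw)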
def process(self, utterances, ubm=None, group_by='utterance', njobs=1):
    """Compute the VTLN warp factors for the given utterances.

    If the ``by_speaker`` option is set to True before the call to
    :func:`process()`, the warps are computed on a per-speaker basis
    (i.e. each utterance of the same speaker has an identical
    warp). If ``by_speaker`` is False, the warps are computed on a
    per-utterance basis.

    Parameters
    ----------
    utterances : :class:`~shennong.utterances.Utterances`
        The list of utterances to train the VTLN on.
    ubm : DiagUbmProcessor, optional
        If provided, uses this UBM instead of computing a new one.
    group_by : str, optional
        Must be 'utterance' or 'speaker'.
    njobs : int, optional
        Number of threads to use for computation, default to 1.

    Returns
    -------
    warps : dict[str, float]
        Warps computed for each speaker or utterance, according to
        ``group_by``. If by speaker: same warp for all utterances of
        this speaker.

    """
    if group_by not in ('utterance', 'speaker'):
        raise ValueError(
            f'group_by must be "utterance" or "speaker", '
            f'it is: {group_by}')
    if group_by == 'speaker' and not self.by_speaker:
        raise ValueError(
            'Asking to group warps by speaker but they are computed '
            'per utterance, please set VtlnProcessor.by_speaker to True')
    if self.by_speaker and not utterances.has_speakers():
        raise ValueError(
            'Requested speaker based VTLN, but speaker '
            'information is missing')

    utt2speak = None
    if self.by_speaker:
        utt2speak = {utt.name: utt.speaker for utt in utterances}

    # min / max warp
    if self.min_warp > self.max_warp:
        raise ValueError(
            f'Min warp > max warp: {self.min_warp} > {self.max_warp}')

    # UBM-GMM
    if ubm is None:
        ubm = DiagUbmProcessor(**self.ubm)
        ubm.log.setLevel(self.log.getEffectiveLevel())
        ubm.process(utterances, njobs=njobs)
    else:
        if ubm.gmm is None:
            raise ValueError('Given UBM-GMM has not been trained')
        self.ubm = ubm.get_params()

    self.log.info('Initializing base LVTLN transforms')
    dim = ubm.gmm.dim()
    num_classes = int(
        1.5 + (self.max_warp - self.min_warp) / self.warp_step)
    default_class = int(0.5 + (1 - self.min_warp) / self.warp_step)
    self.lvtln = kaldi.transform.lvtln.LinearVtln.new(
        dim, num_classes, default_class)

    cmvn_config = self.features.pop('sliding_window_cmvn', None)

    raw_mfcc = pipeline.extract_features(
        self.features, utterances, njobs=njobs, log=null_logger())

    # compute VAD decision
    self.log.debug('... computing VAD decision')
    vad = {}
    for utt, mfcc in raw_mfcc.items():
        this_vad = VadPostProcessor(**ubm.vad).process(mfcc)
        vad[utt] = this_vad.data.reshape(
            (this_vad.shape[0],)).astype(bool)

    # apply cmvn sliding
    orig_features = FeaturesCollection()
    if cmvn_config is not None:
        proc = SlidingWindowCmvnPostProcessor(**cmvn_config)
        for utt, mfcc in raw_mfcc.items():
            orig_features[utt] = proc.process(mfcc)
    else:
        orig_features = raw_mfcc

    # select voiced frames
    orig_features = orig_features.trim(vad)

    # subsample
    orig_features = FeaturesCollection(
        {utt: feats.copy(subsample=self.subsample)
         for utt, feats in orig_features.items()})

    # computing base transforms
    featsub_unwarped = pipeline.extract_features(
        self.features, utterances, njobs=njobs,
        log=null_logger()).trim(vad)
    featsub_unwarped = FeaturesCollection(
        {utt: feats.copy(subsample=self.subsample)
         for utt, feats in featsub_unwarped.items()})

    for c in range(num_classes):
        this_warp = self.min_warp + c * self.warp_step
        self.log.info(
            'Computing base transform (warp=%s) %s/%s',
            this_warp, c + 1, num_classes)

        featsub_warped = pipeline.extract_features_warp(
            self.features, utterances, this_warp,
            null_logger(), njobs=njobs).trim(vad)
        featsub_warped = FeaturesCollection(
            {utt: feats.copy(subsample=self.subsample)
             for utt, feats in featsub_warped.items()})
        self.compute_mapping_transform(
            featsub_unwarped, featsub_warped, c, this_warp)

    del featsub_warped, featsub_unwarped, vad

    if cmvn_config is not None:
        self.features['sliding_window_cmvn'] = cmvn_config

    self.log.debug('Computing Gaussian selection info')
    ubm.gaussian_selection(orig_features)

    self.log.info(
        'Computing LVTLN transforms (%s iterations)', self.num_iters)
    posteriors = ubm.gaussian_selection_to_post(orig_features)
    self.transforms, self.warps = self.estimate(
        ubm, orig_features, posteriors, utt2speak)

    for i in range(self.num_iters):
        self.log.debug(
            'Updating model on pass %s/%s', i + 1, self.num_iters)

        # transform the features
        features = FeaturesCollection()
        for utt, feats in orig_features.items():
            ind = utt if utt2speak is None else utt2speak[utt]
            linear_part = self.transforms[ind][:, :feats.ndims]
            offset = self.transforms[ind][:, feats.ndims]
            data = np.dot(
                feats.data, linear_part.numpy().T) + offset.numpy()
            features[utt] = Features(data, feats.times, feats.properties)

        # update the model
        gmm_accs = ubm.accumulate(features, njobs=njobs)
        ubm.estimate(gmm_accs)

        # now update the LVTLN transforms (and warps)
        posteriors = ubm.gaussian_selection_to_post(features)
        self.transforms, self.warps = self.estimate(
            ubm, orig_features, posteriors, utt2speak)

    if self.by_speaker:
        self.transforms = {
            utt: self.transforms[spk]
            for utt, spk in utt2speak.items()}
        self.warps = {
            utt: self.warps[spk]
            for utt, spk in utt2speak.items()}

    self.log.info('Done training LVTLN model')

    if group_by == 'utterance':
        return self.warps

    # group_by == 'speaker'
    return {
        spk: self.warps[utts[0].name]
        for spk, utts in utterances.by_speaker().items()}
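# Hedged usage sketch (import paths and the utterances format are
# assumptions): train the VTLN model on a small corpus and get one warp
# factor per speaker.
from shennong.processor.vtln import VtlnProcessor
from shennong.utterances import Utterances

utterances = Utterances([
    ('utt1', 'spk1/utt1.wav', 'spk1'),
    ('utt2', 'spk1/utt2.wav', 'spk1')])  # (name, wav, speaker) triples
vtln = VtlnProcessor(by_speaker=True)
warps = vtln.process(utterances, group_by='speaker', njobs=1)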