# Example 1
    def process(self, utterances, njobs=1):
        """Train the UBM-GMM on the given utterances.

        Initializes the GMM by setting the means to random data points,
        then runs ``num_iters`` EM passes in parallel.

        Parameters
        ----------
        utterances : :class:`~shennong.utterances.Utterances`
            The list of utterances to train the VTLN on.
        njobs : int, optional
            Number of threads to use for computation, default to 1.

        Raises
        ------
        ValueError
            On errors

        """
        # pop the CMVN config so it is not applied by the raw pipeline;
        # it is restored into self.features below if present
        cmvn = self.features.pop('sliding_window_cmvn', None)
        self.log.info('Training UBM using %s jobs', njobs)
        raw_features = pipeline.extract_features(
            self.features, utterances, njobs=njobs, log=null_logger())

        # per-utterance voice activity decision, as a boolean frame mask
        vad = {}
        for name, mfcc in raw_features.items():
            decision = VadPostProcessor(**self.vad).process(mfcc)
            vad[name] = decision.data.reshape(
                (decision.shape[0], )).astype(bool)

        # apply sliding-window CMVN when configured, restoring the config
        if cmvn is None:
            features = raw_features
        else:
            processor = SlidingWindowCmvnPostProcessor(**cmvn)
            features = FeaturesCollection()
            for name, mfcc in raw_features.items():
                features[name] = processor.process(mfcc)
            self.features['sliding_window_cmvn'] = cmvn

        # keep only the voiced frames
        features = features.trim(vad)

        self.initialize_gmm(features, njobs=njobs)
        self.log.info('Training for %s iterations', self.num_iters)

        # subsample the features collection before the EM iterations
        features = FeaturesCollection(
            {name: feats.copy(subsample=self.subsample)
             for name, feats in features.items()})

        # postpone the removal of low-count gaussians to the last pass
        saved_removal_flag = self.remove_low_count_gaussians
        self.remove_low_count_gaussians = False

        for iteration in range(self.num_iters):
            self.log.debug('Training pass %s', iteration + 1)
            accumulators = self.accumulate(features, njobs=njobs)
            if iteration == self.num_iters - 1:
                self.remove_low_count_gaussians = saved_removal_flag
            self.estimate(accumulators)
        self.log.info("Done training UBM.")
# Example 2
def main():
    """Extract speech features for a corpus, optionally with VTLN warps.

    Reads the corpus utterances and a YAML pipeline configuration from
    the data directory, extracts the features and saves them as a h5f
    file under ``<data_directory>/features``.

    Raises
    ------
    ValueError
        If the data directory, the configuration file or the warps file
        (when ``--do-vtln`` is used) does not exist.

    """
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'data_directory', help='input/output data directory', type=pathlib.Path)
    parser.add_argument(
        'config_file', help='YAML configuration file', type=pathlib.Path)
    parser.add_argument(
        'corpus', choices=['english', 'xitsonga'], help='corpus to process')
    parser.add_argument(
        '--do-vtln', action='store_true',
        help='extract warped features from pre-trained VTLN')
    parser.add_argument(
        '-j', '--njobs', type=int, default=4, metavar='<int>',
        help='number of parallel jobs (default to %(default)s)')
    parser.add_argument(
        '-v', '--verbose', action='store_true', help='increase log level')
    args = parser.parse_args()

    # check and setup arguments
    data_directory = args.data_directory
    if not data_directory.is_dir():
        raise ValueError(f'directory not found: {data_directory}')

    config = args.config_file
    if not config.is_file():
        raise ValueError(f'file not found: {config}')

    warps = None
    if args.do_vtln:
        warps_file = data_directory / f'{args.corpus}.warps'
        if not warps_file.is_file():
            raise ValueError(f'file not found: {warps_file}')
        # each line is "<speaker> <warp>"; close the file deterministically
        with open(warps_file, 'r') as fin:
            warps = {spk: float(warp) for spk, warp in (
                line.strip().split() for line in fin)}

    (data_directory / 'features').mkdir(exist_ok=True)

    log = get_logger('extraction', 'debug' if args.verbose else 'info')

    # load input utterances
    log.info('loading utterances...')
    with open(data_directory / f'{args.corpus}.utts', 'r') as fin:
        utterances = Utterances(
            [line.strip().split(' ') for line in fin])

    # extract the features
    features = pipeline.extract_features(
        config, utterances, warps=warps, njobs=args.njobs, log=log)

    # save them
    h5f_file = data_directory / 'features' / f'{args.corpus}_{config.stem}.h5f'
    if args.do_vtln:
        # BUGFIX: pathlib.Path has no str.replace(); Path.replace() renames a
        # file on disk and takes a single target argument, so the original
        # call raised TypeError. Rebuild the filename with a '_vtln' suffix.
        h5f_file = h5f_file.with_name(
            h5f_file.name.replace('.h5f', '_vtln.h5f'))

    features.save(h5f_file)
# Example 3
def command_extract(args):
    """Execute the 'speech-features extract' command.

    Validates the output file and the input config/utterances files,
    runs the extraction pipeline and saves the resulting features.
    On a validation error, logs the problem and returns without raising.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments with the attributes ``quiet``,
        ``verbose``, ``output_file``, ``config``, ``utterances`` and
        ``njobs``.

    """
    # setup the logger (level given by -q/-v arguments)
    if args.quiet:
        log = utils.null_logger()
    else:
        if args.verbose == 0:
            level = 'warning'
        elif args.verbose == 1:
            level = 'info'
        else:  # verbose >= 2
            level = 'debug'
        log = logger.get_logger(name='speech-features', level=level)

    # make sure the output file is not already existing and have a
    # valid extension
    output_file = args.output_file
    if os.path.exists(output_file):
        log.error('output file already exist: %s', output_file)
        return
    output_ext = os.path.splitext(output_file)[1]
    if output_ext not in supported_extensions().keys():
        log.error(
            'output file has an unsupported extension "%s", must be in %s',
            output_ext, ", ".join(supported_extensions().keys()))
        return

    # make sure the input config and utterances exists
    for filename in (args.config, args.utterances):
        if not os.path.exists(filename):
            log.error('input file not found: %s', filename)
            # BUGFIX: abort here instead of going on with a missing input
            # file, which would crash in Utterances.load below
            return

    # read the utterances file
    utterances = Utterances.load(args.utterances)

    # run the pipeline
    features = pipeline.extract_features(args.config,
                                         utterances,
                                         njobs=args.njobs,
                                         log=log)

    # save the features
    log.info('saving the features to %s', output_file)
    features.save(output_file)
# Example 4
def main():
    """Extract MFCC features for the english corpus, with optional VTLN.

    Loads a pipeline configuration named after the ``conf`` argument,
    optionally reads pre-computed VTLN warps, extracts the features and
    saves them to the requested output file.

    Raises
    ------
    ValueError
        If the data directory, the configuration file or the warps file
        is missing, or if no output file is specified.

    """
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('data_directory',
                        type=pathlib.Path,
                        help='input/output data directory')
    parser.add_argument('conf',
                        choices=['only', 'nocmvn', 'full'],
                        help='pipeline configuration')
    parser.add_argument('warps', type=pathlib.Path, help='VTLN warps to use')
    parser.add_argument('-o',
                        '--output-file',
                        type=pathlib.Path,
                        help='features file')
    parser.add_argument(
        '-j',
        '--njobs',
        type=int,
        default=4,
        metavar='<int>',
        help='number of parallel jobs (default to %(default)s)')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='increase log level')
    args = parser.parse_args()

    # check and setup arguments
    data_directory = args.data_directory
    if not data_directory.is_dir():
        raise ValueError(f'directory not found: {data_directory}')

    # BUGFIX: --output-file has no default, so the original code crashed
    # with AttributeError on 'None.parent' when the option was omitted
    if args.output_file is None:
        raise ValueError('no output file specified, use -o/--output-file')

    config = data_directory / 'config' / f'mfcc_{args.conf}.yaml'
    if not config.is_file():
        raise ValueError(f'file not found: {config}')

    warps = None
    if args.warps.is_file():
        # each line is "<speaker> <warp>"; close the file deterministically
        with open(args.warps, 'r') as fin:
            warps = {
                spk: float(warp)
                for spk, warp in (line.strip().split() for line in fin)}
    elif str(args.warps) != 'off':
        # the only non-file value allowed is the literal 'off' (no VTLN);
        # raise instead of assert so the check survives python -O
        raise ValueError(f'file not found: {args.warps}')

    log = get_logger('extraction', 'debug' if args.verbose else 'info')

    # load input utterances
    log.info('loading utterances...')
    utterances = Utterances.load(data_directory / 'english.utts')

    # extract the features
    features = pipeline.extract_features(config,
                                         utterances,
                                         warps=warps,
                                         njobs=args.njobs,
                                         log=log)

    # save them
    (args.output_file.parent).mkdir(exist_ok=True, parents=True)
    features.save(args.output_file)
# Example 5
    def process(self, utterances, ubm=None, group_by='utterance', njobs=1):
        """Compute the VTLN warp factors for the given utterances.

        If the ``by_speaker`` option is set to True before the call to
        :func:`process()`, the warps are computed on per speaker basis (i.e.
        each utterance of the same speaker has an identical warp). If
        ``by_speaker`` is False, the warps are computed on a per-utterance
        basis.

        Parameters
        ----------
        utterances : :class:`~shennong.utterances.Utterances`
            The list of utterances to train the VTLN on.
        ubm : DiagUbmProcessor, optional
            If provided, uses this UBM instead of computing a new one.
        group_by : str, optional
            Must be 'utterance' or 'speaker'.
        njobs : int, optional
            Number of threads to use for computation, default to 1.

        Returns
        -------
        warps : dict[str, float]
            Warps computed for each speaker or utterance, according to
            ``group_by``. If by speaker: same warp for all utterances of this
            speaker.

        Raises
        ------
        ValueError
            If ``group_by`` is invalid, if speaker grouping is requested
            without speaker information, if ``min_warp > max_warp``, or if
            a provided ``ubm`` has not been trained.

        """
        # validate the requested grouping before any heavy computation
        if group_by not in ('utterance', 'speaker'):
            raise ValueError(
                f'group_by must be "utterance" or "speaker", '
                f'it is: {group_by}')
        if group_by == 'speaker' and not self.by_speaker:
            raise ValueError(
                'Asking to group warps by speaker but they are computed '
                'per utterance, please set VtlnProcessor.by_speaker to True')
        if self.by_speaker and not utterances.has_speakers():
            raise ValueError(
                'Requested speaker based VTLN, but speaker'
                ' information is missing')

        # utterance name -> speaker mapping, only needed in by_speaker mode
        utt2speak = None
        if self.by_speaker:
            utt2speak = {utt.name: utt.speaker for utt in utterances}

        # Min / max warp
        if self.min_warp > self.max_warp:
            raise ValueError(
                f'Min warp > max warp: {self.min_warp} > {self.max_warp}')

        # UBM-GMM: train a fresh one unless a pre-trained model is given
        if ubm is None:
            ubm = DiagUbmProcessor(**self.ubm)
            ubm.log.setLevel(self.log.getEffectiveLevel())
            ubm.process(utterances, njobs=njobs)
        else:
            if ubm.gmm is None:
                raise ValueError('Given UBM-GMM has not been trained')
            # adopt the parameters of the provided UBM
            self.ubm = ubm.get_params()

        self.log.info('Initializing base LVTLN transforms')
        dim = ubm.gmm.dim()
        # one transform class per warp step in [min_warp, max_warp]; the
        # +1.5 / +0.5 offsets round to the nearest integer while keeping the
        # range inclusive (presumably mirroring Kaldi's LVTLN setup —
        # TODO confirm against kaldi.transform.lvtln)
        num_classes = int(1.5 + (self.max_warp-self.min_warp) / self.warp_step)
        # index of the class whose warp is 1.0 (i.e. no warping)
        default_class = int(0.5 + (1-self.min_warp)/self.warp_step)
        self.lvtln = kaldi.transform.lvtln.LinearVtln.new(
            dim, num_classes, default_class)

        # pop the CMVN config so the raw pipeline does not apply it; it is
        # restored into self.features further below
        cmvn_config = self.features.pop('sliding_window_cmvn', None)

        raw_mfcc = pipeline.extract_features(
            self.features, utterances, njobs=njobs, log=null_logger())

        # Compute VAD decision (boolean frame mask per utterance, using the
        # UBM's VAD options)
        self.log.debug('... computing VAD decision')
        vad = {}
        for utt, mfcc in raw_mfcc.items():
            this_vad = VadPostProcessor(**ubm.vad).process(mfcc)
            vad[utt] = this_vad.data.reshape(
                (this_vad.shape[0],)).astype(bool)

        # Apply cmvn sliding
        orig_features = FeaturesCollection()
        if cmvn_config is not None:
            proc = SlidingWindowCmvnPostProcessor(**cmvn_config)
            for utt, mfcc in raw_mfcc.items():
                orig_features[utt] = proc.process(mfcc)
        else:
            orig_features = raw_mfcc

        # Select voiced frames
        orig_features = orig_features.trim(vad)
        orig_features = FeaturesCollection(  # Subsample
            {utt: feats.copy(subsample=self.subsample)
             for utt, feats in orig_features.items()})

        # Computing base transforms: unwarped features (voiced frames only,
        # subsampled) serve as the reference for every warp class
        featsub_unwarped = pipeline.extract_features(
            self.features, utterances,
            njobs=njobs, log=null_logger()).trim(vad)
        featsub_unwarped = FeaturesCollection(
            {utt: feats.copy(subsample=self.subsample)
             for utt, feats in featsub_unwarped.items()})

        # estimate one linear transform per warp class, mapping unwarped
        # features to their warped counterpart
        for c in range(num_classes):
            this_warp = self.min_warp + c*self.warp_step
            self.log.info(
                'Computing base transform (warp=%s) %s/%s',
                this_warp, c+1, num_classes)

            featsub_warped = pipeline.extract_features_warp(
                self.features, utterances, this_warp,
                null_logger(), njobs=njobs).trim(vad)
            featsub_warped = FeaturesCollection(
                {utt: feats.copy(subsample=self.subsample)
                 for utt, feats in featsub_warped.items()})
            self.compute_mapping_transform(
                featsub_unwarped, featsub_warped, c, this_warp)

        # free the intermediate collections before the training loop
        del featsub_warped, featsub_unwarped, vad

        # restore the CMVN config popped above
        if cmvn_config is not None:
            self.features['sliding_window_cmvn'] = cmvn_config

        self.log.debug('Computing Gaussian selection info')
        ubm.gaussian_selection(orig_features)

        self.log.info(
            'Computing LVTLN transforms (%s iterations)', self.num_iters)
        posteriors = ubm.gaussian_selection_to_post(orig_features)
        self.transforms, self.warps = self.estimate(
            ubm, orig_features, posteriors, utt2speak)

        # alternate between transforming the features, re-estimating the
        # UBM on them, and re-estimating the LVTLN transforms
        for i in range(self.num_iters):
            self.log.debug('Updating model on pass %s/%s', i+1, self.num_iters)

            # Transform the features: apply the affine transform
            # [linear_part | offset] of this utterance (or its speaker)
            features = FeaturesCollection()
            for utt, feats in orig_features.items():
                ind = utt if utt2speak is None else utt2speak[utt]
                linear_part = self.transforms[ind][:, : feats.ndims]
                offset = self.transforms[ind][:, feats.ndims]
                data = np.dot(feats.data, linear_part.numpy().T) + \
                    offset.numpy()
                features[utt] = Features(data, feats.times, feats.properties)

            # Update the model
            gmm_accs = ubm.accumulate(features, njobs=njobs)
            ubm.estimate(gmm_accs)

            # Now update the LVTLN transforms (and warps)
            # self.log.debug('Re-estimating LVTLN transforms on pass %s', i+1)
            posteriors = ubm.gaussian_selection_to_post(features)
            # NOTE(review): estimation uses orig_features (untransformed)
            # with posteriors from the transformed features — looks
            # intentional since the transform is re-derived from scratch
            # each pass, but worth confirming against the Kaldi recipe
            self.transforms, self.warps = self.estimate(
                ubm, orig_features, posteriors, utt2speak)

        # expand per-speaker transforms/warps to one entry per utterance
        if self.by_speaker:
            self.transforms = {
                utt: self.transforms[spk]
                for utt, spk in utt2speak.items()}
            self.warps = {
                utt: self.warps[spk]
                for utt, spk in utt2speak.items()}

        self.log.info('Done training LVTLN model')
        if group_by == 'utterance':
            return self.warps
        # group_by == 'speaker'
        return {
            spk: self.warps[utts[0].name]
            for spk, utts in utterances.by_speaker().items()}