def prepare(self, wavs_dir, keep_short_utts=False):
        """Convert the corpus from its raw distribution to abkhazia format

        `wavs_dir` is the directory handed to `make_wavs`, where the
          prepared wav files are stored (as links or files)

        Unless `keep_short_utts` is True, the utterances that are not
        longer than 100 ms are filtered out of the returned corpus.

        Child classes must not overload this method, as it ensures
        consistency with the abkhazia format.

        """
        self.log.info('converting %s to abkhazia', self.name)
        self.log.debug('reading from %s', self.input_dir)

        corpus = self.corpus

        # fill the segments first, forcing a '.wav' extension on every
        # wav-id they refer to
        segments = {}
        for utt_id, entry in self.make_segment().items():
            segments[utt_id] = (
                utils.append_ext(entry[0], '.wav'), entry[1], entry[2])
        corpus.segments = segments

        # the remaining fields are filled in the same order as the
        # make_* hooks are expected to be called
        corpus.wav_folder = self.make_wavs(wavs_dir)
        corpus.wavs = set(
            wav_id for wav_id, _, _ in corpus.segments.values())
        corpus.lexicon = self.make_lexicon()
        corpus.text = self.make_transcription()
        corpus.utt2spk = self.make_speaker()
        corpus.phones = self.phones
        corpus.silences = self.silences
        corpus.variants = self.variants

        if not keep_short_utts:
            n_before = len(corpus.utts())
            durations = corpus.utt2duration()

            # 100 ms threshold, with a small tolerance on the float
            # comparison
            min_duration = 0.1 + 1e-8
            long_enough = [
                utt for utt in corpus.utts()
                if durations[utt] > min_duration
            ]

            corpus = corpus.subcorpus(
                long_enough, prune=True, validate=False)
            self.log.debug('removed %s utterances shorter than 100ms',
                           n_before - len(corpus.utts()))

        self.log.debug("prepared %s utterances", len(corpus.utts()))
        return corpus
Example #2
0
    def prune(self, prune_lexicon=False):
        """Removes unregistered utterances from a corpus

        This method modifies the corpus in place and returns None.

        The utterances listed in self.utts() are the reference: any
        entry of self.segments, self.text and self.utt2spk with an
        unknown utterance id is deleted, then self.wavs is rebuilt
        from the keys of self.wav2utt().

        If prune_lexicon is True, it also prunes the lexicon (only
        the words returned by self.words(in_lexicon=False) are kept,
        and '<unk>' is always present) and the phoneset (only the
        phones used in the pruned lexicon are kept).
        """
        utts = set(self.utts())

        # prune utterance indexed dicts from the utterances list. Each
        # attribute is rebound explicitly: building the filtered dict
        # into a loop variable would leave the corpus untouched.
        self.segments = {
            key: value
            for key, value in self.segments.items() if key in utts
        }
        self.text = {
            key: value
            for key, value in self.text.items() if key in utts
        }
        self.utt2spk = {
            key: value
            for key, value in self.utt2spk.items() if key in utts
        }

        # prune wavs from pruned segments
        self.wavs = {utils.append_ext(w) for w in self.wav2utt().keys()}

        if prune_lexicon:
            # prune lexicon from pruned text (a set gives constant-time
            # membership tests whatever container words() returns)
            words = set(self.words(in_lexicon=False))
            self.lexicon = {
                key: value
                for key, value in self.lexicon.items() if key in words
            }
            # make sure <unk> is still here (needed by Kaldi programs)
            self.lexicon['<unk>'] = 'SPN'

            # prune phones from pruned lexicon
            phones = {
                phone
                for pronunciation in self.lexicon.values()
                for phone in pronunciation.split()
            }
            self.phones = {
                key: value
                for key, value in self.phones.items() if key in phones
            }
Example #3
0
    def save_segments(corpus, path, force_timestamps=False):
        """Write the segments of `corpus` to the file `path`

        One line is written per utterance, sorted by utterance id,
        as '<utt-id> <wav-id> [<tstart> <tend>]'. The wav-id is
        always written with a '.wav' extension.

        When a segment has no timestamps, they are omitted unless
        force_timestamps is True, in which case they are written as
        (0.0, wav_duration), the duration being read from the wav
        file in corpus.wav_folder.

        """
        with open_utf8(path, 'w') as stream:
            for utt_id, entry in sorted(corpus.segments.items()):
                # the wav-id must carry the '.wav' extension
                wav_id = append_ext(entry[0], '.wav')
                tstart, tend = entry[1], entry[2]

                if tstart is not None:
                    # timestamps are available, write them as they are
                    segment = u'{} {} {}'.format(wav_id, tstart, tend)
                elif force_timestamps is True:
                    # span the whole file, located by its absolute path
                    wav_path = os.path.join(corpus.wav_folder, wav_id)
                    segment = u'{} 0.0 {}'.format(
                        wav_id, wav.duration(wav_path))
                else:
                    segment = wav_id

                stream.write(u'{} {}\n'.format(utt_id, segment))