Python append_ext Examples

Programming Language: Python

Namespace/Package Name: abkhazia.utils

Method/Function: append_ext

Examples at hotexamples.com: 3

Python append_ext - 3 examples found. These are the top-rated real-world Python examples of abkhazia.utils.append_ext, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: abstract_preparator.py Project: colincwilson/abkhazia

    def prepare(self, wavs_dir, keep_short_utts=False):
        """Prepare the corpus from raw distribution to abkhazia format

        `wavs_dir` is a directory where to store prepared wav files
          (as links or files)

        If `keep_short_utts` is False (the default), remove from the
        corpus all the utterances which are not longer than 100 ms.
        If it is True, all the utterances are kept.

        Return the prepared corpus. When short utterances are
        removed, the returned object is the pruned subcorpus built
        from `self.corpus`.

        This method must not be overloaded in child classes as it
        ensures consistency with the abkhazia format.

        """
        self.log.info('converting %s to abkhazia', self.name)
        self.log.debug('reading from %s', self.input_dir)

        # populate the corpus, ensure the wav-ids in segment have a
        # '.wav' extension
        c = self.corpus
        c.segments = {
            k: (utils.append_ext(v[0], '.wav'), v[1], v[2])
            for k, v in self.make_segment().items()
        }
        c.wav_folder = self.make_wavs(wavs_dir)
        c.wavs = {w for w, _, _ in c.segments.values()}
        c.lexicon = self.make_lexicon()
        c.text = self.make_transcription()
        c.utt2spk = self.make_speaker()
        c.phones = self.phones
        c.silences = self.silences
        c.variants = self.variants

        if not keep_short_utts:
            # list the utterances once, it is used both for counting
            # and for filtering
            utts = c.utts()
            size = len(utts)
            dur = c.utt2duration()

            # keep only the utterances strictly longer than 100 ms
            # (the 1e-8 margin absorbs floating point rounding)
            c = c.subcorpus([u for u in utts if dur[u] > 0.1 + 1e-8],
                            prune=True,
                            validate=False)
            self.log.debug('removed %s utterances shorter than 100ms',
                           size - len(c.utts()))

        self.log.debug("prepared %s utterances", len(c.utts()))
        return c

Example #2

Show file

File: corpus.py Project: colincwilson/abkhazia

    def prune(self, prune_lexicon=False):
        """Removes unregistered utterances from a corpus

        This method modifies the corpus in place and return None

        The pruning operation delete undesired data from utterances
        listed in self.utts(). It removes any segment, text, utt2spk
        entry and wav with an unknown utterance id.

        If prune_lexicon is True, it also prunes the lexicon and
        phoneset: only the words returned by
        self.words(in_lexicon=False) are kept in the lexicon (plus the
        '<unk>' entry), and only the phones used by the remaining
        lexicon entries are kept in the phoneset.
        """
        utts = set(self.utts())

        def registered(mapping):
            # keep only the entries indexed by a registered utterance
            return {
                key: value
                for key, value in mapping.items() if key in utts
            }

        # prune utterance indexed dicts from the utterances list. The
        # attributes are rebound explicitly: assigning the filtered
        # dict to a loop variable would leave them untouched.
        self.segments = registered(self.segments)
        self.text = registered(self.text)
        self.utt2spk = registered(self.utt2spk)

        # prune wavs from pruned segments
        self.wavs = {utils.append_ext(w) for w in self.wav2utt().keys()}

        if prune_lexicon:
            # prune lexicon from pruned text
            words = self.words(in_lexicon=False)
            self.lexicon = {
                key: value
                for key, value in self.lexicon.items() if key in words
            }
            # make sure <unk> is still here (needed by Kaldi programs)
            self.lexicon['<unk>'] = 'SPN'

            # prune phones from pruned lexicon
            phones = {
                phone
                for pronunciation in self.lexicon.values()
                for phone in pronunciation.split()
            }
            self.phones = {
                key: value
                for key, value in self.phones.items() if key in phones
            }

Example #3

Show file

    def save_segments(corpus, path, force_timestamps=False):
        """Write the segments of `corpus` to the file `path`

        One line is written per utterance, sorted by utterance id:
        the utterance id, the wav name (with a '.wav' extension) and,
        when available, the begin and end timestamps.

        For segments without timestamps, if force_timestamps is True
        the timestamps (0.0, wav_duration) are written, else only the
        wav name is written.

        """
        with open_utf8(path, 'w') as out:
            for utt_id, segment in sorted(corpus.segments.items()):
                # make sure we have the '.wav' extension
                wav_name = append_ext(segment[0], '.wav')
                tbegin, tend = segment[1], segment[2]

                if tbegin is not None:
                    # we have timestamps
                    entry = u'{} {} {}'.format(wav_name, tbegin, tend)
                elif force_timestamps is True:
                    # abs path to the wav file, needed to get its duration
                    wav_path = os.path.join(corpus.wav_folder, wav_name)
                    entry = u'{} 0.0 {}'.format(
                        wav_name, wav.duration(wav_path))
                else:
                    entry = wav_name

                out.write(u'{} {}\n'.format(utt_id, entry))