def prepare(self, wavs_dir, keep_short_utts=False):
    """Build the abkhazia corpus from the raw distribution

    `wavs_dir` is the directory where the prepared wav files are
    stored (it is forwarded to `self.make_wavs`).

    Unless `keep_short_utts` is True, the returned corpus is
    restricted to the utterances lasting strictly more than 100 ms
    (shorter ones are dropped and the corpus is pruned).

    This method must not be overloaded in child classes as it
    ensures consistency with the abkhazia format.

    Return the prepared corpus.

    """
    self.log.info('converting %s to abkhazia', self.name)
    self.log.debug('reading from %s', self.input_dir)

    corpus = self.corpus

    # fill the segments first, forcing a '.wav' extension on each
    # wav-id so that the wavs set derived below is consistent
    raw_segments = self.make_segment()
    corpus.segments = {
        utt_id: (utils.append_ext(seg[0], '.wav'), seg[1], seg[2])
        for utt_id, seg in raw_segments.items()
    }

    corpus.wav_folder = self.make_wavs(wavs_dir)
    corpus.wavs = {wav_id for wav_id, _, _ in corpus.segments.values()}

    # remaining corpus components, as provided by the preparator
    corpus.lexicon = self.make_lexicon()
    corpus.text = self.make_transcription()
    corpus.utt2spk = self.make_speaker()
    corpus.phones = self.phones
    corpus.silences = self.silences
    corpus.variants = self.variants

    if not keep_short_utts:
        n_before = len(corpus.utts())
        durations = corpus.utt2duration()

        # the small epsilon guards against float rounding at 100 ms
        min_duration = 0.1 + 1e-8
        kept = [
            utt_id for utt_id in corpus.utts()
            if durations[utt_id] > min_duration
        ]
        corpus = corpus.subcorpus(kept, prune=True, validate=False)

        n_removed = n_before - len(corpus.utts())
        self.log.debug(
            'removed %s utterances shorter than 100ms', n_removed)

    self.log.debug("prepared %s utterances", len(corpus.utts()))
    return corpus
def prune(self, prune_lexicon=False):
    """Removes unregistered utterances from a corpus

    This method modifies the corpus in place and returns None.

    The set of registered utterances is the one returned by
    `self.utts()` at call time. Any entry of `self.segments`,
    `self.text` and `self.utt2spk` whose utterance id is not in
    that set is dropped, and `self.wavs` is rebuilt from
    `self.wav2utt()`.

    If `prune_lexicon` is True, the lexicon is also restricted to
    the words returned by `self.words(in_lexicon=False)` (the
    '<unk>' entry is always kept) and the phoneset is restricted
    to the phones used in the remaining lexicon.

    """
    utts = set(self.utts())

    # prune the utterance indexed dicts from the utterances list.
    # The filtered dicts must be assigned back to the attributes:
    # rebinding a loop variable would leave the corpus untouched.
    self.segments = {
        utt: seg for utt, seg in self.segments.items() if utt in utts}
    self.text = {
        utt: txt for utt, txt in self.text.items() if utt in utts}
    self.utt2spk = {
        utt: spk for utt, spk in self.utt2spk.items() if utt in utts}

    # prune wavs from pruned segments
    self.wavs = {utils.append_ext(w) for w in self.wav2utt().keys()}

    if prune_lexicon:
        # prune lexicon from pruned text
        words = self.words(in_lexicon=False)
        self.lexicon = {
            word: pron for word, pron in self.lexicon.items()
            if word in words
        }

        # make sure <unk> is still here (needed by Kaldi programs)
        self.lexicon['<unk>'] = 'SPN'

        # prune phones from pruned lexicon
        used_phones = {
            phone
            for pron in self.lexicon.values()
            for phone in pron.split()
        }
        self.phones = {
            phone: value for phone, value in self.phones.items()
            if phone in used_phones
        }
def save_segments(corpus, path, force_timestamps=False):
    """Write the segments of `corpus` to the file `path`

    One line is written per utterance, sorted by utterance id, as
    '<utt-id> <wav-id>' or '<utt-id> <wav-id> <tstart> <tstop>'.
    The wav-id is passed through `append_ext(..., '.wav')`.

    When a segment has no timestamps (its start is None) and
    `force_timestamps` is True, the timestamps are written as
    0.0 and the duration of the wav file found in
    `corpus.wav_folder`.

    """
    with open_utf8(path, 'w') as stream:
        for utt_id, segment in sorted(corpus.segments.items()):
            # make sure we have the '.wav' extension
            wav_id = append_ext(segment[0], '.wav')
            tstart, tstop = segment[1], segment[2]

            if tstart is not None:
                # timestamps are available, write them as they are
                entry = u'{} {} {}'.format(wav_id, tstart, tstop)
            elif force_timestamps is True:
                # the segment spans the whole wav file, whose duration
                # is read from its absolute path
                wav_path = os.path.join(corpus.wav_folder, wav_id)
                entry = u'{} 0.0 {}'.format(wav_id, wav.duration(wav_path))
            else:
                entry = wav_id

            stream.write(u'{} {}\n'.format(utt_id, entry))