def _run_preparator(cls, args, preparator, output_dir=None):
    output_dir = (
        (cls._output_dir(args) if args.output_dir is None
         else os.path.abspath(os.path.join(args.output_dir, 'data')))
        if output_dir is None else output_dir)

    preparator.log = utils.logger.get_log(
        os.path.join(output_dir, 'data_preparation.log'), args.verbose)

    # initialize the corpus from raw data with its preparator
    corpus = preparator.prepare(
        os.path.join(output_dir, 'wavs'),
        keep_short_utts=args.keep_short_utts)
    corpus.log = utils.logger.get_log(
        os.path.join(output_dir, 'data_validation.log'), args.verbose)

    # raise if the corpus is not in correct abkhazia format,
    # redirecting the log to the preparator logger
    corpus.validate(njobs=args.njobs)

    # save the corpus to the output directory
    corpus.save(output_dir, no_wavs=True)

    # save the alignment
    if not args.no_alignment:
        alignment_file = os.path.join(output_dir, 'alignment.txt')
        utils.open_utf8(alignment_file, 'w').write(
            '\n'.join(
                '{} {}'.format(k, ' '.join(str(v) for v in vv))
                for k, v in sorted(preparator.alignment.items())
                for vv in v) + '\n')
def correct_dictionary(self):
    """Correct problems with the GlobalPhone Mandarin dictionary

    The corrections are completely ad hoc; the result is stored in a
    temporary file.

    """
    # the following words are in the dictionary but are not used in
    # the transcriptions; they will be dropped
    words_to_drop = [u'#fragment#', u'#noise#', u'$', u'(', u')', u'SIL']

    # correct content
    correct_lines = []
    for line in open_utf8(self.dictionary, 'r').xreadlines():
        if all(u'{' + word + u'}' not in line for word in words_to_drop):
            line = line.replace(u'{lai2zhe3bu2ju4 }', u'{lai2zhe3bu2ju4}')
            correct_lines.append(line)

    # update self.dictionary with the corrected content
    fid, filename = tempfile.mkstemp()
    os.close(fid)
    with open_utf8(filename, 'w') as out:
        for line in correct_lines:
            out.write(line)

    self.dictionary = filename
    return True
def create(self):
    """Initialize the recipe data in `self.recipe_dir`"""
    self.check_parameters()

    # setup phones
    self.a2k.setup_phones()
    self.a2k.setup_silences()
    self.a2k.setup_variants()

    # setup lexicon and text depending on word/phone ngram level
    text = self.a2k.setup_text()
    lm_text = os.path.join(self.a2k._local_path(), 'lm_text.txt')
    if self.level == 'word':
        shutil.copy(text, lm_text)
        self.a2k.setup_lexicon()
    else:  # phone level
        with utils.open_utf8(lm_text, 'w') as out:
            for k, v in sorted(self.corpus.phonemize_text().iteritems()):
                out.write(u'{} {}\n'.format(k, v))
        self.a2k.setup_phone_lexicon()

    # setup data files common to both levels
    self.a2k.setup_kaldi_folders()
    self.a2k.setup_machine_specific_scripts()
    self._setup_prepare_lang_wpdpl()
def load(cls, path):
    """Load an ARPA language model from the file `path`"""
    assert os.path.isfile(path)

    data = {}
    order = None
    for line in (l.strip() for l in utils.open_utf8(path, 'r') if l):
        if line.startswith('\\data\\'):
            order = 0
        elif line.startswith('\\end\\'):
            break
        elif line.startswith('\\') and line.endswith(':'):
            order = int(re.search('[0-9]+', line).group(0))
            if order not in data:
                data[order] = {}
        elif line:
            if order == 0:  # still in the \data\ section
                pass
            elif order > 0:
                line = line.split('\t')
                prob = float(line[0])
                ngram = tuple(line[1].split())
                backoff = None if len(line) <= 2 else float(line[2])
                data[order][ngram] = (prob, backoff)
            else:
                raise IOError(
                    'unable to parse ARPA file line: {}'.format(line))
    return cls(data)
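# A hedged illustration of load() above: given a tiny ARPA file whose
# ngram lines are tab-separated (probability, ngram, optional backoff):
#
#   \data\
#   ngram 1=2
#
#   \1-grams:
#   -0.30<TAB>hello<TAB>-0.52
#   -0.47<TAB></s>
#
#   \end\
#
# the `data` dict passed to cls() would be:
expected_data = {
    1: {(u'hello',): (-0.30, -0.52),
        (u'</s>',): (-0.47, None)}}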
def load_variants(path):
    """Return a list of variant symbols

    `path` is assumed to be a variants file, usually named
    'variants.txt'.

    """
    return [line.strip() for line in utils.open_utf8(path, 'r')]
def load_segments(path):
    """Return a dict of utterance ids mapped to (wav, tbegin, tend)
    and the set of all required wav files

    `path` is assumed to be a segments file, usually named
    'segments.txt'. If there is only one utterance per wav, tbegin
    and tend are None. Append the '.wav' extension to the segments
    wav-ids if they are missing.

    """
    def _wav_tuple(l):
        wav = l[0]
        if os.path.splitext(wav)[1] != '.wav':
            wav += '.wav'

        return ((wav, None, None) if len(l) == 1
                else (wav, float(l[1]), float(l[2])))

    lines = (line.strip().split() for line in utils.open_utf8(path, 'r'))
    segments = {line[0]: _wav_tuple(line[1:]) for line in lines}
    wavs = {w[0] for w in segments.values()}
    return segments, wavs
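# A sketch of the segments file layout consumed by load_segments
# (inferred from the docstring above; the utterance ids and wav names
# are hypothetical):
#
#   utt1 rec001.wav 0.0 3.25
#   utt2 rec002
#
# would yield:
expected_segments = {
    'utt1': ('rec001.wav', 0.0, 3.25),
    'utt2': ('rec002.wav', None, None)}  # '.wav' appended, no timestamps
expected_wavs = {'rec001.wav', 'rec002.wav'}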
def load_silences(path):
    """Return a list of silence symbols

    `path` is assumed to be a silences file, usually named
    'silences.txt'.

    """
    return [line.strip() for line in utils.open_utf8(path, 'r')]
def save(self, path):
    """Save a language model to `path` in the ARPA format

    Do not write empty ngrams to the `path`

    """
    with utils.open_utf8(path, 'w') as fp:
        # write header
        fp.write('\n\\data\\\n')
        for order in range(1, self.order + 1):
            size = len(self.ngrams[order])
            if size:
                fp.write('ngram {}={}\n'.format(order, size))
        fp.write('\n')

        # write ngrams
        for order in range(1, self.order + 1):
            if len(self.ngrams[order]):
                fp.write('\\{}-grams:\n'.format(order))
                for k, v in sorted(self.ngrams[order].items()):
                    ngram = ' '.join(k)
                    if v[1] is None:  # no backoff
                        fp.write(u'{}\t{}\n'.format(v[0], ngram))
                    else:
                        fp.write(u'{}\t{}\t{}\n'.format(v[0], ngram, v[1]))
                fp.write('\n')
        fp.write('\\end\\\n')
def make_lexicon(self):
    # parse dictionary lines
    words, transcripts = [], []
    for line in utils.open_utf8(self.dictionary, 'r').readlines():
        # suppress linebreaks (this does not take into account fancy
        # unicode linebreaks), see
        # http://stackoverflow.com/questions/3219014
        line = re.sub(u'\r\n?|\n', u'', line).split(u' ')

        # parse word
        word = self.strip_accolades(line[0])
        if u'{' in word or u'}' in word:
            raise RuntimeError('Bad formatting of word {}'.format(word))
        words.append(word)

        # parse phonetic transcription
        trs = self.strip_accolades(u' '.join(line[1:])).split(u' ')
        transcript = []
        for phone in trs:
            if phone[0] == u'{':
                phn = phone[1:]
                assert phn != u'WB', trs
            elif phone[-1] == u'}':
                phn = phone[:-1]
                assert phn == u'WB', trs
            else:
                phn = phone
                assert phn != u'WB', trs

            assert not (u'{' in phn), trs
            assert not (u'}' in phn), trs
            if phn != u'WB':
                transcript.append(phn)
        transcripts.append(u' '.join(transcript))

    return dict(zip(words, transcripts))
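# A hedged example of the dictionary format parsed by make_lexicon
# above (the entry is hypothetical): the braces are stripped and the
# 'WB' word-boundary tags discarded, so the line
#
#   {hello} {{h WB} e l {o WB}}
#
# would produce the lexicon entry:
expected_entry = {u'hello': u'h e l o'}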
def test_align(corpus, features, lm_word, am_mono, tmpdir, level, post):
    output_dir = str(tmpdir.mkdir('align-phones'))
    flog = os.path.join(output_dir, 'align-phones.log')
    log = utils.logger.get_log(flog)

    aligner = align.Align(corpus, output_dir=output_dir, log=log)
    aligner.feat_dir = features
    aligner.lm_dir = lm_word
    aligner.am_dir = am_mono
    aligner.level = level
    aligner.with_posteriors = post

    if level == 'words' and post:
        with pytest.raises(NotImplementedError):
            aligner.compute()
    else:
        aligner.compute()

        # check if we have no error and an alignment file
        assert_no_expr_in_log(flog, 'error')
        ali_file = os.path.join(output_dir, 'alignment.txt')
        assert os.path.isfile(ali_file)

        if not post:
            res = [l.strip() for l in utils.open_utf8(ali_file, 'r')
                   if l.startswith('s0102a-sent17')]
            assert res == expected_ali[level]
def setup_wav(self):
    """Create wav.scp in data directory"""
    target = os.path.join(self._output_path(), 'wav.scp')
    wavs = set(w for w, _, _ in self.corpus.segments.itervalues())
    with open_utf8(target, 'w') as out:
        for wav in sorted(wavs):
            wav_path = os.path.join(self.corpus.wav_folder, wav)
            out.write(u'{} {}\n'.format(wav, wav_path))
def _compute_lm(self, G_arpa):
    """Generate an ARPA n-gram from an abkhazia corpus

    This method relies on the following Kaldi programs:
    add-start-end.sh, build-lm.sh and compile-lm. It uses the IRSTLM
    library.

    """
    self.log.info(
        'computing %s %s-gram in ARPA format', self.level, self.order)

    # cut -d' ' -f2 lm_text > text_ready. The training step needs the
    # utt-id removed from the first column of the text file.
    lm_text = os.path.join(self.a2k._local_path(), 'lm_text.txt')
    lm_lines = utils.open_utf8(lm_text, 'r').readlines()

    text_ready = os.path.join(self.a2k._local_path(), 'text_ready.txt')
    with utils.open_utf8(text_ready, 'w') as ready:
        ready.write('\n'.join(
            [' '.join(line.split()[1:]) for line in lm_lines]))

    text_se = os.path.join(self.a2k._local_path(), 'text_se.txt')
    utils.jobs.run(
        'add-start-end.sh',
        stdin=open(text_ready, 'r'),
        stdout=open(text_se, 'w').write,
        env=kaldi_path(), cwd=self.recipe_dir)
    assert os.path.isfile(text_se), 'LM failed on add-start-end'

    # the -k option is the number of splits, useful for huge text
    # files. build-lm.sh is in kaldi/tools/irstlm/bin
    text_lm = os.path.join(self.a2k._local_path(), 'text_lm.gz')
    self._run_command(
        'build-lm.sh -i {0} -n {1} -o {2} -k 1 -s kneser-ney'.format(
            text_se, self.order, text_lm))
    assert os.path.isfile(text_lm), 'LM failed on build-lm'

    text_blm = os.path.join(self.a2k._local_path(), 'text_blm.gz')
    self._run_command(
        # was with the -i option
        'compile-lm {} --text=yes {}'.format(text_lm, text_blm))

    # gzip the compiled lm (from
    # https://docs.python.org/2/library/gzip.html#examples-of-usage)
    with open(text_blm, 'rb') as fin, gzip.open(G_arpa, 'wb') as fout:
        shutil.copyfileobj(fin, fout)
def load_lexicon(path):
    """Return a dict of word to phones entries loaded from `path`

    `path` is assumed to be a lexicon file, usually named
    'lexicon.txt'.

    """
    lines = (line.strip().split() for line in utils.open_utf8(path, 'r'))
    return {line[0]: ' '.join(line[1:]) for line in lines}
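# Illustrative lexicon.txt content for load_lexicon (the entries are
# hypothetical): each line is a word followed by its phones, which are
# joined back into a single string:
#
#   hello h e l o
#   world w o r l d
#
expected_lexicon = {'hello': 'h e l o', 'world': 'w o r l d'}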
def load_phones(path):
    """Return a dict of phones mapped to their IPA equivalent

    `path` is assumed to be a phones file, usually named 'phones.txt'.

    """
    lines = (line.strip().split() for line in utils.open_utf8(path, 'r'))
    return {line[0]: line[1] for line in lines}
def load_text(path):
    """Return a dict of utterance ids mapped to their textual content

    `path` is assumed to be a text file, usually named 'text.txt'.

    """
    lines = (line.strip().split() for line in utils.open_utf8(path, 'r'))
    return {line[0]: ' '.join(line[1:]) for line in lines}
def __init__(self, input_dir,
             log=utils.logger.null_logger(), cmu_dict=None):
    super(WallStreetJournalPreparator, self).__init__(
        input_dir, log=log, cmu_dict=cmu_dict)

    # select only a subpart of recordings and transcriptions,
    # listing files with the following 2 criteria: 1- files are
    # nested within self.directory_pattern and 2- the 4th letter in
    # the file name is self.file_pattern
    self.log.debug('directory pattern is {}, file pattern is {}'
                   .format(self.directory_pattern, self.file_pattern))

    # setup directory filter
    if self.directory_pattern is None:
        dir_filter = lambda d: True
    else:
        dir_filter = lambda d: d in self.directory_pattern

    # setup file pattern
    if self.file_pattern is None:
        filter_dot = lambda f: f[-4:] == '.dot'
        filter_wv1 = lambda f: f[-4:] == '.wv1'
    else:
        filter_dot = lambda f: (
            f[3] == self.file_pattern and f[-4:] == '.dot')
        filter_wv1 = lambda f: (
            f[3] == self.file_pattern and f[-4:] == '.wv1')

    # filter out the undesired input files
    self.input_recordings = self.filter_files(dir_filter, filter_wv1)
    self.input_transcriptions = self.filter_files(dir_filter, filter_dot)

    self.log.debug('selected {} speech files and {} transcription files'
                   .format(len(self.input_recordings),
                           len(self.input_transcriptions)))

    # filter out the corrupted utterances from input files. The tag
    # '[bad_recording]' in a transcript indicates a problem with the
    # associated recording (if it exists), so exclude it
    self.bad_utts = []
    for trs in self.input_transcriptions:
        for line in utils.open_utf8(trs, 'r').xreadlines():
            if '[bad_recording]' in line:
                utt_id = re.match(r'(.*) \((.*)\)', line).group(2)
                self.bad_utts.append(utt_id)

    self.log.debug('found {} corrupted utterances'
                   .format(len(self.bad_utts)))

    # filter out bad utterances
    self.sphs = [
        sph for sph in self.input_recordings
        if (os.path.basename(sph).replace('.wv1', '')
            not in self.bad_utts
            and os.path.basename(sph).replace('.wv1', '')
            not in self.exclude_wavs)]
def setup_silences(self):
    """Create data/local/self.name/{silence_phones, optional_silence}.txt"""
    local_path = self._local_path()

    CorpusSaver.save_silences(
        self.corpus, os.path.join(local_path, 'silence_phones.txt'))

    target = os.path.join(local_path, 'optional_silence.txt')
    with open_utf8(target, 'w') as out:
        out.write(u'SIL\n')
def setup_utt2spk(self):
    """Create utt2spk and spk2utt in data directory"""
    target = os.path.join(self._output_path(), 'utt2spk')
    CorpusSaver.save_utt2spk(self.corpus, target)

    # create spk2utt
    target = os.path.join(self._output_path(), 'spk2utt')
    with open_utf8(target, 'w') as out:
        for spk, utt in sorted(self.corpus.spk2utt().iteritems()):
            out.write(u'{} {}\n'.format(spk, ' '.join(sorted(utt))))
def correct_dictionary(self):
    """Correct problems with the GlobalPhone Vietnamese dictionary

    The corrections are completely ad hoc; the result is stored in a
    temporary file.

    """
    # the following words are in the dictionary but are not used in
    # the transcriptions; they will be dropped
    words_to_drop = [u'$', u'(', u')']

    # read input file
    with utils.open_utf8(self.dictionary, 'r') as inp:
        lines = inp.readlines()

    # generate output file
    fid, corrected_dictionary = tempfile.mkstemp()
    os.close(fid)

    # correct content
    with utils.open_utf8(corrected_dictionary, 'w') as out:
        for line in lines:
            # skip secondary pronunciations
            if u'(2)' not in line:
                # skip some words
                if all([(u'{' + word + u'}' not in line)
                        for word in words_to_drop]):
                    # rewrite tone markers in a manner consistent
                    # with GlobalPhone Mandarin pinyin markings,
                    # collapsing double spacings on the way
                    line = line.replace(u'WB ', u'WB')
                    line = line.replace(u'  ', u' ')

                    # 'ttd' and 't.t.d' have wrongly formatted
                    # transcriptions
                    line = line.replace(u'{{t}', u'{t')

                    line = re.sub(r'\{(\w*) T(\d)\}', u'\\1_\\2', line)
                    line = re.sub(
                        r'\{(\w*) T(\d) WB\}', u'{\\1_\\2 WB}', line)

                    out.write(line)

    self.dictionary = corrected_dictionary
    return True
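# A runnable sketch of the tone-marker rewriting above, on a
# hypothetical dictionary line: '{X Td}' becomes 'X_d' and
# '{X Td WB}' becomes '{X_d WB}'.
import re

line = u'{tan2} {{t WB} a {n T2} {n T2 WB}}'
line = re.sub(r'\{(\w*) T(\d)\}', u'\\1_\\2', line)
line = re.sub(r'\{(\w*) T(\d) WB\}', u'{\\1_\\2 WB}', line)
assert line == u'{tan2} {{t WB} a n_2 {n_2 WB}}'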
def setup_phone_lexicon(self):
    """Create data/local/self.name/lexicon.txt"""
    local_path = self._local_path()
    target = os.path.join(local_path, 'lexicon.txt')

    # get the list of phones (including silence and non-silence phones)
    phones = []
    for origin in (os.path.join(local_path, 'silence_phones.txt'),
                   os.path.join(local_path, 'nonsilence_phones.txt')):
        phones += [line.strip() for line in open_utf8(origin, 'r')]

    # create the 'phone' lexicon
    with open_utf8(target, 'w') as out:
        for word in phones:
            out.write(u'{0} {0}\n'.format(word))

        # add the <unk> word, in case one wants to use the phone
        # loop lexicon for training; it is also necessary if one
        # doesn't want to modify the validation scripts too much
        out.write(u'<unk> SPN\n')

    return target
def correct_transcription(self):
    """Correct problems with the GlobalPhone Vietnamese transcripts

    The corrections are completely ad hoc and the results are stored
    in a temporary folder.

    - remove trailing spaces, all double spacings and '_' from
      transcriptions on every odd line but the first

    - double spacings and '_' are actually only found for speakers
      200 to 208

    """
    # generate temporary output folder
    corrected_transcription_dir = tempfile.mkdtemp()

    # get the list of transcription files
    trss = utils.list_directory(self.transcription_dir, abspath=True)

    for trs in trss:
        # read the transcript file
        lines = utils.open_utf8(trs, 'r').readlines()

        # correct the odd lines
        lines[2::2] = [line.replace(u'_', u' ').replace(u'  ', u' ')
                       .strip() + u'\n' for line in lines[2::2]]

        # write the corrected version to the temp folder
        output_file = os.path.join(
            corrected_transcription_dir, os.path.basename(trs))
        with utils.open_utf8(output_file, 'w') as out:
            for line in lines:
                out.write(line)

    self.transcription_dir = corrected_transcription_dir
    return True
def nadults(cha):
    """Return the number of adult speakers recorded in `cha`"""
    # speaker IDs, lines starting with '@ID', forced to lowercase
    spks = (l.strip().lower() for l in open_utf8(cha, 'r')
            if l.startswith('@ID'))

    # exclude non-adult speakers
    exclude = ['sibl', 'broth', 'sist', 'target', 'child',
               'to', 'environ', 'cousin', 'non_hum', 'play']

    # count the non-excluded speakers
    return sum(all(e not in spk for e in exclude) for spk in spks)
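# Illustrative '@ID' headers as counted by nadults (the lines follow
# the CHILDES CHAT convention; the content here is hypothetical):
#
#   @ID: eng|corpus|MOT|||||Mother|||
#   @ID: eng|corpus|CHI|1;2.15||||Target_Child|||
#
# lowercased, the second line contains both 'target' and 'child' and
# is excluded, so nadults would return 1 for this file.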
def make_transcription(self):
    text = dict()
    for trs in utils.list_directory(self.transcription_dir, abspath=True):
        spk_id = os.path.splitext(os.path.basename(trs))[0]
        lines = utils.open_utf8(trs, 'r').readlines()

        # add utterance ids from even lines starting at line 2
        ids = [spk_id + u'_' + re.sub(ur'\s+|:|;', u'', e)
               for e in lines[1::2]]

        # delete linebreaks on odd lines starting at line 3 (this
        # does not take into account fancy unicode linebreaks), see
        # http://stackoverflow.com/questions/3219014
        transcriptions = [re.sub(ur'\r\n?|\n', u'', e)
                          for e in lines[2::2]]

        text.update(zip(ids, transcriptions))
    return text
def parse_kana_to_phone(self, kana_csv):
    """Parse the katakana phone transcription and put it in a dict()"""
    kana_to_phon = dict()
    with open_utf8(kana_csv, 'r') as fin:
        kana_transcript = fin.read()

    kana_transcript = kana_transcript.split('\n')
    for line in kana_transcript[1:]:
        if line == '':
            continue

        phones = line.split('\t')
        katakana = phones[0].decode('utf8')
        bootphon = phones[3]
        if bootphon == '':
            bootphon = "H"
        kana_to_phon[katakana] = bootphon

    return kana_to_phon
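# A hedged sketch of the kana csv layout assumed by the parser above
# (tab-separated, header on the first line, katakana in column 0 and
# the bootphon symbol in column 3; the row content is hypothetical):
#
#   katakana<TAB>...<TAB>...<TAB>bootphon
#   ア<TAB>...<TAB>...<TAB>a
#
# which would yield the mapping {u'ア': 'a'}; empty bootphon cells
# fall back to 'H'.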
def export(self):
    int2phone = read_int2phone(self.lm_dir)
    ali = self._read_result_utts('ali')
    post = self._read_result_utts('post') if self.with_posteriors else None

    # retrieve the export function according to `level`
    func = {'phones': self._export_phones,
            'words': self._export_words,
            'both': self._export_phones_and_words}[self.level]
    aligned = func(int2phone, ali, post)

    # write it to the target file
    target = os.path.join(self.output_dir, 'alignment.txt')
    with utils.open_utf8(target, 'w') as out:
        out.write('\n'.join(line.strip() for line in aligned) + '\n')

    super(Align, self).export()
def model_type(am_dir):
    """Return the type of the trained model

    Read in meta.txt, or raise IOError if not found

    TODO could be parsed from final.mdl ?

    """
    meta = os.path.join(am_dir, 'meta.txt')
    if not os.path.isfile(meta):
        raise IOError('file not found: {}'.format(meta))

    for line in utils.open_utf8(meta, 'r'):
        if line.startswith('acoustic model type'):
            return line.split(':')[1].strip()

    raise IOError('acoustic model type not found in {}'.format(meta))
def make_lexicon(self):
    lexicon = dict()
    for line in utils.open_utf8(self.cmu_dict, 'r').readlines():
        # remove newline and trailing spaces
        line = line.strip()

        # skip comments
        if not (len(line) >= 3 and line[:3] == u';;;'):
            # parse the line, word and phones are separated by a
            # double space
            word, phones = line.split(u'  ')

            # skip alternative pronunciations: the first one (with
            # no parenthesized number at the end) is supposed to be
            # the most common and is retained
            if not re.match(ur'(.*)\([0-9]+\)$', word):
                # ignore stress variants of phones
                lexicon[word] = re.sub(u'[0-9]+', u'', phones).strip()

    return lexicon
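# Illustrative CMU dictionary lines (word and phones separated by a
# double space, stress digits on the vowels):
#
#   HELLO  HH AH0 L OW1
#   HELLO(2)  HH EH0 L OW1
#
# The '(2)' variant is skipped and the stress digits removed, giving:
expected = {u'HELLO': u'HH AH L OW'}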
def _change_lm_vocab(self, lm_txt, words_txt):
    """Create a LM from an existing one by changing its vocabulary

    All n-grams in the new vocab are retained with their original
    probabilities. Backoff weights are recomputed and backed-off
    unigrams for all new words are added. The -subset option performs
    subsetting of the vocabulary without adding new words.

    This is a reimplementation of the change-lm-vocab script from
    SRILM, modified in 3 ways:

    - no more -tolower option

    - the pruning step now updates the ngram counts in the header
      (disables an annoying warning)

    - the call to ngram is done in 2 steps if the single-step version
      failed (the -renorm and -prune-lowprobs options failed together
      on librispeech-test-clean, needing 2 calls)

    """
    out_lm = os.path.join(self.output_dir, 'out_lm.txt')
    self.log.debug('pruning vocabulary in %s', out_lm)

    words = set(w.split()[0] for w in utils.open_utf8(words_txt, 'r'))
    lm = ARPALanguageModel.load(lm_txt)
    lm.prune_vocabulary(words)
    lm_pruned = lm_txt + '.pruned'
    lm.save(lm_pruned)

    try:
        self._run_command(
            'ngram -lm {0} -vocab /dev/null -renorm -write-lm {1} '
            '-prune-lowprobs -unk -order {2}'.format(
                lm_pruned, out_lm, self.order))
    except RuntimeError:
        self._run_command(
            'utils/run.pl {3} '
            'ngram -lm {0} -vocab /dev/null -write-lm - '
            '-prune-lowprobs -unk -order {2} | '
            'ngram -lm - -vocab /dev/null -renorm -unk '
            '-order {2} -write-lm {1}'.format(
                lm_pruned, out_lm, self.order,
                os.path.join(self.output_dir, 'ngram.log')))

    return out_lm
def read_int2phone(lm_dir, word_position_dependent=True):
    """Return an int to phone mapping as a dict

    Kaldi internally codes phones as ints, so this method reverses
    the mapping from ints to phones based on the phones.txt file in
    `lm_dir`. This file is assumed to exist.

    """
    phonemap = dict()
    for line in utils.open_utf8(os.path.join(lm_dir, 'phones.txt'), 'r'):
        phone, code = line.strip().split(' ')

        # remove word position markers
        if (word_position_dependent
                and phone[-2:] in ['_I', '_B', '_E', '_S']):
            phone = phone[:-2]

        phonemap[code] = phone
    return phonemap
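# Illustrative phones.txt content (Kaldi style, the codes here are
# hypothetical):
#
#   <eps> 0
#   a_B 1
#   a_I 2
#
# With word_position_dependent=True the position markers are
# stripped, so read_int2phone would return:
expected_phonemap = {'0': '<eps>', '1': 'a', '2': 'a'}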
def _parse_chas(self, chas, exclude_spks):
    """Extract cleaned utterances from raw cha files

    Return a dict of utterances where keys are the utterance ids and
    values are Utterance named tuples. The only utterances considered
    are a) those marked with timestamps, b) whose timestamps are
    within the wav duration and c) that are not empty.

    """
    self.log.info('parsing %s cha files...', len(chas))

    utts = {}
    for cha, wav in chas.items():
        # duration of the wav in seconds
        duration = utils.wav.duration(wav)

        # get cleaned utterances from the raw cha file. At that
        # point timestamps are the last word of each line.
        text = utils.cha.clean(
            l.strip() for l in utils.open_utf8(cha, 'r')
            if not re.search(exclude_spks, l))

        cha_id = os.path.splitext(os.path.basename(cha))[0]
        counter = 0
        for words in (t.split() for t in text):
            if len(words) > 1:  # remove empty utterances
                # parse the timestamps
                timestamp = words[-1].split('_')
                tbegin = int(timestamp[0])/1000.
                tend = int(timestamp[1])/1000.

                # reject utterances with out of boundaries timestamps
                if not (tbegin > duration or tend > duration):
                    counter += 1
                    utt_id = cha_id + '-sent' + str(counter)
                    utts[utt_id] = self.Utterance(
                        ' '.join(words[:-1]),
                        os.path.basename(wav), tbegin, tend)
    return utts
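# A hedged example of a cleaned cha line as consumed above: the
# timestamp is the last word, in milliseconds, formatted as
# 'begin_end' (the utterance here is hypothetical):
#
#   'hello there 21840_23250'
#
# would yield Utterance('hello there', <wav>, 21.84, 23.25).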