def dict_to_ark(arkfile, data, format='text'):
    """Write a data dictionary to a Kaldi ark file

    TODO for now time information from h5f is lost in ark

    Parameters:
    -----------

    arkfile (str): path to the ark file to write

    data (dict): dictionary of numpy arrays to write

    format (str): must be 'text' or 'binary' to write a text or a
        binary ark file respectively, default is 'text'

    Raise:
    ------

    RuntimeError if format is not 'text' or 'binary'

    """
    if format == 'text':
        _dict_to_txt_ark(arkfile, data)
    elif format == 'binary':
        # write a text ark to a temporary file, then convert it to
        # binary with the Kaldi copy-feats program
        with tempfile.NamedTemporaryFile(
                dir=utils.config.get('abkhazia', 'tmp-directory')) as tmp:
            _dict_to_txt_ark(tmp.name, data)

            utils.jobs.run(
                'copy-feats ark,t:{0} ark:{1}'.format(tmp.name, arkfile),
                env=kaldi_path(),
                stdout=open(os.devnull, 'w').write)
    else:
        raise RuntimeError(
            'ark format must be "text" or "binary", it is "{}"'
            .format(format))
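# Hedged usage sketch (not part of the original module): write a small
# dictionary of numpy arrays to text and binary ark files with
# dict_to_ark() defined above. The example function name, output paths
# and feature matrices are made up for illustration; the binary branch
# requires copy-feats to be available on the Kaldi path.
def _example_dict_to_ark():
    import numpy as np

    # two utterances, each with a random 10x13 feature matrix
    data = {'utt1': np.random.rand(10, 13),
            'utt2': np.random.rand(10, 13)}

    # a text ark can be inspected with any text editor
    dict_to_ark('/tmp/feats.txt.ark', data, format='text')

    # a binary ark is what Kaldi recipes usually consume
    dict_to_ark('/tmp/feats.bin.ark', data, format='binary')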
def _run_command(self, command, verbose=True):
    """Run the command as a subprocess in a Kaldi environment"""
    if verbose is True:
        self.log.info('running %s', command)

    utils.jobs.run(
        command,
        stdout=self.log.debug,
        env=kaldi_path(),
        cwd=self.recipe_dir)
def _ark_to_dict_binary_bytext(arkfile):
    """Convert a binary ark to text, and load it as numpy arrays"""
    # create the temporary directory before entering the try block, so
    # the finally clause never references an undefined name
    tempdir = tempfile.mkdtemp(
        dir=utils.config.get('abkhazia', 'tmp-directory'))
    try:
        # copy-feats converts binary ark to text ark
        txtfile = os.path.join(tempdir, 'txt')
        utils.jobs.run(
            'copy-feats ark:{0} ark,t:{1}'.format(arkfile, txtfile),
            env=kaldi_path(),
            stdout=open(os.devnull, 'w').write)

        # load the converted text ark as a dict
        return _ark_to_dict_text(txtfile)
    finally:
        utils.remove(tempdir, safe=True)
def _compute_lm(self, G_arpa):
    """Generate an ARPA n-gram from an abkhazia corpus

    This method relies on the following Kaldi programs:
    add-start-end.sh, build-lm.sh and compile-lm. It uses the IRSTLM
    library.

    """
    self.log.info(
        'computing %s %s-gram in ARPA format', self.level, self.order)

    # cut -d' ' -f2 lm_text > text_ready. Training needs the utt-id
    # removed from the first column of the text file
    lm_text = os.path.join(self.a2k._local_path(), 'lm_text.txt')
    lm_lines = utils.open_utf8(lm_text, 'r').readlines()

    text_ready = os.path.join(self.a2k._local_path(), 'text_ready.txt')
    with utils.open_utf8(text_ready, 'w') as ready:
        ready.write('\n'.join(
            [' '.join(line.split()[1:]) for line in lm_lines]))

    text_se = os.path.join(self.a2k._local_path(), 'text_se.txt')
    utils.jobs.run(
        'add-start-end.sh',
        stdin=open(text_ready, 'r'),
        stdout=open(text_se, 'w').write,
        env=kaldi_path(),
        cwd=self.recipe_dir)
    assert os.path.isfile(text_se), 'LM failed on add-start-end'

    # the -k option is the number of splits, useful for huge text
    # files. build-lm.sh is in kaldi/tools/irstlm/bin
    text_lm = os.path.join(self.a2k._local_path(), 'text_lm.gz')
    self._run_command(
        'build-lm.sh -i {0} -n {1} -o {2} -k 1 -s kneser-ney'.format(
            text_se, self.order, text_lm))
    assert os.path.isfile(text_lm), 'LM failed on build-lm'

    text_blm = os.path.join(self.a2k._local_path(), 'text_blm.gz')
    self._run_command(
        # was with the -i option
        'compile-lm {} --text=yes {}'.format(text_lm, text_blm))

    # gzip the compiled lm (from
    # https://docs.python.org/2/library/gzip.html#examples-of-usage)
    with open(text_blm, 'rb') as fin, gzip.open(G_arpa, 'wb') as fout:
        shutil.copyfileobj(fin, fout)
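# Hedged illustration (not part of the original module) of the
# utterance-id stripping performed at the start of _compute_lm() above:
# abkhazia 'lm_text.txt' files carry the utterance id in the first
# column, which must be dropped before n-gram training. The input line
# below is a made-up example.
def _example_strip_utt_id():
    line = 'utt001 hello world how are you'
    stripped = ' '.join(line.split()[1:])
    assert stripped == 'hello world how are you'
    return stripped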
def prepare_lang(
        corpus, output_dir,
        level='word',
        silence_probability=0.5,
        position_dependent_phones=False,
        keep_tmp_dirs=False,
        log=logger.null_logger()):
    """Wrapper on the Kaldi wsj/utils/prepare_lang.sh script

    Create the directory `output_dir` and populate it as described in
    http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating.
    It produces (among other files) the L.fst part of the HCLG model.

    Parameters:
    -----------

    corpus (Corpus): abkhazia corpus to prepare lang for.

    output_dir (path): directory where to write prepared files,
        created if nonexisting.

    level ('word' or 'phone'): set to 'word' (the default) to prepare
        the corpus at word level, or 'phone' to prepare it at phone
        level. The prepared data will be used to train language and
        acoustic models at either word or phone level.

    silence_probability (float): the probability to have a silence
        phone. Usually 0.0 or 0.5, default is 0.5.

    position_dependent_phones (bool): default to False. Should be set
        to True or False depending on whether the language model
        produced is destined to be used with an acoustic model
        trained with or without word position dependent variants of
        the phones.

    keep_tmp_dirs (bool): default to False. If True, keep the
        directories 'recipe' and 'local' in `output_dir`, if False
        remove them before returning.

    log (logging.Logger): the logger instance where to send messages,
        default is to disable logging.

    Return:
    -------

    The return code of the Kaldi prepare_lang script. 0 for success,
    any other value for error.

    """
    output_dir = os.path.abspath(output_dir)
    log.info('preparing lexicon in %s (L.fst)...', output_dir)

    # init the kaldi recipe in output_dir/recipe
    a2k = Abkhazia2Kaldi(
        corpus, os.path.join(output_dir, 'recipe'), name='dict', log=log)
    a2k.setup_phones()
    a2k.setup_silences()
    a2k.setup_variants()
    a2k.setup_kaldi_folders()
    a2k.setup_machine_specific_scripts()

    if level == 'word':
        a2k.setup_lexicon()
    else:
        a2k.setup_phone_lexicon()

    # choosing the script according to level and word position
    # dependent phones. If word_position_dependent is true and the lm
    # is at the phone level, use prepare_lang_wpdpl.sh in the local
    # folder, otherwise we fall back to the original prepare_lang.sh
    # (some slight customizations of the script are necessary to
    # decode with a phone loop language model when word position
    # dependent phone variants have been trained).
    script_prepare_lm = os.path.join(
        a2k.kaldi_root, 'egs', 'wsj', 's5', 'utils', 'prepare_lang.sh')

    script_prepare_lm_wpdpl = os.path.join(
        a2k.share_dir, 'prepare_lang_wpdpl.sh')

    script = (script_prepare_lm_wpdpl
              if level == 'phone' and position_dependent_phones
              else script_prepare_lm)

    # generate the bash command we will run
    command = (
        script + ' --position-dependent-phones {wpd}'
        ' --sil-prob {sil} {input} "<unk>" {temp} {output}'.format(
            wpd=bool2str(position_dependent_phones),
            sil=silence_probability,
            input=os.path.join(a2k._local_path()),
            temp=os.path.join(output_dir, 'local'),
            output=output_dir))

    # run the command in Kaldi and forward its return code
    log.info('running "%s"', command)
    try:
        return jobs.run(
            command,
            cwd=a2k.recipe_dir,
            env=kaldi_path(),
            stdout=log.debug)
    finally:
        if not keep_tmp_dirs:
            remove(a2k.recipe_dir)
            remove(os.path.join(output_dir, 'local'))
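# Hedged usage sketch (not part of the original module): prepare a
# phone-level lang directory from an abkhazia corpus with
# prepare_lang() defined above. The example function name, corpus path
# and the Corpus.load() call are assumptions made for illustration.
def _example_prepare_lang():
    from abkhazia.corpus import Corpus

    # load an existing abkhazia corpus from disk (path is hypothetical)
    corpus = Corpus.load('/path/to/abkhazia/corpus')

    # build output_dir/lang, returning the prepare_lang.sh return code
    ret = prepare_lang(
        corpus, '/tmp/lang',
        level='phone',
        silence_probability=0.5,
        position_dependent_phones=False)
    assert ret == 0, 'prepare_lang.sh failed'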
def _format_lm(self, arpa_lm, fst_lm):
    """Convert an ARPA-format language model to an FST

    Change the LM vocabulary using SRILM. This is a Python
    implementation of Kaldi egs/wsj/s5/utils/format_lm_sri.sh, with
    marginal modifications.

    Note: if you want to just convert ARPA LMs to FSTs, there is a
    simpler way to do this that doesn't require SRILM: see examples
    in Kaldi egs/wsj/s5/local/wsj_format_local_lms.sh

    """
    self.log.info('converting ARPA to FST')

    words_txt = os.path.join(self.output_dir, 'words.txt')
    for _file in (arpa_lm, words_txt):
        if not os.path.isfile(_file):
            raise IOError('expected input file {} to exist'.format(_file))

    lm_base = os.path.splitext(os.path.basename(arpa_lm))[0]
    tempdir = tempfile.mkdtemp()
    try:
        # unzip the input LM. Remove all "illegal" combinations of
        # <s> and </s>, which are supposed to occur only at the
        # begin/end of an utterance. These can cause determinization
        # failures of CLG (ends up being epsilon cycles).
        lm_txt = os.path.join(tempdir, lm_base + '.txt')
        # self.log.debug('unzip %s to %s', arpa_lm, lm_txt)
        with utils.open_utf8(lm_txt, 'w') as fp:
            for line in gzip.open(arpa_lm, 'rb'):
                if not (re.search('<s> <s>', line)
                        or re.search('</s> <s>', line)
                        or re.search('</s> </s>', line)):
                    fp.write(line.decode('utf-8'))

        # find words in the ARPA LM that are not symbols in the
        # OpenFst-format symbol table words.txt
        oovs = os.path.join(
            self.output_dir, 'oovs_{}.txt'.format(lm_base))
        self.log.debug('write OOVs to %s', oovs)
        utils.jobs.run(
            'utils/find_arpa_oovs.pl {} {}'.format(words_txt, lm_txt),
            stdout=utils.open_utf8(oovs, 'w').write,
            env=kaldi_path(),
            cwd=self.recipe_dir)

        # Change the LM vocabulary to be the intersection of the
        # current LM vocabulary and the set of words in the
        # pronunciation lexicon. This also renormalizes the LM by
        # recomputing the backoff weights, and removes those ngrams
        # whose probabilities are lower than the backed-off
        # estimates.
        lm_pruned = self._change_lm_vocab(lm_txt, words_txt)

        # convert from ARPA to FST
        self._run_command(
            'utils/run.pl {0} arpa2fst {1} | fstprint | '
            'utils/eps2disambig.pl | utils/s2eps.pl | '
            'fstcompile --isymbols={2} --osymbols={2} '
            '--keep_isymbols=false --keep_osymbols=false | '
            'fstrmepsilon | fstarcsort --sort_type=ilabel > {3}'.format(
                os.path.join(self.output_dir, 'format_lm.log'),
                lm_pruned, words_txt, fst_lm))

        # The output is like: 9.14233e-05 -0.259833. We do expect the
        # first of these 2 numbers to be close to zero (the second is
        # nonzero because the backoff weights make the states sum
        # to >1).
        try:
            self._run_command('fstisstochastic {}'.format(fst_lm))
        except RuntimeError:
            pass
    finally:
        utils.remove(tempdir, safe=True)
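# Hedged illustration (not part of the original module) of the
# "illegal" n-gram filtering done while unzipping the ARPA LM in
# _format_lm() above: any line containing '<s> <s>', '</s> <s>' or
# '</s> </s>' is dropped, since such combinations can trigger
# determinization failures of CLG. The ARPA lines below are made up
# for illustration.
def _example_filter_illegal_ngrams():
    import re

    lines = ['-1.23 <s> hello',
             '-2.34 </s> <s>',      # illegal, dropped
             '-0.56 world </s>']
    kept = [line for line in lines
            if not (re.search('<s> <s>', line)
                    or re.search('</s> <s>', line)
                    or re.search('</s> </s>', line))]
    assert kept == ['-1.23 <s> hello', '-0.56 world </s>']
    return kept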