def init_deps(output_dir, log):
    """Setup corpus, features and language model needed for AM training"""
    tmpdir = tempfile.mkdtemp(dir='/dev/shm')
    try:
        # convert the raw Buckeye distribution to abkhazia format
        prepared = buckeye.BuckeyePreparator(BUCKEYE_RAW, log=log).prepare(
            os.path.join(tmpdir, 'wavs'), keep_short_utts=False)

        # split it in train and test parts and save both
        train, test = prepared.split(train_prop=0.05, by_speakers=False)
        train.save(os.path.join(output_dir, 'train', 'data'))
        test.save(os.path.join(output_dir, 'test', 'data'))

        # compute features for the train corpus (with default params
        # from the config file)
        feats = abkhazia.models.features.Features(
            train, os.path.join(output_dir, 'train', 'features'), log=log)
        feats.compute()

        # compute a word-level bigram LM for the train corpus (again
        # with default params)
        lm = abkhazia.models.language_model.LanguageModel(
            train, os.path.join(output_dir, 'train', 'lm'), log=log)
        lm.order = 2
        lm.level = 'word'
        lm.compute()
    finally:
        # cleanup temp directory
        utils.remove(tmpdir)
def save(self, path, no_wavs=False, copy_wavs=True, force=False):
    """Write this corpus under the directory `path`

    :param str path: output directory; assumed not to exist yet
        (or use force=True to overwrite it).

    :param bool no_wavs: if True, skip writing the wavs (i.e. do
        not create the wavs subdirectory in `path`).

    :param bool copy_wavs: if True, copy the wav files instead of
        creating symbolic links.

    :param bool force: if True, remove `path` first when it
        already exists.

    :raise: OSError if force=False and `path` already exists.

    """
    self.log.info('saving corpus to %s', path)

    # on force, wipe out any previous content at `path`
    if force and os.path.exists(path):
        self.log.warning('overwriting existing path: %s', path)
        utils.remove(path)

    CorpusSaver.save(self, path, no_wavs=no_wavs, copy_wavs=copy_wavs)
def export(self):
    """Export the computed features to `self.output_dir`

    Merges the per-job 'raw_*.scp' files into a single 'feats.scp'
    (sorted in natural order to preserve Kaldi ordering) and writes a
    'wav.scp' whose paths point to the corpus wavs instead of the
    recipe directory.

    :raise: IOError if the recipe's wav.scp file is missing.

    """
    super(Features, self).export()

    # merge the features output scp files into a single one
    # 'feats.scp', and delete them, sort them in natural order to
    # preserve Kaldi ordering
    inputs = [
        f for f in utils.list_files_with_extension(
            self.output_dir, '.scp', abspath=True, recursive=False)
        if 'raw_' in f]
    inputs.sort(key=utils.natural_sort_keys)

    output_scp = os.path.join(self.output_dir, 'feats.scp')
    with open(output_scp, 'w') as outfile:
        for infile in inputs:
            # close each input before deleting it (was leaking the
            # file handle with open(infile).read())
            with open(infile, 'r') as fin:
                outfile.write(fin.read())
            utils.remove(infile)

    # export wav.scp, correct paths to be relative to corpus
    # instead of recipe_dir. TODO Do we really need a reference to
    # wavs as they are already referenced in the corpus ?
    origin = os.path.join(self.recipe_dir, 'data', self.name, 'wav.scp')
    if not os.path.isfile(origin):
        raise IOError('{} not found'.format(origin))

    with open(os.path.join(self.output_dir, 'wav.scp'), 'w') as scp:
        with open(origin, 'r') as fin:
            for line in fin:
                key = line.strip().split(' ')[0]
                assert key in self.corpus.wavs
                wav = os.path.join(self.corpus.wav_folder, key)
                scp.write('{} {}\n'.format(key, wav))
def _prepare_wavs_dir(self, wavs_dir, inputs, outputs):
    """Detect outputs already present and delete any undesired file"""
    self.log.debug('scanning %s', wavs_dir)

    # map each expected output file name to its input counterpart
    remaining = dict((out, src) for src, out in zip(inputs, outputs))
    nfound, ndeleted = 0, 0

    for entry in os.listdir(wavs_dir):
        # the complete path to the wav file, symlinks resolved
        realpath = os.path.realpath(os.path.join(wavs_dir, entry))

        if entry in remaining and not self._broken_wav(realpath):
            # a valid target already exists, no need to produce it again
            del remaining[entry]
            nfound += 1
        else:
            # unexpected or broken file, get rid of it
            utils.remove(realpath)
            ndeleted += 1

    self.log.debug('found %s files, deleted %s undesired files',
                   nfound, ndeleted)

    # return the updated inputs and outputs
    return remaining.values(), remaining.keys()
def _delta_joblib_fnc(scp, instance):
    """A tweak to compute deltas inplace and in parallel using joblib

    Class methods are not picklable, so a Features `instance` is
    passed as a parameter instead of using self.

    `scp` is a str, or a 1-length tuple wrapping the str, naming the
    scp file to compute deltas on.

    """
    # unwrap the input filename when given as a tuple
    if isinstance(scp, tuple):
        scp = scp[0]

    # temporary file enabling a pseudo-inplace operation
    tmp = scp + '_tmp'
    try:
        # write the deltas to the temporary file
        instance._run_command(
            'add-deltas --delta-order={0} scp:{1} ark:{2}'.format(
                instance.delta_order, scp, tmp),
            verbose=False)

        # move the temporary file back onto the scp/ark pair
        instance._run_command(
            'copy-feats ark:{} ark,scp:{},{}'.format(
                tmp, scp.replace('.scp', '.ark'), scp),
            verbose=False)
    finally:
        utils.remove(tmp, safe=True)
def _compile_fst(self, G_txt, G_fst):
    """Compile and sort a text FST to kaldi binary FST

    Reads the text FST from `G_txt` and writes the compiled,
    arc-sorted binary FST to `G_fst`. This method relies on the Kaldi
    programs fstcompile and fstarcsort.

    """
    self.log.info('compiling text FST to binary FST')

    temp = tempfile.NamedTemporaryFile('w', delete=False)
    try:
        # txt to temp
        command1 = (
            'fstcompile --isymbols={0} --osymbols={0}'
            ' --keep_isymbols=false --keep_osymbols=false {1}'.format(
                os.path.join(self.output_dir, 'words.txt'), G_txt))
        self.log.debug('running %s > %s', command1, temp)
        utils.jobs.run(command1, temp.write)

        # close the temp file so its buffered content is flushed to
        # disk before fstarcsort reads it (it was previously left
        # open, risking a truncated read)
        temp.close()

        # temp to fst, closing G_fst when done (the file handle was
        # previously leaked)
        command2 = 'fstarcsort --sort_type=ilabel {}'.format(temp.name)
        self.log.debug('running %s > %s', command2, G_fst)
        with open(G_fst, 'w') as fst:
            utils.jobs.run(command2, fst.write)
    finally:
        temp.close()  # no-op if already closed
        utils.remove(temp.name, safe=True)
def decode(decoder, graph_dir):
    """Speaker-adapted fmllr decoding with WER computation

    Runs the Kaldi steps/decode_fmllr.sh script on the decoder's
    recipe directory, scoring with the decoder's score options.

    """
    decoder.log.info('fmllr decoding and computing WER')

    # generate option string for decoding
    decode_opts = ' '.join('--{} {}'.format(n, str(o))
                           for n, o in decoder.decode_opts.iteritems())

    # generate option string for scoring
    score_opts = ' '.join('--{} {}'.format(n, str(o))
                          for n, o in decoder.score_opts.iteritems())

    # add the reverse flag if enabled in the mkgraph options
    if decoder.mkgraph_opts['reverse'].value:
        score_opts += ' --reverse true'

    # decode_fmllr.sh must be run from a subdirectory of the input
    # acoustic model directory (here decoder.am_dir). So we do: create
    # a subdir in am_dir as a symlink to the target recipe_dir, run
    # the script in it, and finally delete the symlink. Moreover the
    # script make a speaker independant decoding so we use the tweak
    # again for si.
    #
    # TODO This is an error to assume write permission in
    # decoder.am_dir!! Instead we must copy (link) the required files
    # to decoder.recipe_dir (as in _decoder_nnet)

    # compute the symlink paths before the try block: if makedirs or
    # symlink failed, the finally clause used to raise a NameError on
    # the unbound names, masking the original exception
    tempdir_sa = os.path.join(decoder.am_dir, 'decode_fmllr')
    tempdir_si = os.path.join(decoder.am_dir, 'decode_fmllr.si')
    try:
        target_sa = os.path.join(decoder.recipe_dir, 'decode')
        if not os.path.isdir(target_sa):
            os.makedirs(target_sa)
        os.symlink(target_sa, tempdir_sa)

        target_si = os.path.join(decoder.recipe_dir, 'decode.si')
        if not os.path.isdir(target_si):
            os.makedirs(target_si)
        os.symlink(target_si, tempdir_si)

        decoder._run_command(
            ('steps/decode_fmllr.sh --nj {njobs} --cmd "{cmd}" '
             '{decode_opts} {skip_scoring} --scoring-opts "{score_opts}" '
             '{graph} {data} {decode}'.format(
                 njobs=decoder.njobs,
                 cmd=utils.config.get('kaldi', 'decode-cmd'),
                 decode_opts=decode_opts,
                 skip_scoring=_score.skip_scoring(decoder.score_opts),
                 score_opts=_score.format(
                     decoder.score_opts, decoder.mkgraph_opts),
                 graph=graph_dir,
                 data=os.path.join(decoder.recipe_dir, 'data', decoder.name),
                 decode=tempdir_sa)))
    finally:
        # remove the two symlinks we created in input am_dir;
        # safe=True tolerates a link that was never created
        utils.remove(tempdir_si, safe=True)
        utils.remove(tempdir_sa, safe=True)
def __del__(self):
    # the corpus correction possibly created temporary files that we
    # delete on garbage collection
    try:
        if self._erase_dict:
            utils.remove(self.dictionary)
        if self._erase_trs:
            utils.remove(self.transcription_dir)
    except AttributeError:
        # __init__ may have failed before those attributes were set
        pass
def _ark_to_dict_binary_bytext(arkfile):
    """Convert a binary ark to text, and load it as numpy arrays

    :param str arkfile: path to the binary ark file to load.
    :return: the dict loaded from the converted text ark.

    """
    # create the temp directory before entering the try block: if
    # mkdtemp itself fails, the finally clause used to raise a
    # NameError on the unbound `tempdir`
    tempdir = tempfile.mkdtemp(
        dir=utils.config.get('abkhazia', 'tmp-directory'))
    try:
        # copy-feats converts binary ark to text ark; discard its
        # stdout into /dev/null, closing the handle when done (it was
        # previously leaked)
        txtfile = os.path.join(tempdir, 'txt')
        with open(os.devnull, 'w') as devnull:
            utils.jobs.run(
                'copy-feats ark:{0} ark,t:{1}'.format(arkfile, txtfile),
                env=kaldi_path(), stdout=devnull.write)

        # load the converted text ark as a dict
        return _ark_to_dict_text(txtfile)
    finally:
        utils.remove(tempdir, safe=True)
def prepare_lang(
        corpus, output_dir,
        level='word', silence_probability=0.5,
        position_dependent_phones=False, keep_tmp_dirs=False,
        log=logger.null_logger()):
    """Wrapper on the Kaldi wsj/utils/prepare_lang.sh script

    Create the directory `output_dir` and populate it as described in
    http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating.
    It produces (among other files) the L.fst part of the HCLG model.

    Parameters:
    -----------

    corpus (Corpus): abkhazia corpus to prepare lang for.

    output_dir (path): directory where to write prepared files,
      created if nonexisting.

    level ('word' or 'phone'): set to 'word' (the default) to prepare
      the corpus at word level, or 'phone' to prepare it at phone
      level. The prepared data will be used to train language and
      acoustic models at either word or phone level.

    silence_probability (float): The probability to have a silence
      phone. Usually 0.0 or 0.5, default is 0.5.

    position_dependent_phones (bool): default to False. Should be set
      to true or false depending on whether the language model
      produced is destined to be used with an acoustic model trained
      with or without word position dependent variants of the phones.

    keep_tmp_dirs (bool): default to False. If true, keep the
      directories 'recipe' and 'local' in `output_dir`, if false
      remove them before returning.

    log (logger.Logging): the logger instance where to send messages,
      default is to disable the log.

    Return:
    -------

    The return code of the Kaldi prepare_lang script. 0 for success,
    any other for error.

    """
    output_dir = os.path.abspath(output_dir)
    log.info('preparing lexicon in %s (L.fst)...', output_dir)

    # init the kaldi recipe in output_dir/recipe
    a2k = Abkhazia2Kaldi(
        corpus, os.path.join(output_dir, 'recipe'), name='dict', log=log)
    a2k.setup_phones()
    a2k.setup_silences()
    a2k.setup_variants()
    a2k.setup_kaldi_folders()
    a2k.setup_machine_specific_scripts()

    if level == 'word':
        a2k.setup_lexicon()
    else:
        a2k.setup_phone_lexicon()

    # choosing the script according to level and word position
    # dependent phones. If word_position_dependent is true and the lm
    # is at the phone level, use prepare_lang_wpdpl.sh in the local
    # folder, otherwise we fall back to the original prepare_lang.sh
    # (some slight customizations of the script are necessary to
    # decode with a phone loop language model when word position
    # dependent phone variants have been trained).
    script_prepare_lm = os.path.join(
        a2k.kaldi_root, 'egs', 'wsj', 's5', 'utils', 'prepare_lang.sh')

    script_prepare_lm_wpdpl = os.path.join(
        a2k.share_dir, 'prepare_lang_wpdpl.sh')

    script = (script_prepare_lm_wpdpl
              if level == 'phone' and position_dependent_phones
              else script_prepare_lm)

    # generate the bash command we will run (the single-argument
    # os.path.join around _local_path() was a no-op and is removed)
    command = (
        script + ' --position-dependent-phones {wpd}'
        ' --sil-prob {sil} {input} "<unk>" {temp} {output}'.format(
            wpd=bool2str(position_dependent_phones),
            sil=silence_probability,
            input=a2k._local_path(),
            temp=os.path.join(output_dir, 'local'),
            output=output_dir))

    # run the command in Kaldi and forward its return code
    log.info('running "%s"', command)
    try:
        return jobs.run(
            command, cwd=a2k.recipe_dir, env=kaldi_path(), stdout=log.debug)
    finally:
        if not keep_tmp_dirs:
            remove(a2k.recipe_dir)
            remove(os.path.join(output_dir, 'local'))
def main():
    """Generate an ABX item file from the Buckeye corpus"""
    # define and parse input arguments
    parser = argparse.ArgumentParser(
        description='Generate an ABX item file from the Buckeye corpus')
    parser.add_argument('item_file', metavar='ITEM_FILE',
                        help='item file to be generated')
    parser.add_argument('-b', '--buckeye-dir', default=BUCKEYE_RAW,
                        help='path to the raw Buckeye corpus to prepare'
                        ', default is %(default)s')
    parser.add_argument(
        '-t', '--tmp-dir', default=tempfile.gettempdir(),
        help='temporary directory to use, default is %(default)s')
    args = parser.parse_args()

    # setup the log and tmpdir
    tmpdir = tempfile.mkdtemp(dir=args.tmp_dir)
    log = utils.logger.get_log(verbose=False, header_in_stdout=False)

    try:
        # import Buckeye in abkhazia format
        corpus = buckeye.BuckeyePreparator(args.buckeye_dir, log=log).prepare(
            os.path.join(tmpdir, 'wavs'), keep_short_utts=False)

        # remove undesired utterances (text is '<IVER>'). Few of those
        # utts cause the alignment step to bug... and in all case they are
        # useless. TODO find that bug!
        _len = len(corpus.utts())
        corpus = corpus.subcorpus(
            [u for u in corpus.utts() if corpus.text[u] != '<IVER>'])
        log.info('removed %s utterances containing only "<IVER>"',
                 _len - len(corpus.utts()))

        # get the manual phones alignment from the raw buckeye
        log.info('extracting manual alignments at phone level...')
        get_alignment = buckeye.GetAlignment(args.buckeye_dir)
        alignment = {}
        for utt in corpus.utts():
            record, tstart, tstop = corpus.segments[utt]
            alignment[utt] = get_alignment(record, tstart, tstop)

        # save the alignment, closing the file once written (it was
        # previously left open, relying on the garbage collector)
        alignment_file = os.path.join(tmpdir, 'alignment.txt')
        with open(alignment_file, 'w') as falign:
            falign.write('\n'.join(
                '{} {} {} {}'.format(utt, p[0], p[1], p[2])
                for utt, phones in alignment.iteritems()
                for p in phones))

        log.info('generating the item file...')
        alignment2item(corpus, alignment_file, args.item_file,
                       verbose=1, njobs=1)
    finally:
        # cleanup temp directory
        utils.remove(tmpdir)
def __del__(self):
    # remove the recipe directory on garbage collection, unless asked
    # to keep it
    try:
        if self.delete_recipe:
            utils.remove(self.recipe_dir, safe=True)
    except AttributeError:
        # __init__ raised before the attributes were defined
        pass
def _format_lm(self, arpa_lm, fst_lm):
    """Converts ARPA-format language models to FSTs

    Change the LM vocabulary using SRILM. This is a Python
    implementation of Kaldi egs/wsj/s5/utils/format_lm_sri.sh, with
    margin modifications.

    Note: if you want to just convert ARPA LMs to FSTs, there is a
    simpler way to do this that doesn't require SRILM: see examples
    in Kaldi egs/wsj/s5/local/wsj_format_local_lms.sh

    :raise: IOError if `arpa_lm` or the words.txt symbol table does
        not exist.

    """
    self.log.info('converting ARPA to FST')

    words_txt = os.path.join(self.output_dir, 'words.txt')
    for _file in (arpa_lm, words_txt):
        if not os.path.isfile(_file):
            # fixed a typo in the error message ('excpected')
            raise IOError('expected input file {} to exist'.format(_file))

    lm_base = os.path.splitext(os.path.basename(arpa_lm))[0]
    tempdir = tempfile.mkdtemp()
    try:
        # unzip the input LM. Removing all "illegal" combinations of
        # <s> and </s>, which are supposed to occur only at begin/end
        # of utt. These can cause determinization failures of CLG
        # [ends up being epsilon cycles]. Both files are closed when
        # done (the gzip handle was previously leaked).
        lm_txt = os.path.join(tempdir, lm_base + '.txt')
        with utils.open_utf8(lm_txt, 'w') as fp:
            with gzip.open(arpa_lm, 'rb') as arpa:
                for line in arpa:
                    if not (re.search('<s> <s>', line)
                            or re.search('</s> <s>', line)
                            or re.search('</s> </s>', line)):
                        fp.write(line.decode('utf-8'))

        # finds words in the arpa LM that are not symbols in the
        # OpenFst-format symbol table words.txt
        oovs = os.path.join(self.output_dir, 'oovs_{}.txt'.format(lm_base))
        self.log.debug('write OOVs to %s', oovs)
        with utils.open_utf8(oovs, 'w') as oovs_file:
            utils.jobs.run(
                'utils/find_arpa_oovs.pl {} {}'.format(words_txt, lm_txt),
                stdout=oovs_file.write,
                env=kaldi_path(), cwd=self.recipe_dir)

        # Change the LM vocabulary to be the intersection of the
        # current LM vocabulary and the set of words in the
        # pronunciation lexicon. This also renormalizes the LM by
        # recomputing the backoff weights, and remove those ngrams
        # whose probabilities are lower than the backed-off
        # estimates.
        lm_pruned = self._change_lm_vocab(lm_txt, words_txt)

        # convert from ARPA to FST
        self._run_command(
            'utils/run.pl {0} arpa2fst {1} | fstprint | '
            'utils/eps2disambig.pl | utils/s2eps.pl | '
            'fstcompile --isymbols={2} --osymbols={2} '
            '--keep_isymbols=false --keep_osymbols=false | '
            'fstrmepsilon | fstarcsort --sort_type=ilabel > {3}'.format(
                os.path.join(self.output_dir, 'format_lm.log'),
                lm_pruned, words_txt, fst_lm))

    # The output is like: 9.14233e-05 -0.259833. We do expect
        # the first of these 2 numbers to be close to zero (the
        # second is nonzero because the backoff weights make the
        # states sum to >1).
        try:
            self._run_command('fstisstochastic {}'.format(fst_lm))
        except RuntimeError:
            # a non-stochastic LM is expected here, not an error
            pass
    finally:
        utils.remove(tempdir, safe=True)