Example #1
def init_deps(output_dir, log):
    """setup corpus, features and language model needed for am training"""
    tmpdir = tempfile.mkdtemp(dir='/dev/shm')

    try:
        # import Buckeye in abkhazia format
        corpus = buckeye.BuckeyePreparator(BUCKEYE_RAW, log=log).prepare(
            os.path.join(tmpdir, 'wavs'), keep_short_utts=False)
        train_corpus, test_corpus = corpus.split(train_prop=0.05,
                                                 by_speakers=False)
        train_corpus.save(os.path.join(output_dir, 'train', 'data'))
        test_corpus.save(os.path.join(output_dir, 'test', 'data'))

        # compute features for train corpus (with default params from
        # config file)
        train_feats_dir = os.path.join(output_dir, 'train', 'features')
        feats = abkhazia.models.features.Features(train_corpus,
                                                  train_feats_dir,
                                                  log=log)
        feats.compute()

        # compute lm for train corpus (again with default params)
        train_lm_dir = os.path.join(output_dir, 'train', 'lm')
        lm = abkhazia.models.language_model.LanguageModel(train_corpus,
                                                          train_lm_dir,
                                                          log=log)
        lm.order = 2
        lm.level = 'word'
        lm.compute()

    finally:
        # cleanup temp directory
        utils.remove(tmpdir)
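
A minimal driver for this helper might look like the following sketch; the 'exp' output directory is a placeholder, and the get_log call is borrowed from Example #11 below:

import os
from abkhazia import utils

# hypothetical driver: prepare the training dependencies under ./exp
output_dir = os.path.abspath('exp')
log = utils.logger.get_log(verbose=True)
init_deps(output_dir, log)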
Example #2
    def save(self, path, no_wavs=False, copy_wavs=True, force=False):
        """Save the corpus to the directory `path`

        :param str path: the output directory, assumed not to already
            exist (or use force=True to overwrite it)

        :param bool no_wavs: when True, don't save the wavs (i.e.
            don't write the wavs subdirectory in `path`)

        :param bool copy_wavs: when True, make a copy of the wavs
            instead of symbolic links

        :param bool force: when True, overwrite `path` if it
            already exists

        :raise: OSError if force=False and `path` already exists

        """
        self.log.info('saving corpus to %s', path)

        if force and os.path.exists(path):
            self.log.warning('overwriting existing path: %s', path)
            utils.remove(path)

        CorpusSaver.save(self, path, no_wavs=no_wavs, copy_wavs=copy_wavs)
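
A sketch of a typical call (the destination path is a placeholder):

# hypothetical usage: save the corpus with symlinked wavs,
# overwriting any previous export at that path
corpus.save('/tmp/my_corpus', copy_wavs=False, force=True)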
Example #3
    def export(self):
        super(Features, self).export()

        # merge the per-job output scp files into a single
        # 'feats.scp' and delete them; sort them in natural order to
        # preserve Kaldi ordering
        inputs = [
            f for f in utils.list_files_with_extension(
                self.output_dir, '.scp', abspath=True, recursive=False)
            if 'raw_' in f
        ]
        inputs.sort(key=utils.natural_sort_keys)

        output_scp = os.path.join(self.output_dir, 'feats.scp')
        with open(output_scp, 'w') as outfile:
            for infile in inputs:
                # close each input file before removing it
                with open(infile, 'r') as fin:
                    outfile.write(fin.read())
                utils.remove(infile)

        # export wav.scp, with paths corrected to be relative to the
        # corpus instead of recipe_dir. TODO do we really need a
        # reference to the wavs, as they are already referenced in
        # the corpus?
        origin = os.path.join(self.recipe_dir, 'data', self.name, 'wav.scp')
        if not os.path.isfile(origin):
            raise IOError('{} not found'.format(origin))

        with open(os.path.join(self.output_dir, 'wav.scp'), 'w') as scp, \
                open(origin, 'r') as fin:
            for line in fin:
                key = line.strip().split(' ')[0]
                assert key in self.corpus.wavs
                wav = os.path.join(self.corpus.wav_folder, key)
                scp.write('{} {}\n'.format(key, wav))
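
For reference, each line of the merged feats.scp maps an utterance id to a matrix stored in an ark file at a given byte offset; a minimal parsing sketch (the line content is made up):

# a feats.scp line looks like 'utt001 /path/to/raw_feats.1.ark:42'
line = 'utt001 /path/to/raw_feats.1.ark:42'
utt_id, rxspecifier = line.strip().split(' ', 1)
ark_path, offset = rxspecifier.rsplit(':', 1)
print(utt_id, ark_path, int(offset))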
Example #4
    def _prepare_wavs_dir(self, wavs_dir, inputs, outputs):
        """Detect outputs already present and delete any undesired file"""
        self.log.debug('scanning %s', wavs_dir)

        target = dict((o, i) for i, o in zip(inputs, outputs))
        found = 0
        deleted = 0
        for wav in os.listdir(wavs_dir):
            # the complete path to the wav file
            path = os.path.realpath(os.path.join(wavs_dir, wav))

            # the file is a desired target: keep it, unless it is
            # broken (empty, or a symlink when we force copying)
            if wav in target and not self._broken_wav(path):
                del target[wav]
                found += 1
            else:
                utils.remove(path)
                deleted += 1

        self.log.debug('found %s files, deleted %s undesired files', found,
                       deleted)

        # return the updated inputs and outputs
        return target.values(), target.keys()
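
The dict bookkeeping above relies on keys() and values() iterating in a consistent relative order as long as the dict is not mutated in between; a toy illustration:

# toy sketch of the same bookkeeping, with made-up file names
inputs = ['a.sph', 'b.sph', 'c.sph']
outputs = ['a.wav', 'b.wav', 'c.wav']
target = dict((o, i) for i, o in zip(inputs, outputs))
del target['b.wav']  # pretend b.wav is already converted
remaining_inputs = list(target.values())   # ['a.sph', 'c.sph']
remaining_outputs = list(target.keys())    # ['a.wav', 'c.wav']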
Example #5
def _delta_joblib_fnc(scp, instance):
    """A tweak to compute deltas inplace and in parallel using joblib

    class methods are not pickable so we pass a Features instance as a
    parameter instead of using self.

    scp is a str or a 1-length tuple containing the scp to compute delta on

    """
    # filename of the input
    if isinstance(scp, tuple):
        scp = scp[0]

    # temp file for pseudo-inplace operation
    tmp = scp + '_tmp'

    try:
        # compute deltas to tmp
        instance._run_command(
            'add-deltas --delta-order={0} scp:{1} ark:{2}'.format(
                instance.delta_order, scp, tmp),
            verbose=False)

        # move tmp to scp
        instance._run_command('copy-feats ark:{} ark,scp:{},{}'.format(
            tmp, scp.replace('.scp', '.ark'), scp),
                              verbose=False)
    finally:
        utils.remove(tmp, safe=True)
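
This function is presumably dispatched through joblib's Parallel/delayed API; a sketch of such a call, where scp_files and the Features instance feats are assumptions:

from joblib import Parallel, delayed

# hypothetical dispatch: add deltas to each scp chunk in parallel
Parallel(n_jobs=4)(
    delayed(_delta_joblib_fnc)(scp, feats) for scp in scp_files)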
Example #6
    def _compile_fst(self, G_txt, G_fst):
        """Compile and sort a text FST to kaldi binary FST

        This method relies on the Kaldi programs fstcompile and
        fstarcsort.

        """
        self.log.info('compiling text FST to binary FST')

        temp = tempfile.NamedTemporaryFile('w', delete=False)
        try:
            # txt to temp
            command1 = (
                'fstcompile --isymbols={0} --osymbols={0}'
                ' --keep_isymbols=false --keep_osymbols=false {1}'.format(
                    os.path.join(self.output_dir, 'words.txt'), G_txt))
            self.log.debug('running %s > %s', command1, temp.name)
            utils.jobs.run(command1, temp.write)
            temp.close()  # flush the file before fstarcsort reads it

            # temp to fst
            command2 = ('fstarcsort --sort_type=ilabel {}'.format(temp.name))
            self.log.debug('running %s > %s', command2, G_fst)
            with open(G_fst, 'w') as fst:
                utils.jobs.run(command2, fst.write)

        finally:
            utils.remove(temp.name, safe=True)
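
A sketch of a call, assuming a G.txt text FST already sits next to words.txt in the model's output directory (lm is a hypothetical LanguageModel instance):

# hypothetical usage: compile G.txt into a sorted binary G.fst
lm._compile_fst(
    os.path.join(lm.output_dir, 'G.txt'),
    os.path.join(lm.output_dir, 'G.fst'))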
Example #7
def decode(decoder, graph_dir):
    decoder.log.info('fmllr decoding and computing WER')

    # generate option string for decoding
    decode_opts = ' '.join('--{} {}'.format(n, str(o))
                           for n, o in decoder.decode_opts.iteritems())

    # generate option string for scoring
    score_opts = ' '.join('--{} {}'.format(n, str(o))
                          for n, o in decoder.score_opts.iteritems())

    # add the reverse flag if enabled in the mkgraph options
    if decoder.mkgraph_opts['reverse'].value:
        score_opts += ' --reverse true'

    # decode_fmllr.sh must be run from a subdirectory of the input
    # acoustic model directory (here decoder.am_dir). So we create a
    # subdir in am_dir as a symlink to the target recipe_dir, run the
    # script in it, and finally delete the symlink. Moreover the
    # script also makes a speaker-independent decoding, so we use the
    # same tweak again for the 'si' variant.
    #
    # TODO It is an error to assume write permission in
    # decoder.am_dir! Instead we must copy (link) the required files
    # to decoder.recipe_dir (as in _decoder_nnet)
    try:
        target_sa = os.path.join(decoder.recipe_dir, 'decode')
        if not os.path.isdir(target_sa):
            os.makedirs(target_sa)
        tempdir_sa = os.path.join(decoder.am_dir, 'decode_fmllr')
        os.symlink(target_sa, tempdir_sa)

        target_si = os.path.join(decoder.recipe_dir, 'decode.si')
        if not os.path.isdir(target_si):
            os.makedirs(target_si)
        tempdir_si = os.path.join(decoder.am_dir, 'decode_fmllr.si')
        os.symlink(target_si, tempdir_si)

        decoder._run_command(
            ('steps/decode_fmllr.sh --nj {njobs} --cmd "{cmd}" '
             '{decode_opts} {skip_scoring} --scoring-opts "{score_opts}" '
             '{graph} {data} {decode}'.format(
                 njobs=decoder.njobs,
                 cmd=utils.config.get('kaldi', 'decode-cmd'),
                 decode_opts=decode_opts,
                 skip_scoring=_score.skip_scoring(decoder.score_opts),
                 score_opts=_score.format(decoder.score_opts,
                                          decoder.mkgraph_opts),
                 graph=graph_dir,
                 data=os.path.join(decoder.recipe_dir, 'data', decoder.name),
                 decode=tempdir_sa)))
    finally:
        # remove the two symlinks we created in input am_dir
        utils.remove(tempdir_si)
        utils.remove(tempdir_sa)
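
A sketch of a call, assuming a decoder whose HCLG graph has already been compiled under its recipe directory (the 'graph' subdir name is an assumption):

# hypothetical usage: run fmllr decoding against a compiled graph
decode(decoder, os.path.join(decoder.recipe_dir, 'graph'))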
Example #8
    def __del__(self):
        try:
            # the corpus correction possibly creates temporary files
            # that we delete here
            if self._erase_dict:
                utils.remove(self.dictionary)

            if self._erase_trs:
                utils.remove(self.transcription_dir)
        except AttributeError:
            pass
Example #9
def _ark_to_dict_binary_bytext(arkfile):
    """Convert a binary ark to text, and load it as numpy arrays"""
    # create the temp dir before entering the try block, so that the
    # finally clause never references an undefined name if mkdtemp
    # itself fails
    tempdir = tempfile.mkdtemp(
        dir=utils.config.get('abkhazia', 'tmp-directory'))
    try:
        # copy-feats converts binary ark to text ark
        txtfile = os.path.join(tempdir, 'txt')
        utils.jobs.run('copy-feats ark:{0} ark,t:{1}'.format(arkfile, txtfile),
                       env=kaldi_path(),
                       stdout=open(os.devnull, 'w').write)

        # load the converted text ark as a dict
        return _ark_to_dict_text(txtfile)
    finally:
        utils.remove(tempdir, safe=True)
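
A sketch of the expected result, assuming the returned dict maps utterance ids to 2D numpy arrays as the docstring suggests (the ark path and utterance id are placeholders):

# hypothetical usage: load the features of one utterance
feats = _ark_to_dict_binary_bytext('exp/mfcc/raw_feats.1.ark')
print(feats['utt001'].shape)  # e.g. (n_frames, n_coeffs)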
Example #10
def prepare_lang(
        corpus,
        output_dir,
        level='word',
        silence_probability=0.5,
        position_dependent_phones=False,
        keep_tmp_dirs=False,
        log=logger.null_logger()):
    """Wrapper on the Kaldi wsj/utils/prepare_lang.sh script

    Create the directory `output_dir` and populate it as described in
    http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating. It
    produces (among other files) the L.fst part of the HCLG model.

    Parameters:
    -----------

    corpus (Corpus): abkhazia corpus to prepare lang for.

    output_dir (path): directory where to write the prepared files,
      created if it does not exist.

    level ('word' or 'phone'): set to 'word' (the default) to prepare
      the corpus at word level, or 'phone' to prepare it at phone
      level. The prepared data will be used to train language and
      acoustic models at either word or phone level.

    silence_probability (float): the probability of a silence phone,
      usually 0.0 or 0.5. Default is 0.5.

    position_dependent_phones (bool): defaults to False. Set it
      according to whether the produced language model will be used
      with an acoustic model trained with or without word position
      dependent variants of the phones.

    keep_tmp_dirs (bool): defaults to False. If True, keep the
      directories 'recipe' and 'local' in `output_dir`; if False,
      remove them before returning.

    log (logger.Logging): the logger instance where to send messages;
      the default disables logging.

    Return:
    -------

    The return code of the Kaldi prepare_lang script: 0 on success,
    any other value on error.

    """
    output_dir = os.path.abspath(output_dir)
    log.info('preparing lexicon in %s (L.fst)...', output_dir)

    # init the kaldi recipe in output_dir/recipe
    a2k = Abkhazia2Kaldi(
        corpus, os.path.join(output_dir, 'recipe'), name='dict', log=log)

    a2k.setup_phones()
    a2k.setup_silences()
    a2k.setup_variants()
    a2k.setup_kaldi_folders()
    a2k.setup_machine_specific_scripts()

    if level == 'word':
        a2k.setup_lexicon()
    else:
        a2k.setup_phone_lexicon()

    # choose the script according to level and position dependent
    # phones: if position_dependent_phones is True and the lm is at
    # the phone level, use the customized prepare_lang_wpdpl.sh from
    # the share directory, otherwise fall back to the original
    # prepare_lang.sh (some slight customizations of the script are
    # necessary to decode with a phone loop language model when word
    # position dependent phone variants have been trained).
    script_prepare_lm = os.path.join(
        a2k.kaldi_root, 'egs', 'wsj', 's5', 'utils', 'prepare_lang.sh')

    script_prepare_lm_wpdpl = os.path.join(
        a2k.share_dir, 'prepare_lang_wpdpl.sh')

    script = (script_prepare_lm_wpdpl
              if level == 'phone' and position_dependent_phones
              else script_prepare_lm)

    # generate the bash command we will run
    command = (
        script + ' --position-dependent-phones {wpd}'
        ' --sil-prob {sil} {input} "<unk>" {temp} {output}'.format(
            wpd=bool2str(position_dependent_phones),
            sil=silence_probability,
            input=a2k._local_path(),
            temp=os.path.join(output_dir, 'local'),
            output=output_dir))

    # run the command in Kaldi and forward its return code
    log.info('running "%s"', command)
    try:
        return jobs.run(
            command, cwd=a2k.recipe_dir, env=kaldi_path(), stdout=log.debug)
    finally:
        if not keep_tmp_dirs:
            remove(a2k.recipe_dir)
            remove(os.path.join(output_dir, 'local'))
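
A sketch of a call at word level, relying only on the defaults documented above (the output path is a placeholder):

# hypothetical usage: build L.fst at word level for a corpus
ret = prepare_lang(corpus, 'exp/lang', level='word')
assert ret == 0, 'prepare_lang.sh failed'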
Example #11
def main():
    # define and parse input arguments
    parser = argparse.ArgumentParser(
        description='Generate an ABX item file from the Buckeye corpus')
    parser.add_argument('item_file',
                        metavar='ITEM_FILE',
                        help='item file to be generated')
    parser.add_argument('-b',
                        '--buckeye-dir',
                        default=BUCKEYE_RAW,
                        help='path to the raw Buckeye corpus to prepare'
                        ', default is %(default)s')
    parser.add_argument(
        '-t',
        '--tmp-dir',
        default=tempfile.gettempdir(),
        help='temporary directory to use, default is %(default)s')
    args = parser.parse_args()

    # setup the log and tmpdir
    tmpdir = tempfile.mkdtemp(dir=args.tmp_dir)
    log = utils.logger.get_log(verbose=False, header_in_stdout=False)

    try:
        # import Buckeye in abkhazia format
        corpus = buckeye.BuckeyePreparator(args.buckeye_dir, log=log).prepare(
            os.path.join(tmpdir, 'wavs'), keep_short_utts=False)

        # remove undesired utterances (those whose text is '<IVER>').
        # A few of those utts cause the alignment step to fail... and
        # in any case they are useless. TODO find that bug!
        _len = len(corpus.utts())
        corpus = corpus.subcorpus(
            [u for u in corpus.utts() if corpus.text[u] != '<IVER>'])
        log.info('removed %s utterances containing only "<IVER>"',
                 _len - len(corpus.utts()))

        # get the manual phones alignment from the raw buckeye
        log.info('extracting manual alignments at phone level...')
        get_alignment = buckeye.GetAlignment(args.buckeye_dir)
        alignment = {}
        for utt in corpus.utts():
            record, tstart, tstop = corpus.segments[utt]
            alignment[utt] = get_alignment(record, tstart, tstop)

        # save the alignment
        alignment_file = os.path.join(tmpdir, 'alignment.txt')
        with open(alignment_file, 'w') as falign:
            falign.write('\n'.join(
                '{} {} {} {}'.format(utt, p[0], p[1], p[2])
                for utt, phones in alignment.iteritems()
                for p in phones))

        log.info('generating the item file...')
        alignment2item(corpus,
                       alignment_file,
                       args.item_file,
                       verbose=1,
                       njobs=1)

    finally:
        # cleanup temp directory
        utils.remove(tmpdir)
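
The script presumably ends with the usual entry-point guard (the script name in the comment is a placeholder):

if __name__ == '__main__':
    # e.g. python make_item_file.py buckeye.item -b /path/to/Buckeye
    main()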
Example #12
    def __del__(self):
        try:
            if self.delete_recipe:
                utils.remove(self.recipe_dir, safe=True)
        except AttributeError:  # if raised from __init__
            pass
Example #13
    def _format_lm(self, arpa_lm, fst_lm):
        """Converts ARPA-format language models to FSTs

        Change the LM vocabulary using SRILM. This is a Python
        implementation of Kaldi egs/wsj/s5/utils/format_lm_sri.sh,
        with minor modifications.

        Note: if you want to just convert ARPA LMs to FSTs, there is a
        simpler way to do this that doesn't require SRILM: see
        examples in Kaldi egs/wsj/s5/local/wsj_format_local_lms.sh

        """
        self.log.info('converting ARPA to FST')

        words_txt = os.path.join(self.output_dir, 'words.txt')
        for _file in (arpa_lm, words_txt):
            if not os.path.isfile(_file):
                raise IOError('expected input file {} to exist'.format(_file))

        lm_base = os.path.splitext(os.path.basename(arpa_lm))[0]
        tempdir = tempfile.mkdtemp()
        try:
            # unzip the input LM, removing all "illegal" combinations
            # of <s> and </s>, which are supposed to occur only at the
            # beginning/end of an utterance. These can cause
            # determinization failures of CLG (they end up as epsilon
            # cycles).
            lm_txt = os.path.join(tempdir, lm_base + '.txt')
            # self.log.debug('unzip %s to %s', arpa_lm, lm_txt)
            with utils.open_utf8(lm_txt, 'w') as fp:
                for line in gzip.open(arpa_lm, 'rb'):
                    if not (re.search('<s> <s>', line) or re.search(
                            '</s> <s>', line) or re.search('</s> </s>', line)):
                        fp.write(line.decode('utf-8'))

            # finds words in the arpa LM that are not symbols in the
            # OpenFst-format symbol table words.txt
            oovs = os.path.join(self.output_dir, 'oovs_{}.txt'.format(lm_base))
            self.log.debug('write OOVs to %s', oovs)
            utils.jobs.run('utils/find_arpa_oovs.pl {} {}'.format(
                words_txt, lm_txt),
                           stdout=utils.open_utf8(oovs, 'w').write,
                           env=kaldi_path(),
                           cwd=self.recipe_dir)

            # Change the LM vocabulary to be the intersection of the
            # current LM vocabulary and the set of words in the
            # pronunciation lexicon. This also renormalizes the LM by
            # recomputing the backoff weights, and removes those
            # ngrams whose probabilities are lower than the
            # backed-off estimates.
            lm_pruned = self._change_lm_vocab(lm_txt, words_txt)

            # convert from ARPA to FST
            self._run_command(
                'utils/run.pl {0} arpa2fst {1} | fstprint | '
                'utils/eps2disambig.pl | utils/s2eps.pl | '
                'fstcompile --isymbols={2} --osymbols={2} '
                '--keep_isymbols=false --keep_osymbols=false | '
                'fstrmepsilon | fstarcsort --sort_type=ilabel > {3}'.format(
                    os.path.join(self.output_dir, 'format_lm.log'), lm_pruned,
                    words_txt, fst_lm))

            # The output is like: 9.14233e-05 -0.259833. We do expect
            # the first of these 2 numbers to be close to zero (the
            # second is nonzero because the backoff weights make the
            # states sum to >1).
            try:
                self._run_command('fstisstochastic {}'.format(fst_lm))
            except RuntimeError:
                pass

        finally:
            utils.remove(tempdir, safe=True)
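
Finally, a sketch of calling this method, assuming a gzipped ARPA LM next to words.txt in the model's output directory (lm is a hypothetical LanguageModel instance):

# hypothetical usage: convert a gzipped ARPA LM into G.fst
lm._format_lm(
    os.path.join(lm.output_dir, 'lm.arpa.gz'),
    os.path.join(lm.output_dir, 'G.fst'))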