Example #1
def dict_to_ark(arkfile, data, format='text'):
    """Write a data dictionary to a Kaldi ark file

    TODO: for now, time information from the h5f file is lost in the ark

    Parameters:
    -----------

    arkfile (str): path to the ark file to write

    data (dict): dictionary of numpy arrays to write

    format (str): must be 'text' or 'binary' to write a text or a
        binary ark file respectively, default is 'text'

    Raise:
    ------

    RuntimeError if format is not 'text' or 'binary'

    """
    if format == 'text':
        _dict_to_txt_ark(arkfile, data)
    elif format == 'binary':
        with tempfile.NamedTemporaryFile(
                dir=utils.config.get('abkhazia', 'tmp-directory')) as tmp:
            _dict_to_txt_ark(tmp.name, data)

            utils.jobs.run('copy-feats ark,t:{0} ark:{1}'.format(
                tmp.name, arkfile),
                           env=kaldi_path(),
                           stdout=open(os.devnull, 'w').write)
    else:
        raise RuntimeError(
            'ark format must be "text" or "binary", it is "{}"'.format(format))
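A minimal usage sketch for dict_to_ark, assuming it is importable from the module above and that the abkhazia configuration provides a valid tmp-directory (needed for the binary case):

import numpy as np

# two utterances with 13-dimensional feature frames
data = {
    'utt-001': np.random.rand(120, 13),
    'utt-002': np.random.rand(95, 13),
}

# write a text ark (the default), then a binary one; the binary case
# calls the Kaldi copy-feats program through kaldi_path()
dict_to_ark('feats_text.ark', data, format='text')
dict_to_ark('feats_binary.ark', data, format='binary')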
Example #2
    def _run_command(self, command, verbose=True):
        """Run the command as a subprocess in a Kaldi environment"""
        if verbose:
            self.log.info('running %s', command)

        utils.jobs.run(command,
                       stdout=self.log.debug,
                       env=kaldi_path(),
                       cwd=self.recipe_dir)
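A hypothetical call site for this helper, assuming self is a recipe object exposing log, recipe_dir and this method:

# run a Kaldi program from the recipe directory; its stdout goes to
# the debug log, and verbose=False silences the 'running ...' message
self._run_command('fstisstochastic G.fst', verbose=False)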
Example #3
def _ark_to_dict_binary_bytext(arkfile):
    """Convert a binary ark to text, and load it as numpy arrays"""
    # create the temporary directory before entering the try block, so
    # the finally clause cannot hit an undefined name if mkdtemp fails
    tempdir = tempfile.mkdtemp(
        dir=utils.config.get('abkhazia', 'tmp-directory'))
    try:
        # copy-feats converts binary ark to text ark
        txtfile = os.path.join(tempdir, 'txt')
        utils.jobs.run('copy-feats ark:{0} ark,t:{1}'.format(arkfile, txtfile),
                       env=kaldi_path(),
                       stdout=open(os.devnull, 'w').write)

        # load the converted text ark as a dict
        return _ark_to_dict_text(txtfile)
    finally:
        utils.remove(tempdir, safe=True)
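A usage sketch, assuming a binary ark produced by Kaldi and copy-feats available through kaldi_path():

# load the binary ark as a dict of numpy arrays keyed by utterance id
feats = _ark_to_dict_binary_bytext('feats_binary.ark')
for utt, matrix in sorted(feats.items()):
    print(utt, matrix.shape)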
Example #4
    def _compute_lm(self, G_arpa):
        """Generate an ARPA n-gram from an abkhazia corpus

        This method relies on the following Kaldi programs:
        add-start-end.sh, build-lm.sh and compile-lm. It uses the
        IRSTLM library.

        """
        self.log.info('computing %s %s-gram in ARPA format', self.level,
                      self.order)

        # strip the utt-id from the first column of each line (the
        # shell equivalent is: cut -d' ' -f2- lm_text > text_ready), as
        # LM training expects raw text without utterance ids
        lm_text = os.path.join(self.a2k._local_path(), 'lm_text.txt')
        lm_lines = utils.open_utf8(lm_text, 'r').readlines()

        text_ready = os.path.join(self.a2k._local_path(), 'text_ready.txt')
        with utils.open_utf8(text_ready, 'w') as ready:
            ready.write('\n'.join(
                [' '.join(line.split()[1:]) for line in lm_lines]))

        text_se = os.path.join(self.a2k._local_path(), 'text_se.txt')
        utils.jobs.run('add-start-end.sh',
                       stdin=open(text_ready, 'r'),
                       stdout=open(text_se, 'w').write,
                       env=kaldi_path(),
                       cwd=self.recipe_dir)
        assert os.path.isfile(text_se), 'LM failed on add-start-end'

        # the -k option is the number of splits, useful for huge text
        # files; build-lm.sh lives in kaldi/tools/irstlm/bin
        text_lm = os.path.join(self.a2k._local_path(), 'text_lm.gz')
        self._run_command(
            'build-lm.sh -i {0} -n {1} -o {2} -k 1 -s kneser-ney'.format(
                text_se, self.order, text_lm))
        assert os.path.isfile(text_lm), 'LM failed on build-lm'

        text_blm = os.path.join(self.a2k._local_path(), 'text_blm.gz')
        self._run_command(
            # was with the -i option
            'compile-lm {} --text=yes {}'.format(text_lm, text_blm))

        # gzip the compiled lm (from
        # https://docs.python.org/2/library/gzip.html#examples-of-usage)
        with open(text_blm, 'rb') as fin, gzip.open(G_arpa, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
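To illustrate the text-preparation step at the top of this method: each line of lm_text starts with an utterance id, which must be stripped before LM training. A minimal sketch on hypothetical data:

lines = ['utt-001 hello world', 'utt-002 good morning']
print('\n'.join(' '.join(line.split()[1:]) for line in lines))
# hello world
# good morning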
Example #5
def prepare_lang(
        corpus,
        output_dir,
        level='word',
        silence_probability=0.5,
        position_dependent_phones=False,
        keep_tmp_dirs=False,
        log=logger.null_logger()):
    """Wrapper on the Kaldi wsj/utils/prepare_lang.sh script

    Create the directory `output_dir` and populate it as described in
    http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating. It
    produces (among other files) the L.fst part of the HCLG model.

    Parameters:
    -----------

    corpus (Corpus): abkhazia corpus to prepare lang for.

    output_dir (path): directory where to write prepared files,
      created if it does not exist.

    level ('word' or 'phone'): set to 'word' (the default) to prepare
      the corpus at word level, or 'phone' to prepare it at phone
      level. The prepared data will be used to train language and
      acoustic models at either word or phone level.

    silence_probability (float): the probability of a silence phone,
      usually 0.0 or 0.5; default is 0.5.

    position_dependent_phones (bool): default is False. Set it to True
      when the produced language model is to be used with an acoustic
      model trained with word position dependent variants of the
      phones, and to False otherwise.

    keep_tmp_dirs (bool): default is False. If True, keep the
      directories 'recipe' and 'local' in `output_dir`; if False,
      remove them before returning.

    log (logger.Logging): the logger instance to send messages to;
      the default disables logging.

    Return:
    -------

    The return code of the Kaldi prepare_lang script: 0 on success,
    any other value on error.

    """
    output_dir = os.path.abspath(output_dir)
    log.info('preparing lexicon in %s (L.fst)...', output_dir)

    # init the kaldi recipe in output_dir/recipe
    a2k = Abkhazia2Kaldi(
        corpus, os.path.join(output_dir, 'recipe'), name='dict', log=log)

    a2k.setup_phones()
    a2k.setup_silences()
    a2k.setup_variants()
    a2k.setup_kaldi_folders()
    a2k.setup_machine_specific_scripts()

    if level == 'word':
        a2k.setup_lexicon()
    else:
        a2k.setup_phone_lexicon()

    # choosing the script according to level and word position
    # dependent phones. If word_position_dependent is true and the lm
    # is at the phone level, use prepare_lang_wpdpl.sh in the local
    # folder, otherwise we fall back to the original prepare_lang.sh
    # (some slight customizations of the script are necessary to
    # decode with a phone loop language model when word position
    # dependent phone variants have been trained).
    script_prepare_lm = os.path.join(
        a2k.kaldi_root, 'egs', 'wsj', 's5', 'utils', 'prepare_lang.sh')

    script_prepare_lm_wpdpl = os.path.join(
        a2k.share_dir, 'prepare_lang_wpdpl.sh')

    script = (script_prepare_lm_wpdpl
              if level == 'phone' and position_dependent_phones
              else script_prepare_lm)

    # generate the bash command we will run
    command = (
        script + ' --position-dependent-phones {wpd}'
        ' --sil-prob {sil} {input} "<unk>" {temp} {output}'.format(
            wpd=bool2str(position_dependent_phones),
            sil=silence_probability,
            input=a2k._local_path(),
            temp=os.path.join(output_dir, 'local'),
            output=output_dir))

    # run the command in Kaldi and forward its return code
    log.info('running "%s"', command)
    try:
        return jobs.run(
            command, cwd=a2k.recipe_dir, env=kaldi_path(), stdout=log.debug)
    finally:
        if not keep_tmp_dirs:
            remove(a2k.recipe_dir)
            remove(os.path.join(output_dir, 'local'))
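A minimal usage sketch, assuming an abkhazia Corpus instance obtained elsewhere:

# prepare the lexicon at word level; a non-zero return code means the
# underlying Kaldi prepare_lang.sh script failed
corpus = ...  # an abkhazia Corpus instance, loaded beforehand
ret = prepare_lang(corpus, 'lang', level='word',
                   silence_probability=0.5,
                   position_dependent_phones=False)
assert ret == 0, 'prepare_lang.sh failed'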
Example #6
    def _format_lm(self, arpa_lm, fst_lm):
        """Converts ARPA-format language models to FSTs

        Change the LM vocabulary using SRILM. This is a Python
        implementation of Kaldi egs/wsj/s5/utils/format_lm_sri.sh,
        with minor modifications.

        Note: if you want to just convert ARPA LMs to FSTs, there is a
        simpler way to do this that doesn't require SRILM: see
        examples in Kaldi egs/wsj/s5/local/wsj_format_local_lms.sh

        """
        self.log.info('converting ARPA to FST')

        words_txt = os.path.join(self.output_dir, 'words.txt')
        for _file in (arpa_lm, words_txt):
            if not os.path.isfile(_file):
                raise IOError('expected input file {} to exist'.format(_file))

        lm_base = os.path.splitext(os.path.basename(arpa_lm))[0]
        tempdir = tempfile.mkdtemp()
        try:
            # unzip the input LM, removing all "illegal" combinations of
            # <s> and </s>, which are supposed to occur only at the
            # begin/end of utterances. These can cause determinization
            # failures of CLG (they end up being epsilon cycles).
            lm_txt = os.path.join(tempdir, lm_base + '.txt')
            # self.log.debug('unzip %s to %s', arpa_lm, lm_txt)
            with utils.open_utf8(lm_txt, 'w') as fp:
                for line in gzip.open(arpa_lm, 'rb'):
                    # decode the gzipped bytes before matching, so the
                    # str patterns below work on Python 3 as well
                    line = line.decode('utf-8')
                    if not (re.search('<s> <s>', line) or re.search(
                            '</s> <s>', line) or re.search('</s> </s>', line)):
                        fp.write(line)

            # finds words in the arpa LM that are not symbols in the
            # OpenFst-format symbol table words.txt
            oovs = os.path.join(self.output_dir, 'oovs_{}.txt'.format(lm_base))
            self.log.debug('write OOVs to %s', oovs)
            utils.jobs.run('utils/find_arpa_oovs.pl {} {}'.format(
                words_txt, lm_txt),
                           stdout=utils.open_utf8(oovs, 'w').write,
                           env=kaldi_path(),
                           cwd=self.recipe_dir)

            # Change the LM vocabulary to be the intersection of the
            # current LM vocabulary and the set of words in the
            # pronunciation lexicon. This also renormalizes the LM by
            # recomputing the backoff weights, and remove those ngrams
            # whose probabilities are lower than the backed-off
            # estimates.
            lm_pruned = self._change_lm_vocab(lm_txt, words_txt)

            # convert from ARPA to FST
            self._run_command(
                'utils/run.pl {0} arpa2fst {1} | fstprint | '
                'utils/eps2disambig.pl | utils/s2eps.pl | '
                'fstcompile --isymbols={2} --osymbols={2} '
                '--keep_isymbols=false --keep_osymbols=false | '
                'fstrmepsilon | fstarcsort --sort_type=ilabel > {3}'.format(
                    os.path.join(self.output_dir, 'format_lm.log'), lm_pruned,
                    words_txt, fst_lm))

            # The output is like: 9.14233e-05 -0.259833. We do expect
            # the first of these 2 numbers to be close to zero (the
            # second is nonzero because the backoff weights make the
            # states sum to >1).
            try:
                self._run_command('fstisstochastic {}'.format(fst_lm))
            except RuntimeError:
                pass

        finally:
            utils.remove(tempdir, safe=True)
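A hypothetical call site, assuming self.output_dir already contains words.txt and a gzipped ARPA model such as the one produced by _compute_lm (the file names below are illustrative):

# convert the gzipped ARPA model to the G.fst used in the HCLG graph
arpa_lm = os.path.join(self.output_dir, 'G.arpa.gz')
fst_lm = os.path.join(self.output_dir, 'G.fst')
self._format_lm(arpa_lm, fst_lm)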