Python make_bigram_language_modelの例、gentle.language_model.make_bigram_language_model Pythonの例

コード例 #1

0

ファイルを表示

    def realign(chunk):
        """Re-run recognition on one unaligned chunk and record the result.

        ``chunk`` is a mapping with ``"start"`` and ``"end"`` entries (each
        either ``None`` or an object exposing ``.start``/``.end`` times) and
        a ``"words"`` list whose items expose ``startOffset`` and
        ``endOffset``.

        On success a ``{"chunk": ..., "words": ...}`` record is appended to
        the enclosing ``realignments`` list and ``progress_cb`` (when not
        ``None``) is called with the completed fraction.  Chunks whose
        duration is below 0.01 or above 60 are logged and skipped.

        Returns ``None`` in every case.
        """
        # Work out the time window while the reader is open, and always
        # release the file handle afterwards (it used to be leaked).
        wav_obj = wave.open(wavfile, 'r')
        try:
            if chunk["start"] is None:
                start_t = 0
            else:
                start_t = chunk["start"].end

            if chunk["end"] is None:
                # No right-hand anchor: the window runs to the end of audio.
                end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
            else:
                end_t = chunk["end"].start
        finally:
            wav_obj.close()

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk['words']), duration))
            return

        # Create a language model restricted to this chunk's transcript text
        offset_offset = chunk['words'][0].startOffset
        chunk_len = chunk['words'][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 chunk_gen_hclg_filename,
                                 resources.proto_langdir)

        # Read only the frames inside the window; close the reader even if
        # seeking or reading fails.
        wav_obj = wave.open(wavfile, 'r')
        try:
            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
            buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        finally:
            wav_obj.close()

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        # Adjust startOffset, endOffset, and timing to match originals:
        # the alignment is relative to the chunk, not the whole recording.
        for wd in word_alignment:
            if wd.end is not None:
                # Apply timing offset
                wd.start += start_t
                wd.end += start_t

            if wd.endOffset is not None:
                wd.startOffset += offset_offset
                wd.endOffset += offset_offset

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb(
                {"percent": len(realignments) / float(len(to_realign))})

コード例 #2

0

ファイルを表示

ファイル: forced_aligner.py プロジェクト: wan/gentle

 def __init__(self, resources, transcript, nthreads=4, **kwargs):
     """Prepare a transcriber backed by a transcript-specific language model.

     The arguments are stored on the instance, ``transcript`` is wrapped in
     a ``MetaSentence`` (using ``resources.vocab``), and its Kaldi sequence
     is turned into a bigram language model; ``kwargs`` are forwarded to
     ``make_bigram_language_model``.  The resulting HCLG path is handed to
     ``kaldi_queue.build`` and the queue is wrapped in a
     ``MultiThreadedTranscriber``.
     """
     self.kwargs = kwargs
     self.nthreads = nthreads
     self.transcript = transcript
     self.resources = resources

     self.ms = metasentence.MetaSentence(transcript, resources.vocab)
     kaldi_sequence = self.ms.get_kaldi_sequence()
     hclg_path = language_model.make_bigram_language_model(
         kaldi_sequence,
         resources.proto_langdir,
         **kwargs
     )

     self.queue = kaldi_queue.build(
         resources,
         hclg_path=hclg_path,
         nthreads=nthreads,
     )
     self.mtt = MultiThreadedTranscriber(self.queue, nthreads=nthreads)

コード例 #3

0

ファイルを表示

ファイル: forced_aligner.py プロジェクト: bit/gentle

 def __init__(self, resources, transcript, nthreads=4, **kwargs):
     """Set up forced alignment for ``transcript``.

     Keeps ``resources``, ``transcript``, ``nthreads`` and ``kwargs`` on the
     instance, builds ``self.ms`` from the transcript and
     ``resources.vocab``, generates a bigram language model from the
     sentence's Kaldi sequence (passing ``kwargs`` through), then creates
     ``self.queue`` via ``kaldi_queue.build`` and ``self.mtt`` as a
     ``MultiThreadedTranscriber`` over that queue.
     """
     self.kwargs, self.nthreads = kwargs, nthreads
     self.transcript, self.resources = transcript, resources

     sentence = metasentence.MetaSentence(transcript, resources.vocab)
     self.ms = sentence

     model_file = language_model.make_bigram_language_model(
         sentence.get_kaldi_sequence(), resources.proto_langdir, **kwargs)

     worker_queue = kaldi_queue.build(
         resources, hclg_path=model_file, nthreads=nthreads)
     self.queue = worker_queue
     self.mtt = MultiThreadedTranscriber(worker_queue, nthreads=nthreads)

コード例 #4

0

ファイルを表示

ファイル: multipass.py プロジェクト: veravox/gentle

    def realign_sub(chunk):
        """Realign a single chunk of unaligned words against its audio span.

        The span starts at ``chunk["start"].end`` (or 0 when there is no
        start anchor) and stops at ``chunk["end"].start`` (or the end of the
        wave file when there is no end anchor).  Spans shorter than 0.75 or
        longer than 60 are logged and skipped.  Otherwise a chunk-specific
        bigram language model is built, the audio span is decoded with Kaldi,
        the result is aligned against the chunk transcript, shifted back to
        whole-recording coordinates and appended to ``realignments``;
        ``progress_cb`` is then notified when it is not ``None``.
        """
        with wave.open(wavfile, "rb") as reader:
            start_t = 0 if chunk["start"] is None else chunk["start"].end

            if chunk["end"] is None:
                end_t = reader.getnframes() / float(reader.getframerate())
            else:
                end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        too_short = duration < 0.75
        too_long = duration > 60
        if too_short or too_long:
            logging.debug(
                "cannot realign %d words with duration %f",
                len(chunk["words"]),
                duration,
            )
            return

        # Create a language model covering only this chunk's text.
        offset_offset = chunk["words"][0].startOffset
        chunk_len = chunk["words"][-1].endOffset - offset_offset
        raw_slice = ms.raw_sentence[offset_offset:offset_offset + chunk_len]
        chunk_transcript = raw_slice.encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)

        hclg_path = language_model.make_bigram_language_model(
            chunk_ms.get_kaldi_sequence(), resources.proto_langdir)
        decoder = standard_kaldi.Kaldi(
            resources.nnet_gpu_path,
            hclg_path,
            resources.proto_langdir,
        )

        with wave.open(wavfile, "rb") as reader:
            first_frame = int(start_t * reader.getframerate())
            reader.setpos(first_frame)
            frame_count = int(duration * reader.getframerate())
            buf = reader.readframes(frame_count)

        decoder.push_chunk(buf)
        hypothesis = [transcription.Word(**wd) for wd in decoder.get_final()]
        decoder.stop()

        word_alignment = diff_align.align(hypothesis, chunk_ms)

        # Move chunk-relative times and offsets back to recording coordinates.
        for aligned_word in word_alignment:
            aligned_word.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        record = {"chunk": chunk, "words": word_alignment}
        realignments.append(record)

        if progress_cb is not None:
            fraction = len(realignments) / float(len(to_realign))
            progress_cb({"percent": fraction})

コード例 #5

0

ファイルを表示

ファイル: multipass.py プロジェクト: EnGassa/gentle

    def realign(chunk):
        """Re-run recognition on one unaligned chunk (dict-based words).

        ``chunk`` is a mapping with ``"start"`` and ``"end"`` entries (each
        either ``None`` or a dict with ``"start"``/``"end"`` times) and a
        ``"words"`` list of dicts carrying ``"startOffset"`` and
        ``"endOffset"``.

        On success a ``{"chunk": ..., "words": ...}`` record is appended to
        the enclosing ``realignments`` list and ``progress_cb`` (when not
        ``None``) is called with the completed fraction.  Chunks whose
        duration is below 0.01 or above 60 are logged and skipped.

        Returns ``None`` in every case.
        """
        # Determine the time window, closing the reader afterwards (the
        # handle used to be leaked).
        wav_obj = wave.open(wavfile, 'r')
        try:
            start_t = (chunk["start"] or {"end": 0})["end"]
            end_t = chunk["end"]
            if end_t is None:
                # No right-hand anchor: the window runs to the end of audio.
                end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
            else:
                end_t = end_t["start"]
        finally:
            wav_obj.close()

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" % (len(chunk['words']), duration))
            return

        # Create a language model restricted to this chunk's transcript text
        offset_offset = chunk['words'][0]['startOffset']
        chunk_len = chunk['words'][-1]['endOffset'] - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset+chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(chunk_ks, proto_langdir)
        k = standard_kaldi.Kaldi(
            get_resource('data/nnet_a_gpu_online'),
            chunk_gen_hclg_filename,
            proto_langdir)

        # Read only the frames inside the window; close the reader even if
        # seeking or reading fails.
        wav_obj = wave.open(wavfile, 'r')
        try:
            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
            buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        finally:
            wav_obj.close()

        k.push_chunk(buf)
        ret = k.get_final()
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        # Adjust startOffset, endOffset, and timing to match originals.
        # Test against None rather than truthiness so that a legitimate
        # value of 0 is still shifted.
        for wd in word_alignment:
            if wd.get("end") is not None:
                # Apply timing offset
                wd['start'] += start_t
                wd['end'] += start_t

            if wd.get("endOffset") is not None:
                wd['startOffset'] += offset_offset
                wd['endOffset'] += offset_offset

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb({"percent": len(realignments) / float(len(to_realign))})

コード例 #6

0

ファイルを表示

ファイル: serve.py プロジェクト: lowerquality/earmark

    def create_language_model(self):
        """Build a bigram language model and return the path to its HCLG file.

        The model is generated from ``self._command_seqs.values()`` with
        ``conservative=True``.  If the instance already has a
        ``gen_hclg_filename`` attribute, the freshly generated file is moved
        over that path and that path is returned; otherwise the path of the
        new file is returned as-is.  This method does not itself assign
        ``self.gen_hclg_filename``.
        """
        fresh_path = language_model.make_bigram_language_model(
            self._command_seqs.values(),
            proto_langdir,
            conservative=True,
        )

        if not hasattr(self, 'gen_hclg_filename'):
            return fresh_path

        # Overwrite the previously generated file in place.
        shutil.move(fresh_path, self.gen_hclg_filename)
        return self.gen_hclg_filename

コード例 #7

0

ファイルを表示

ファイル: serve.py プロジェクト: afcarl/earmark

    def create_language_model(self):
        """Generate the command language model; return its HCLG file path.

        A bigram model is built from the values of ``self._command_seqs``
        (``conservative=True``).  When ``self.gen_hclg_filename`` exists, the
        new file replaces the file at that path via ``shutil.move`` and the
        existing path is returned; otherwise the new file's path is returned.
        The attribute itself is never assigned here.
        """
        sequences = self._command_seqs.values()
        model_path = language_model.make_bigram_language_model(
            sequences, proto_langdir, conservative=True)

        has_previous = hasattr(self, 'gen_hclg_filename')
        if has_previous:
            # Overwrite the old generated file and keep using its path.
            shutil.move(model_path, self.gen_hclg_filename)
            model_path = self.gen_hclg_filename

        return model_path

コード例 #8

0

ファイルを表示

ファイル: multipass.py プロジェクト: bit/gentle

    def realign(chunk):
        """Re-run recognition on one unaligned chunk and record the result.

        ``chunk`` is a mapping with ``"start"`` and ``"end"`` entries (each
        either ``None`` or an object exposing ``.start``/``.end`` times) and
        a ``"words"`` list whose items expose ``startOffset`` and
        ``endOffset``.

        On success a ``{"chunk": ..., "words": ...}`` record is appended to
        the enclosing ``realignments`` list and ``progress_cb`` (when not
        ``None``) is called with the completed fraction.  Chunks whose
        duration is below 0.75 or above 60 are logged and skipped.

        Returns ``None`` in every case.
        """
        # Work out the time window while the reader is open, and always
        # release the file handle afterwards (it used to be leaked).
        wav_obj = wave.open(wavfile, 'rb')
        try:
            if chunk["start"] is None:
                start_t = 0
            else:
                start_t = chunk["start"].end

            if chunk["end"] is None:
                # No right-hand anchor: the window runs to the end of audio.
                end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
            else:
                end_t = chunk["end"].start
        finally:
            wav_obj.close()

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" % (len(chunk['words']), duration))
            return

        # Create a language model restricted to this chunk's transcript text
        offset_offset = chunk['words'][0].startOffset
        chunk_len = chunk['words'][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset+chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(
            resources.nnet_gpu_path,
            chunk_gen_hclg_filename,
            resources.proto_langdir)

        # Read only the frames inside the window; close the reader even if
        # seeking or reading fails.
        wav_obj = wave.open(wavfile, 'rb')
        try:
            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
            buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        finally:
            wav_obj.close()

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        # The alignment is relative to the chunk; shift it back so times and
        # offsets refer to the whole recording / transcript.
        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb({"percent": len(realignments) / float(len(to_realign))})

コード例 #9

0

ファイルを表示

ファイル: multipass.py プロジェクト: gibiansky/gentle

    def realign(chunk):
        nonlocal ignored

        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = final_end_t
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk["words"]), duration))
            ignored += 1
            return

        # Create a language model
        offset_offset = chunk["words"][0].startOffset
        chunk_len = chunk["words"][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        realign_transcript = chunk_transcript.decode("utf-8").replace(
            "\n", " ")
        logging.debug("realign transcript: %s", realign_transcript)
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        wav_obj = wave.open(wavfile, "rb")
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        wav_obj.close()

        retries = 5
        while retries > 0:
            try:
                k = standard_kaldi.Kaldi(
                    resources.nnet_gpu_path,
                    chunk_gen_hclg_filename,
                    resources.proto_langdir,
                )
                k.push_chunk(buf)
                ret = [transcription.Word(**wd) for wd in k.get_final()]
                k.stop()
                break
            except BrokenPipeError:
                retries -= 1
                if retries == 0:
                    raise

        word_alignment = diff_align.align(ret, chunk_ms)

        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb({
                "percent":
                (ignored + len(realignments)) / float(len(to_realign))
            })

コード例 #10

0

ファイルを表示

    def transcribe(self, uid, transcript, audio, async):

        proto_langdir = get_resource('PROTO_LANGDIR')
        
        status = self.get_status(uid)

        status['status'] = 'STARTED'
        output = {
            'transcript': transcript
        }

        outdir = os.path.join(self.data_dir, 'transcriptions', uid)                

        tran_path = os.path.join(outdir, 'transcript.txt')
        with codecs.open(tran_path, 'w', 'utf-8') as tranfile:
            tranfile.write(transcript)
        audio_path = os.path.join(outdir, 'upload')
        with open(audio_path, 'w') as wavfile:
            wavfile.write(audio)

        status['status'] = 'ENCODING'

        wavfile = os.path.join(outdir, 'a.wav')
        if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0:
            status['status'] = 'ERROR'
            status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        #XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        wav_obj = wave.open(wavfile, 'r')
        status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate())
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            for k,v in p.items():
                status[k] = v

        if len(transcript.strip()) > 0:
            ms = metasentence.MetaSentence(transcript, self.vocab)
            ks = ms.get_kaldi_sequence()
            gen_hclg_filename = language_model.make_bigram_language_model(ks, proto_langdir)

            kaldi_queue = Queue()
            for i in range(self.nthreads):
                kaldi_queue.put(standard_kaldi.Kaldi(
                    get_resource('data/nnet_a_gpu_online'),
                    gen_hclg_filename,
                    proto_langdir)
                )

            mtt = MultiThreadedTranscriber(kaldi_queue, nthreads=self.nthreads)
        elif hasattr(self, 'full_transcriber'):
            mtt = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status['error']  = 'No transcript provided and no language model for full transcription'
            return

        words = mtt.transcribe(wavfile, progress_cb=on_progress)

        output = {}
        if len(transcript.strip()) > 0:
            # Clear queue (would this be gc'ed?)
            for i in range(self.nthreads):
                k = kaldi_queue.get()
                k.stop()

            # Align words
            output['words'] = diff_align.align(words, ms)
            output['transcript'] = transcript

            # Perform a second-pass with unaligned words
            logging.info("%d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words'])))

            status['status'] = 'ALIGNING'

            output['words'] = multipass.realign(wavfile, output['words'], ms, nthreads=self.nthreads, progress_cb=on_progress)

            logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words'])))
            
        else:
            # Match format
            output = make_transcription_alignment({"words": words})

        # ...remove the original upload
        os.unlink(os.path.join(outdir, 'upload'))

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            json.dump(output, jsfile, indent=2)
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(to_csv(output))

        # Inline the alignment into the index.html file.
        htmltxt = open(get_resource('www/view_alignment.html')).read()
        htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output)));
        open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')

        return output

コード例 #11

0

ファイルを表示

ファイル: generateLM.py プロジェクト: shreya2111/gentle-labs

    # Generating HCLG.fst (using Gentle here)
    # Read the source text, closing the file as soon as it has been read.
    with open(text) as text_file:
        txt_in = text_file.read()

    # NOTE(review): assumes load_vocabulary reads the file fully before
    # returning (vocab_in is printed and probed repeatedly below, so it has
    # to be a materialized container) -- confirm against its definition.
    with open(proto_dir + "/tdnn_7b_chain_online/graph_pp/words.txt") as words_file:
        vocab_in = ms.load_vocabulary(words_file)

    print("My Vocab", vocab_in)

    # The first space-separated token of the text is skipped.
    source_words_list = txt_in.split(" ")[1:]

    # We must supply a version of `words_in` that only has words within our vocabulary (ie. proto_langdir/words.txt)
    # Out-of-vocabulary words are replaced by the language model's OOV term.
    new_wdlist = [
        wd if wd in vocab_in else lm.OOV_TERM
        for wd in source_words_list
    ]

    print("Supplying these words", new_wdlist)

    HCLGFile = lm.make_bigram_language_model([new_wdlist], proto_dir)
    print(HCLGFile)

    # saving HCLG.fst in proto_langdir
    # renaming temp_HCLG.fst to HCLG.fst
    # storing it in proto_langdir (argv[2])
    shutil.move(HCLGFile, proto_dir + "/tdnn_7b_chain_online/graph_pp/HCLG.fst")