Example 1
    def transcribe(self, wavfile, progress_cb=None):
        wav_obj = wave.open(wavfile, 'r')
        duration = wav_obj.getnframes() / float(wav_obj.getframerate())
        n_chunks = int(
            math.ceil(duration / float(self.chunk_len - self.overlap_t)))

        chunks = []

        def transcribe_chunk(idx):
            wav_obj = wave.open(wavfile, 'r')
            start_t = idx * (self.chunk_len - self.overlap_t)
            # Seek
            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
            # Read frames
            buf = wav_obj.readframes(
                int(self.chunk_len * wav_obj.getframerate()))

            k = self.kaldi_queue.get()
            k.push_chunk(buf)
            ret = k.get_final()
            k.reset()
            self.kaldi_queue.put(k)

            chunks.append({"start": start_t, "words": ret})
            logging.info('%d/%d' % (len(chunks), n_chunks))
            if progress_cb is not None:
                progress_cb({
                    "message": ' '.join([X['word'] for X in ret]),
                    "percent": len(chunks) / float(n_chunks)
                })

        pool = Pool(min(n_chunks, self.nthreads))
        pool.map(transcribe_chunk, range(n_chunks))
        pool.close()

        chunks.sort(key=lambda x: x['start'])

        # Combine chunks
        words = []
        for c in chunks:
            chunk_start = c['start']
            for wd in c['words']:
                wd['start'] += chunk_start
                words.append(transcription.Word(**wd))

        # Remove overlap:  Sort by time, then filter out any Word entries in
        # the list that are adjacent to another entry corresponding to the same
        # word in the audio.
        words.sort(key=lambda word: word.start)
        words.append(transcription.Word(word="__dummy__"))
        words = [
            words[i] for i in range(len(words) - 1)
            if not words[i].corresponds(words[i + 1])
        ]

        return words
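
The method above cuts the audio into fixed-length, overlapping chunks and merges the per-chunk results. A minimal standalone sketch of the same chunking arithmetic follows; the values of chunk_len, overlap_t, and duration are assumptions for illustration, not taken from the snippet:

    import math

    chunk_len = 20.0   # seconds per chunk (assumed)
    overlap_t = 2.0    # seconds of overlap between consecutive chunks (assumed)
    duration = 65.0    # total audio length in seconds (assumed)

    step = chunk_len - overlap_t
    n_chunks = int(math.ceil(duration / step))  # same formula as the snippet

    for idx in range(n_chunks):
        start_t = idx * step
        end_t = min(start_t + chunk_len, duration)
        print("chunk %d: %5.1f - %5.1f s" % (idx, start_t, end_t))

With these numbers the last chunk starts at 54 s and still reaches the 65 s mark, which is why the overlap-then-deduplicate step at the end of the method is needed.
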
Example 2
    def realign(chunk):
        wav_obj = wave.open(wavfile, 'r')

        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk['words']), duration))
            return

        # Create a language model
        offset_offset = chunk['words'][0].startOffset
        chunk_len = chunk['words'][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 chunk_gen_hclg_filename,
                                 resources.proto_langdir)

        wav_obj = wave.open(wavfile, 'r')
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        # Adjust startOffset, endOffset, and timing to match originals
        for wd in word_alignment:
            if wd.end is not None:
                # Apply timing offset
                wd.start += start_t
                wd.end += start_t

            if wd.endOffset is not None:
                wd.startOffset += offset_offset
                wd.endOffset += offset_offset

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb(
                {"percent": len(realignments) / float(len(to_realign))})
Example 3
    def realign_sub(chunk):
        with wave.open(wavfile, "rb") as wav_obj:
            if chunk["start"] is None:
                start_t = 0
            else:
                start_t = chunk["start"].end

            if chunk["end"] is None:
                end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
            else:
                end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug(
                "cannot realign %d words with duration %f",
                len(chunk["words"]),
                duration,
            )
            return

        # Create a language model
        offset_offset = chunk["words"][0].startOffset
        chunk_len = chunk["words"][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 chunk_gen_hclg_filename,
                                 resources.proto_langdir)

        with wave.open(wavfile, "rb") as wav_obj:
            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
            buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb(
                {"percent": len(realignments) / float(len(to_realign))})
Example 4
    def transcribe(self, wavfile, progress_cb=None):
        wav_obj = wave.open(wavfile, 'r')
        duration = wav_obj.getnframes() / float(wav_obj.getframerate())
        n_chunks = int(
            math.ceil(duration / float(self.chunk_len - self.overlap_t)))

        chunks = []

        def transcribe_chunk(idx):
            wav_obj = wave.open(wavfile, 'r')
            start_t = idx * (self.chunk_len - self.overlap_t)
            # Seek
            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
            # Read frames
            buf = wav_obj.readframes(
                int(self.chunk_len * wav_obj.getframerate()))

            k = self.kaldi_queue.get()
            k.push_chunk(buf)
            ret = k.get_final()
            k.reset()
            self.kaldi_queue.put(k)

            chunks.append({"start": start_t, "words": ret})
            logging.info('%d/%d' % (len(chunks), n_chunks))
            if progress_cb is not None:
                progress_cb({
                    "message": ' '.join([X['word'] for X in ret]),
                    "percent": len(chunks) / float(n_chunks)
                })

        pool = Pool(min(n_chunks, self.nthreads))
        pool.map(transcribe_chunk, range(n_chunks))
        pool.close()

        chunks.sort(key=lambda x: x['start'])

        # Combine chunks
        words = []
        for c in chunks:
            chunk_start = c['start']
            chunk_end = chunk_start + self.chunk_len

            chunk_words = [
                transcription.Word(**wd).shift(time=chunk_start)
                for wd in c['words']
            ]

            # At chunk boundary cut points the audio often contains part of a
            # word, which can get erroneously identified as one or more different
            # in-vocabulary words.  So discard one or more words near the cut points
            # (they'll be covered by the overlap anyway).
            #
            trim = min(0.25 * self.overlap_t, 0.5)
            if c is not chunks[0]:
                while len(chunk_words) > 1:
                    chunk_words.pop(0)
                    if chunk_words[0].end > chunk_start + trim:
                        break
            if c is not chunks[-1]:
                while len(chunk_words) > 1:
                    chunk_words.pop()
                    if chunk_words[-1].start < chunk_end - trim:
                        break

            words.extend(chunk_words)

        # Remove overlap:  Sort by time, then filter out any Word entries in
        # the list that are adjacent to another entry corresponding to the same
        # word in the audio.
        words.sort(key=lambda word: word.start)
        words.append(transcription.Word(word="__dummy__"))
        words = [
            words[i] for i in range(len(words) - 1)
            if not words[i].corresponds(words[i + 1])
        ]

        return words
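
The boundary trimming above drops at least one word at each cut point of an interior chunk, since a word straddling the cut is often misrecognized. A standalone sketch of the same idea on made-up word timings; chunk_start, chunk_len, overlap_t, and the (start, end) pairs are assumptions:

    overlap_t = 2.0
    chunk_start = 18.0
    chunk_len = 20.0
    chunk_end = chunk_start + chunk_len

    trim = min(0.25 * overlap_t, 0.5)   # 0.5 s, same formula as the snippet

    # (start, end) pairs for words in one chunk, already shifted to absolute time.
    chunk_words = [(18.0, 18.2), (18.3, 18.6), (18.7, 19.1),
                   (37.6, 37.9), (37.95, 38.0)]

    # Drop at least one word at the leading cut point (non-first chunk) ...
    while len(chunk_words) > 1:
        chunk_words.pop(0)
        if chunk_words[0][1] > chunk_start + trim:
            break

    # ... and at least one word at the trailing cut point (non-last chunk).
    while len(chunk_words) > 1:
        chunk_words.pop()
        if chunk_words[-1][0] < chunk_end - trim:
            break

    print(chunk_words)  # only words safely inside the overlap-covered region remain
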
Example 5
    def realign(chunk):
        nonlocal ignored

        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = final_end_t
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk["words"]), duration))
            ignored += 1
            return

        # Create a language model
        offset_offset = chunk["words"][0].startOffset
        chunk_len = chunk["words"][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        realign_transcript = chunk_transcript.decode("utf-8").replace(
            "\n", " ")
        logging.debug("realign transcript: %s", realign_transcript)
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        wav_obj = wave.open(wavfile, "rb")
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        wav_obj.close()

        retries = 5
        while retries > 0:
            try:
                k = standard_kaldi.Kaldi(
                    resources.nnet_gpu_path,
                    chunk_gen_hclg_filename,
                    resources.proto_langdir,
                )
                k.push_chunk(buf)
                ret = [transcription.Word(**wd) for wd in k.get_final()]
                k.stop()
                break
            except BrokenPipeError:
                retries -= 1
                if retries == 0:
                    raise

        word_alignment = diff_align.align(ret, chunk_ms)

        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb({
                "percent":
                (ignored + len(realignments)) / float(len(to_realign))
            })