Example No. 1
    def transcribe(self, wavfile, progress_cb=None, logging=None):
        words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb)

        # Clear queue (would this be gc'ed?)
        for i in range(self.nthreads):
            k = self.queue.get()
            k.stop()

        # Align words
        words = diff_align.align(words, self.ms, **self.kwargs)

        # Perform a second-pass with unaligned words
        if logging is not None:
            logging.info("%d unaligned words (of %d)" % (len([X for X in words if X.not_found_in_audio()]), len(words)))

        if progress_cb is not None:
            progress_cb({'status': 'ALIGNING'})

        words = multipass.realign(wavfile, words, self.ms, resources=self.resources, nthreads=self.nthreads, progress_cb=progress_cb)

        if logging is not None:
            logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in words if X.not_found_in_audio()]), len(words)))

        words = AdjacencyOptimizer(words, duration).optimize()

        return Transcription(words=words, transcript=self.transcript)
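
The transcribe method above drives a two-pass flow: rough transcription, diff-based alignment, then a realignment pass over the words not found in the audio. Progress is reported through an optional callback that receives partial status dicts. A minimal caller sketch, assuming only the callback contract visible above (the transcriber construction itself is hypothetical):

    import logging

    logging.basicConfig(level=logging.INFO)

    def progress_cb(p):
        # Receives partial updates such as {'status': 'ALIGNING'} or
        # {'percent': 0.42}; log them or merge them into tracked state.
        if 'status' in p:
            logging.info("status: %s", p['status'])
        if 'percent' in p:
            logging.info("progress: %.0f%%", p['percent'] * 100)

    # Hypothetical call; building the transcriber is out of scope here:
    # result = transcriber.transcribe("a.wav", progress_cb=progress_cb, logging=logging)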
Example No. 2
    def realign(chunk):
        wav_obj = wave.open(wavfile, 'r')

        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk['words']), duration))
            return

        # Create a language model
        offset_offset = chunk['words'][0].startOffset
        chunk_len = chunk['words'][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 chunk_gen_hclg_filename,
                                 resources.proto_langdir)

        wav_obj = wave.open(wavfile, 'r')
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        wav_obj.close()

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        # Adjust startOffset, endOffset, and timing to match originals
        for wd in word_alignment:
            if wd.end is not None:
                # Apply timing offset
                wd.start += start_t
                wd.end += start_t

            if wd.endOffset is not None:
                wd.startOffset += offset_offset
                wd.endOffset += offset_offset

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb(
                {"percent": len(realignments) / float(len(to_realign))})
Example No. 3
    def realign_sub(chunk):
        with wave.open(wavfile, "rb") as wav_obj:
            if chunk["start"] is None:
                start_t = 0
            else:
                start_t = chunk["start"].end

            if chunk["end"] is None:
                end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
            else:
                end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug(
                "cannot realign %d words with duration %f",
                len(chunk["words"]),
                duration,
            )
            return

        # Create a language model
        offset_offset = chunk["words"][0].startOffset
        chunk_len = chunk["words"][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 chunk_gen_hclg_filename,
                                 resources.proto_langdir)

        with wave.open(wavfile, "rb") as wav_obj:
            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
            buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb(
                {"percent": len(realignments) / float(len(to_realign))})
Example No. 4
    def realign(chunk):
        wav_obj = wave.open(wavfile, 'r')

        start_t = (chunk["start"] or {"end": 0})["end"]
        end_t = chunk["end"]
        if end_t is None:
            end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
        else:
            end_t = end_t["start"]
        wav_obj.close()

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" % (len(chunk['words']), duration))
            return

        # Create a language model
        offset_offset = chunk['words'][0]['startOffset']
        chunk_len = chunk['words'][-1]['endOffset'] - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset+chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(chunk_ks, proto_langdir)
        k = standard_kaldi.Kaldi(
            get_resource('data/nnet_a_gpu_online'),
            chunk_gen_hclg_filename,
            proto_langdir)

        wav_obj = wave.open(wavfile, 'r')
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        wav_obj.close()

        k.push_chunk(buf)
        ret = k.get_final()
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        # Adjust startOffset, endOffset, and timing to match originals
        for wd in word_alignment:
            if wd.get("end"):
                # Apply timing offset
                wd['start'] += start_t
                wd['end'] += start_t

            if wd.get("endOffset"):
                wd['startOffset'] += offset_offset
                wd['endOffset'] += offset_offset

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb({"percent": len(realignments) / float(len(to_realign))})
Example No. 5
    def realign(chunk):
        wav_obj = wave.open(wavfile, 'rb')

        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" % (len(chunk['words']), duration))
            return

        # Create a language model
        offset_offset = chunk['words'][0].startOffset
        chunk_len = chunk['words'][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset+chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(
            resources.nnet_gpu_path,
            chunk_gen_hclg_filename,
            resources.proto_langdir)

        wav_obj = wave.open(wavfile, 'rb')
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        wav_obj.close()

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb({"percent": len(realignments) / float(len(to_realign))})
Example No. 6
    def transcribe(self, wavfile, progress_cb=None, logging=None):
        #print("self.mtt.transcribe started")
        #print(wavfile)
        words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb)
        #print("self.mtt.transcribe ended")

        # Clear queue (would this be gc'ed?)
        #print("Threading")
        for i in range(self.nthreads):
            k = self.queue.get()
            k.stop()

        # Align words
        #print("Align words")
        words = diff_align.align(words, self.ms, **self.kwargs)

        # Perform a second-pass with unaligned words
        if logging is not None:
            logging.info(
                "%d unaligned words (of %d)" %
                (len([X
                      for X in words if X.not_found_in_audio()]), len(words)))
            #print("%d unaligned words (of %d)" % (len([X for X in words if X.not_found_in_audio()]), len(words)))

        if progress_cb is not None:
            progress_cb({'status': 'ALIGNING'})

        words = multipass.realign(wavfile,
                                  words,
                                  self.ms,
                                  resources=self.resources,
                                  nthreads=self.nthreads,
                                  progress_cb=progress_cb)

        if logging is not None:
            logging.info(
                "after 2nd pass: %d unaligned words (of %d)" %
                (len([X
                      for X in words if X.not_found_in_audio()]), len(words)))
            #print("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in words if X.not_found_in_audio()]), len(words)))

        #print("Start to optimize")
        words = AdjacencyOptimizer(words, duration).optimize()
        #print("Optimized")

        return Transcription(words=words, transcript=self.transcript)
Example No. 7
    def transcribe(self, wavfile, progress_cb=None, logging=None):
        words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb)

        # Clear queue (would this be gc'ed?)
        for i in range(self.nthreads):
            try:
                k = self.queue.get(block=False)
                del k
            except Empty:
                continue

        # Align words
        words = diff_align.align(words, self.ms, **self.kwargs)

        # Perform a second-pass with unaligned words
        if logging is not None:
            logging.info(
                "%d unaligned words (of %d)" %
                (len([X
                      for X in words if X.not_found_in_audio()]), len(words)))

        if progress_cb is not None:
            progress_cb({'status': 'ALIGNING'})

        words = multipass.realign(wavfile,
                                  words,
                                  self.ms,
                                  resources=self.resources,
                                  nthreads=self.nthreads,
                                  progress_cb=progress_cb)

        if logging is not None:
            logging.info(
                "after 2nd pass: %d unaligned words (of %d)" %
                (len([X
                      for X in words if X.not_found_in_audio()]), len(words)))

        words = AdjacencyOptimizer(words, duration).optimize()

        return Transcription(words=words, transcript=self.transcript)
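
Unlike the other transcribe variants, this one drains the worker queue without blocking and tolerates a queue that is already empty; note that del k only drops the local reference, leaving subprocess cleanup to garbage collection, where the other examples call k.stop() explicitly. The drain pattern in isolation:

    from queue import Queue, Empty

    q = Queue()
    for i in range(3):
        q.put(i)

    drained = []
    while True:
        try:
            drained.append(q.get(block=False))
        except Empty:
            break
    print(drained)  # [0, 1, 2]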
Example No. 8
    def transcribe(self, wavfile, progress_cb=None, logging=None):
        words = self.mtt.transcribe(wavfile, progress_cb=progress_cb)

        # Clear queue (would this be gc'ed?)
        for i in range(self.nthreads):
            k = self.queue.get()
            k.stop()

        # Align words
        words = diff_align.align(words, self.ms, **self.kwargs)

        # Perform a second-pass with unaligned words
        if logging is not None:
            logging.info("%d unaligned words (of %d)" % (len([X for X in words if X.case == "not-found-in-audio"]), len(words)))

        if progress_cb is not None:
            progress_cb({'status': 'ALIGNING'})

        words = multipass.realign(wavfile, words, self.ms, resources=self.resources, nthreads=self.nthreads, progress_cb=progress_cb)

        if logging is not None:
            logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in words if X.case == "not-found-in-audio"]), len(words)))

        return Transcription(words=words, transcript=self.transcript)
Example No. 9
    def realign(chunk):
        nonlocal ignored

        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = final_end_t
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk["words"]), duration))
            ignored += 1
            return

        # Create a language model
        offset_offset = chunk["words"][0].startOffset
        chunk_len = chunk["words"][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        realign_transcript = chunk_transcript.decode("utf-8").replace(
            "\n", " ")
        logging.debug("realign transcript: %s", realign_transcript)
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        wav_obj = wave.open(wavfile, "rb")
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        wav_obj.close()

        retries = 5
        while retries > 0:
            try:
                k = standard_kaldi.Kaldi(
                    resources.nnet_gpu_path,
                    chunk_gen_hclg_filename,
                    resources.proto_langdir,
                )
                k.push_chunk(buf)
                ret = [transcription.Word(**wd) for wd in k.get_final()]
                k.stop()
                break
            except BrokenPipeError:
                retries -= 1
                if retries == 0:
                    raise

        word_alignment = diff_align.align(ret, chunk_ms)

        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb({
                "percent":
                (ignored + len(realignments)) / float(len(to_realign))
            })
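
This variant wraps the Kaldi round-trip in a bounded retry against BrokenPipeError, re-raising once the budget is exhausted. The control flow, reduced to a generic skeleton (run_kaldi_chunk is hypothetical):

    def with_retries(fn, retries=5, exceptions=(BrokenPipeError,)):
        # Call fn(); on a listed exception, retry until the budget is
        # exhausted, then let the final failure propagate.
        while True:
            try:
                return fn()
            except exceptions:
                retries -= 1
                if retries <= 0:
                    raise

    # ret = with_retries(lambda: run_kaldi_chunk(buf))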
Example No. 10
    def transcribe(self, uid, transcript, audio, run_async):  # "async" is a reserved word in Python 3

        proto_langdir = get_resource('PROTO_LANGDIR')
        
        status = self.get_status(uid)

        status['status'] = 'STARTED'
        output = {
            'transcript': transcript
        }

        outdir = os.path.join(self.data_dir, 'transcriptions', uid)                

        tran_path = os.path.join(outdir, 'transcript.txt')
        with codecs.open(tran_path, 'w', 'utf-8') as tranfile:
            tranfile.write(transcript)
        audio_path = os.path.join(outdir, 'upload')
        with open(audio_path, 'wb') as wavfile:
            wavfile.write(audio)

        status['status'] = 'ENCODING'

        wavfile = os.path.join(outdir, 'a.wav')
        if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0:
            status['status'] = 'ERROR'
            status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        #XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        wav_obj = wave.open(wavfile, 'r')
        status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate())
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            for k, v in p.items():
                status[k] = v

        if len(transcript.strip()) > 0:
            ms = metasentence.MetaSentence(transcript, self.vocab)
            ks = ms.get_kaldi_sequence()
            gen_hclg_filename = language_model.make_bigram_language_model(ks, proto_langdir)

            kaldi_queue = Queue()
            for i in range(self.nthreads):
                kaldi_queue.put(standard_kaldi.Kaldi(
                    get_resource('data/nnet_a_gpu_online'),
                    gen_hclg_filename,
                    proto_langdir)
                )

            mtt = MultiThreadedTranscriber(kaldi_queue, nthreads=self.nthreads)
        elif hasattr(self, 'full_transcriber'):
            mtt = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status['error']  = 'No transcript provided and no language model for full transcription'
            return

        words = mtt.transcribe(wavfile, progress_cb=on_progress)

        output = {}
        if len(transcript.strip()) > 0:
            # Clear queue (would this be gc'ed?)
            for i in range(self.nthreads):
                k = kaldi_queue.get()
                k.stop()

            # Align words
            output['words'] = diff_align.align(words, ms)
            output['transcript'] = transcript

            # Perform a second-pass with unaligned words
            logging.info("%d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words'])))

            status['status'] = 'ALIGNING'

            output['words'] = multipass.realign(wavfile, output['words'], ms, nthreads=self.nthreads, progress_cb=on_progress)

            logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words'])))
            
        else:
            # Match format
            output = make_transcription_alignment({"words": words})

        # ...remove the original upload
        os.unlink(os.path.join(outdir, 'upload'))

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            json.dump(output, jsfile, indent=2)
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(to_csv(output))

        # Inline the alignment into the index.html file.
        with open(get_resource('www/view_alignment.html')) as htmlfile:
            htmltxt = htmlfile.read()
        htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output)))
        with open(os.path.join(outdir, 'index.html'), 'w') as htmlfile:
            htmlfile.write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')

        return output
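
The final HTML step above splices the alignment JSON into a template by plain string replacement on a "var INLINE_JSON;" placeholder. The same splice in isolation:

    import json

    def inline_json(template_html, payload):
        # Replace the placeholder declaration with the serialized payload.
        return template_html.replace(
            "var INLINE_JSON;",
            "var INLINE_JSON=%s;" % json.dumps(payload))

    print(inline_json("<script>var INLINE_JSON;</script>", {"words": []}))
    # <script>var INLINE_JSON={"words": []};</script>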