Example #1
 def modelResult(self, dirName):
     # Fetch and Resolve all the paths of model files
     output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(
         dirName)
     # Load output_graph, alphabet, lm and trie
     self.model = wavTranscriber.load_model(output_graph, alphabet, lm,
                                            trie)
Example #2
def test_ds_inst(model_path):

    # threadName = threading.currentThread().name
    processName = multiprocessing.current_process().name
    sample_rate = 16000
    # Point to a path containing the pre-trained models & resolve ~ if used
    dirName = os.path.expanduser(args.model)
    # print(dirName)

    # Resolve all the paths of model files
    output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(dirName)
    # print(output_graph, alphabet, lm, trie)

    # Load output_graph, alphabet, lm and trie
    model_retval = wavTranscriber.load_model(output_graph, alphabet, lm, trie)
    # print(threadName + ': Model loaded . . . ')

    _model_loded.put(processName)

    run_ds = False
    with _read_write_i_audio_sema:
        run_ds = check_i_audio()

    while run_ds:
        # audio, sample_rate, audio_length = wavTranscriber.read_wave('audio.wav')

        with _fetch_audio_sema:
            audio = fetch_audio()

        # print(threadName + ':' + 'audio_queue_after_get:' + str(audio_queue.qsize()))
        audio = np.frombuffer(audio, dtype=np.int16)
        output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
        # print(threadName + ':' + output[0])
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (output[2], output[1]))
        print(i_audio, n_audio)

        with _read_write_audio_length_sema:
            audio_length_queue.put(output[1])

        with _read_write_inference_time_sema:
            inference_time_queue.put(output[2])

        with _read_write_i_audio_sema:
            run_ds = check_i_audio()

    print(processName + ' end')
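
The worker above leans on module-level queues, locks, and helpers (_model_loded, audio_queue, _fetch_audio_sema, check_i_audio, fetch_audio, i_audio, n_audio) that are defined elsewhere in the script. A minimal sketch of that scaffolding, assuming multiprocessing primitives; the exact types and the meaning of the i_audio/n_audio counters are assumptions, not the original implementation:

import multiprocessing

# Hypothetical shared state matching the names used in test_ds_inst above.
_model_loded = multiprocessing.Queue()          # workers announce the model is loaded
audio_queue = multiprocessing.Queue()           # raw int16 audio buffers to transcribe
audio_length_queue = multiprocessing.Queue()    # per-chunk audio durations
inference_time_queue = multiprocessing.Queue()  # per-chunk inference times

_fetch_audio_sema = multiprocessing.Lock()
_read_write_i_audio_sema = multiprocessing.Lock()
_read_write_audio_length_sema = multiprocessing.Lock()
_read_write_inference_time_sema = multiprocessing.Lock()

i_audio = multiprocessing.Value('i', 0)  # chunks consumed so far (assumed meaning)
n_audio = multiprocessing.Value('i', 0)  # total chunks queued (assumed meaning)

def check_i_audio():
    # Keep transcribing while there are still chunks left in the run.
    return i_audio.value < n_audio.value

def fetch_audio():
    # Pop the next raw audio buffer and advance the consumed counter.
    buf = audio_queue.get()
    with i_audio.get_lock():
        i_audio.value += 1
    return buf
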
Example #3
 def modelResult(self, dirName):
     # Fetch and Resolve all the paths of model files
     output_graph, scorer = wavTranscriber.resolve_models(dirName)
     # Load output_graph, alphabet and scorer
     self.model = wavTranscriber.load_model(output_graph, scorer)
def transcriptionProcess(aggressive, audioLoc, modelLoc):
    logging.debug("Transcribing audio file: %s" % audioLoc)

    # Point to a path containing the pre-trained models & resolve ~ if used
    dirName = os.path.expanduser(modelLoc)

    logging.debug("dirName: %s" % dirName)

    # Resolve all the paths of model files
    output_graph, scorer = wavTranscriber.resolve_models(dirName)

    # Load output_graph, alphabet and scorer
    model_retval = wavTranscriber.load_model(output_graph, scorer)

    if audioLoc is not None:
        title_names = [
            'Filename', 'Duration(s)', 'Inference Time(s)',
            'Model Load Time(s)', 'Scorer Load Time(s)'
        ]
        print("\n%-30s %-20s %-20s %-20s %s" %
              (title_names[0], title_names[1], title_names[2], title_names[3],
               title_names[4]))

        inference_time = 0.0

        # Run VAD on the input file
        waveFile = audioLoc
        segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(
            waveFile, aggressive)
        transcripts = ""
        f = open(waveFile.rstrip(".wav") + ".txt", 'w')
        logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") +
                      ".txt")

        for i, segment in enumerate(segments):
            # Run deepspeech on the chunk that just completed VAD
            logging.debug("Processing chunk %002d" % (i, ))
            audio = np.frombuffer(segment, dtype=np.int16)
            output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
            inference_time += output[1]
            logging.debug("Transcript: %s" % output[0])

            transcripts += output[0]
            f.write(output[0] + " ")

        # Summary of the files processed
        f.close()

        # Extract filename from the full file path
        filename, ext = os.path.split(os.path.basename(waveFile))
        logging.debug(
            "************************************************************************************************************"
        )
        logging.debug("%-30s %-20s %-20s %-20s %s" %
                      (title_names[0], title_names[1], title_names[2],
                       title_names[3], title_names[4]))
        logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
                      (filename + ext, audio_length, inference_time,
                       model_retval[1], model_retval[2]))
        logging.debug(
            "************************************************************************************************************"
        )
        print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
              (filename + ext, audio_length, inference_time, model_retval[1],
               model_retval[2]))

        return transcripts
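
A minimal, hypothetical invocation of transcriptionProcess; the model directory, WAV path, and aggressiveness value below are placeholders:

import logging

logging.basicConfig(level=logging.DEBUG)

# Placeholder paths; point these at a real model directory and WAV file.
text = transcriptionProcess(aggressive=1,
                            audioLoc='recording.wav',
                            modelLoc='~/deepspeech-models')
print(text)
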
def main(args):
    parser = argparse.ArgumentParser(
        description=
        'Transcribe long audio files using webRTC VAD or use the streaming interface'
    )
    parser.add_argument(
        '--aggressive',
        type=int,
        choices=range(4),
        required=False,
        help=
        'Determines how aggressively non-speech is filtered out (integer between 0 and 3)'
    )
    parser.add_argument('--audio',
                        required=False,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument(
        '--model',
        required=True,
        help=
        'Path to directory that contains all model files (output_graph, lm and trie)'
    )
    parser.add_argument('--stream',
                        required=False,
                        action='store_true',
                        help='To use deepspeech streaming interface')
    args = parser.parse_args()
    if args.stream is True:
        print("Opening mic for streaming")
    elif args.audio is not None:
        logging.debug("Transcribing audio file @ %s" % args.audio)
    else:
        parser.print_help()
        parser.exit()

    # Point to a path containing the pre-trained models & resolve ~ if used
    dirName = os.path.expanduser(args.model)

    # Resolve all the paths of model files
    output_graph, lm, trie = wavTranscriber.resolve_models(dirName)

    # Load output_graph, lm and trie
    model_retval = wavTranscriber.load_model(output_graph, lm, trie)

    if args.audio is not None:
        title_names = [
            'Filename', 'Duration(s)', 'Inference Time(s)',
            'Model Load Time(s)', 'LM Load Time(s)'
        ]
        print("\n%-30s %-20s %-20s %-20s %s" %
              (title_names[0], title_names[1], title_names[2], title_names[3],
               title_names[4]))

        inference_time = 0.0

        # Run VAD on the input file
        waveFile = args.audio
        segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(
            waveFile, args.aggressive)
        f = open(waveFile.rstrip(".wav") + ".txt", 'w')
        logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") +
                      ".txt")

        for i, segment in enumerate(segments):
            # Run deepspeech on the chunk that just completed VAD
            logging.debug("Processing chunk %002d" % (i, ))
            audio = np.frombuffer(segment, dtype=np.int16)
            output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
            inference_time += output[1]
            logging.debug("Transcript: %s" % output[0])

            f.write(output[0] + " ")

        # Summary of the files processed
        f.close()

        # Extract filename from the full file path
        filename, ext = os.path.split(os.path.basename(waveFile))
        logging.debug(
            "************************************************************************************************************"
        )
        logging.debug("%-30s %-20s %-20s %-20s %s" %
                      (title_names[0], title_names[1], title_names[2],
                       title_names[3], title_names[4]))
        logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
                      (filename + ext, audio_length, inference_time,
                       model_retval[1], model_retval[2]))
        logging.debug(
            "************************************************************************************************************"
        )
        print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
              (filename + ext, audio_length, inference_time, model_retval[1],
               model_retval[2]))
    else:
        sctx = model_retval[0].createStream()
        subproc = subprocess.Popen(shlex.split(
            'rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
                                   stdout=subprocess.PIPE,
                                   bufsize=0)
        print('You can start speaking now. Press Control-C to stop recording.')

        try:
            while True:
                data = subproc.stdout.read(512)
                model_retval[0].feedAudioContent(sctx,
                                                 np.frombuffer(data, np.int16))
        except KeyboardInterrupt:
            print('Transcription: ', model_retval[0].finishStream(sctx))
            subproc.terminate()
            subproc.wait()
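
The streaming branch above pipes raw 16 kHz, 16-bit mono audio from SoX's rec utility straight into the DeepSpeech stream. A small, hypothetical pre-flight check (assuming SoX is the intended recorder) fails fast with a clear message when rec is not installed:

import shutil
import sys

# Hypothetical guard: the --stream path depends on SoX's `rec` being on PATH.
if shutil.which('rec') is None:
    sys.exit("SoX 'rec' not found; install SoX to use the streaming interface")
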
def main(args):
    parser = argparse.ArgumentParser(description='Transcribe long audio files using webRTC VAD or use the streaming interface')
    parser.add_argument('--aggressive', type=int, choices=range(4), required=False,
                        help='Determines how aggressively non-speech is filtered out (integer between 0 and 3)')
    parser.add_argument('--audio', required=False,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--model', required=True,
                        help='Path to directory that contains all model files (output_graph, lm, trie and alphabet)')
    parser.add_argument('--stream', required=False, action='store_true',
                        help='To use deepspeech streaming interface')
    args = parser.parse_args()
    if args.stream is True and len(sys.argv[1:]) == 3:
        print("Opening mic for streaming")
    elif args.audio is not None and len(sys.argv[1:]) == 6:
        logging.debug("Transcribing audio file @ %s" % args.audio)
    else:
        parser.print_help()
        parser.exit()

    # Point to a path containing the pre-trained models & resolve ~ if used
    dirName = os.path.expanduser(args.model)

    # Resolve all the paths of model files
    output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(dirName)

    # Load output_graph, alphabet, lm and trie
    model_retval = wavTranscriber.load_model(output_graph, alphabet, lm, trie)

    if args.audio is not None:
        title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
        print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))

        inference_time = 0.0

        # Run VAD on the input file
        waveFile = args.audio
        segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, args.aggressive)
        f = open(waveFile.rstrip(".wav") + ".txt", 'w')
        logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") + ".txt")

        for i, segment in enumerate(segments):
            # Run deepspeech on the chunk that just completed VAD
            logging.debug("Processing chunk %002d" % (i,))
            audio = np.frombuffer(segment, dtype=np.int16)
            output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
            inference_time += output[1]
            logging.debug("Transcript: %s" % output[0])

            f.write(output[0] + " ")

        # Summary of the files processed
        f.close()

        # Extract filename from the full file path
        filename, ext = os.path.split(os.path.basename(waveFile))
        logging.debug("************************************************************************************************************")
        logging.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
        logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))
        logging.debug("************************************************************************************************************")
        print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))
    else:
        sctx = model_retval[0].setupStream()
        subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
                                   stdout=subprocess.PIPE,
                                   bufsize=0)
        print('You can start speaking now. Press Control-C to stop recording.')

        try:
            while True:
                data = subproc.stdout.read(512)
                model_retval[0].feedAudioContent(sctx, np.frombuffer(data, np.int16))
        except KeyboardInterrupt:
            print('Transcription: ', model_retval[0].finishStream(sctx))
            subproc.terminate()
            subproc.wait()
Example #7
def main(args):
    parser = argparse.ArgumentParser(
        description=
        'Transcribe long audio files using webRTC VAD or use the streaming interface'
    )
    parser.add_argument('audio',
                        type=str,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument(
        '--aggressive',
        type=int,
        choices=range(4),
        required=False,
        help=
        'Determines how aggressively non-speech is filtered out (integer between 0 and 3)'
    )
    parser.add_argument(
        '--model',
        required=False,
        help=
        'Path to directory that contains all model files (output_graph, lm, trie and alphabet)'
    )
    args = parser.parse_args()

    # Loading model
    model_dir = os.path.expanduser(args.model if args.model else 'models/en')
    output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(model_dir)
    model = wavTranscriber.load_model(output_graph, alphabet, lm, trie)

    title_names = [
        'Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)',
        'LM Load Time(s)'
    ]
    print("\n%-30s %-20s %-20s %-20s %s" %
          (title_names[0], title_names[1], title_names[2], title_names[3],
           title_names[4]))

    inference_time = 0.0

    # Run VAD on the input file
    wave_file = args.audio
    aggressiveness = int(args.aggressive) if args.aggressive else 3
    segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(
        wave_file, aggressiveness)
    f = open(wave_file.rstrip(".wav") + ".txt", 'w')
    logging.debug("Saving Transcript @: %s" % wave_file.rstrip(".wav") +
                  ".txt")

    for i, segment in enumerate(segments):
        # Run deepspeech on the chunk that just completed VAD
        logging.debug("Processing chunk %002d" % (i, ))
        audio = np.frombuffer(segment, dtype=np.int16)
        output = wavTranscriber.stt(model[0], audio, sample_rate)
        inference_time += output[1]
        logging.debug("Transcript: %s" % output[0])

        f.write(output[0] + " ")

    # Summary of the files processed
    f.close()

    # Extract filename from the full file path
    filename, ext = os.path.split(os.path.basename(wave_file))
    logging.debug(
        "************************************************************************************************************"
    )
    logging.debug("%-30s %-20s %-20s %-20s %s" %
                  (title_names[0], title_names[1], title_names[2],
                   title_names[3], title_names[4]))
    logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
                  (filename + ext, audio_length, inference_time,
                   model[1], model[2]))
    logging.debug(
        "************************************************************************************************************"
    )
    print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
          (filename + ext, audio_length, inference_time, model[1],
           model[2]))
Example #8
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio',
                        required=True,
                        help='Path to the audio file to run (WAV format)')
    args = parser.parse_args()

    # Run VAD on the input file
    waveFile = args.audio
    logger.debug("Loading wav file %s" % waveFile)
    segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(
        waveFile, 3)

    logger.debug("Processing speaker diarization")
    segments = speaker_diarization.diarize(segments)

    f = open(waveFile.replace(".wav", ".txt"), 'w')
    logger.debug("Processing speech recognition")

    # Point to a path containing the pre-trained models & resolve ~ if used
    model_path = os.path.expanduser("deepspeech/models/")
    # Resolve all the paths of model files
    output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(
        model_path)
    # Load output_graph, alphabet, lm and trie
    model_retval = wavTranscriber.load_model(output_graph, alphabet, lm, trie)

    inference_time = 0.0
    for i, segment in enumerate(segments):
        # Run deepspeech on the chunk that just completed VAD
        logger.debug("[Speech recognition] Processing chunk %002d" % (i, ))
        audio = np.frombuffer(segment.bytes, dtype=np.int16)
        output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
        inference_time += output[1]
        logger.debug("[Speech recognition] Transcript: %s" % output[0])

        f.write("%s - %s Speaker %s: %s\n" %
                (str(datetime.timedelta(seconds=round(segment.begin, 3)))[:-3],
                 str(datetime.timedelta(seconds=round(segment.end, 3)))[:-3],
                 segment.speaker, output[0]))

    # Summary of the files processed
    f.close()
    logger.debug("Saved transcript @: %s" % waveFile.replace(".wav", ".txt"))

    # Extract filename from the full file path
    filename, ext = os.path.split(os.path.basename(waveFile))
    title_names = [
        'Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)',
        'LM Load Time(s)'
    ]
    logger.debug(
        "************************************************************************************************************"
    )
    logger.debug("%-30s %-20s %-20s %-20s %s" %
                 (title_names[0], title_names[1], title_names[2],
                  title_names[3], title_names[4]))
    logger.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
                 (filename + ext, audio_length, inference_time,
                  model_retval[1], model_retval[2]))
    logger.debug(
        "************************************************************************************************************"
    )

    print("\n%-30s %-20s %-20s %-20s %s" %
          (title_names[0], title_names[1], title_names[2], title_names[3],
           title_names[4]))
    print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
          (filename + ext, audio_length, inference_time, model_retval[1],
           model_retval[2]))
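
The diarization step is assumed to yield segments that carry raw audio plus speaker and timing metadata, since the loop reads segment.bytes, segment.begin, segment.end and segment.speaker. A hypothetical shape for that record (not the actual class used by speaker_diarization):

from dataclasses import dataclass

@dataclass
class DiarizedSegment:
    # Assumed fields, inferred from the attributes accessed above.
    bytes: bytes    # raw 16-bit PCM audio for the segment
    begin: float    # segment start time in seconds
    end: float      # segment end time in seconds
    speaker: int    # speaker label assigned by diarization
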
Example #9
def init_stt(output_graph_path, alphabet_path, lm_path, trie_path, rate):
    global model, sample_rate
    sample_rate = rate
    logging.debug('Process {}: Loading models'.format(os.getpid()))
    model = wavTranscriber.load_model(output_graph_path, alphabet_path,
                                      lm_path, trie_path)
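
init_stt stores the loaded model and sample rate in module-level globals, the usual pattern for a multiprocessing pool initializer. A hypothetical sketch of using it that way; the pool size, model file names, the transcribe_chunk helper and the chunks list are all placeholders:

import multiprocessing
import numpy as np
import wavTranscriber  # helper module used throughout these examples

def transcribe_chunk(raw_bytes):
    # Runs in a worker process after init_stt has populated `model` and `sample_rate`.
    audio = np.frombuffer(raw_bytes, dtype=np.int16)
    return wavTranscriber.stt(model[0], audio, sample_rate)

if __name__ == '__main__':
    chunks = []  # placeholder: raw int16 audio buffers to transcribe
    pool = multiprocessing.Pool(
        processes=4,
        initializer=init_stt,
        initargs=('output_graph.pbmm', 'alphabet.txt', 'lm.binary', 'trie', 16000))
    results = pool.map(transcribe_chunk, chunks)
    pool.close()
    pool.join()
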
 def modelResult(self, dirName):
     # Fetch and Resolve all the paths of model files
     output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(dirName)
     # Load output_graph, alphabet, lm and trie
     self.model = wavTranscriber.load_model(output_graph, alphabet, lm, trie)
    def runDeepspeech(self, waveFile, progress_callback):
        # Deepspeech will be run from this method
        logging.debug("Preparing for transcription...")

        # Go and fetch the models from the directory specified
        if self.dirName:
            # Resolve all the paths of model files
            output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(
                self.dirName)
        else:
            logging.critical(
                "*****************************************************")
            logging.critical("Model path not specified..")
            logging.critical("You sure of what you're doing ?? ")
            logging.critical("Trying to fetch from present working directory.")
            logging.critical(
                "*****************************************************")
            return "Transcription Failed, models path not specified"

        # Load output_graph, alphabet, lm and trie
        model_retval = wavTranscriber.load_model(output_graph, alphabet, lm,
                                                 trie)
        inference_time = 0.0

        # Run VAD on the input file
        segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(
            waveFile, 1)
        f = open(waveFile.rstrip(".wav") + ".txt", 'w')
        logging.debug("Saving Transcript @: %s" % waveFile.rstrip(".wav") +
                      ".txt")

        for i, segment in enumerate(segments):
            # Run deepspeech on the chunk that just completed VAD
            logging.debug("Processing chunk %002d" % (i, ))
            audio = np.frombuffer(segment, dtype=np.int16)
            output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
            inference_time += output[1]

            f.write(output[0] + " ")
            progress_callback.emit(output[0] + " ")

        # Summary of the files processed
        f.close()

        # Format pretty, extract filename from the full file path
        filename, ext = os.path.split(os.path.basename(waveFile))
        title_names = [
            'Filename', 'Duration(s)', 'Inference Time(s)',
            'Model Load Time(s)', 'LM Load Time(s)'
        ]
        logging.debug(
            "************************************************************************************************************"
        )
        logging.debug("%-30s %-20s %-20s %-20s %s" %
                      (title_names[0], title_names[1], title_names[2],
                       title_names[3], title_names[4]))
        logging.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
                      (filename + ext, audio_length, inference_time,
                       model_retval[1], model_retval[2]))
        logging.debug(
            "************************************************************************************************************"
        )
        print("\n%-30s %-20s %-20s %-20s %s" %
              (title_names[0], title_names[1], title_names[2], title_names[3],
               title_names[4]))
        print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" %
              (filename + ext, audio_length, inference_time, model_retval[1],
               model_retval[2]))

        return "\n*********************\nTranscription Done..."