Code Example #1
def transcribe_many_parallel(args, filepaths):
    for index, filepath in enumerate(filepaths):
        ds = Model(args.model)

        if args.beam_width:
            ds.setBeamWidth(args.beam_width)

        if args.scorer:
            print('Loading scorer from files {}'.format(args.scorer),
                  file=sys.stderr)
            scorer_load_start = timer()
            ds.enableExternalScorer(args.scorer)
            scorer_load_end = timer() - scorer_load_start
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
                  file=sys.stderr)

            if args.lm_alpha and args.lm_beta:
                ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

        if args.hot_words:
            print('Adding hot-words', file=sys.stderr)
            for word_boost in args.hot_words.split(','):
                word, boost = word_boost.split(':')
                ds.addHotWord(word, float(boost))
        p = Process(target=transcribe_file, args=(args, ds, filepath, index))
        p.start()
        p.join()
        print('{}: Transcribed file {} of {} from "{}"'.format(
            time.strftime("%H:%M:%S", time.localtime()), index + 1,
            len(filepaths), filepath))
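
Note: because each Process is joined immediately after it starts, the loop above transcribes files one at a time. A minimal sketch of actually overlapping the workers, assuming a fork start method so each child inherits its own per-file Model (make_model below is a hypothetical helper standing in for the model setup in the loop body above):

procs = []
for index, filepath in enumerate(filepaths):
    ds = make_model(args)  # hypothetical: same Model/scorer/hot-word setup as above
    p = Process(target=transcribe_file, args=(args, ds, filepath, index))
    p.start()
    procs.append(p)
for p in procs:
    p.join()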
Code Example #2
def ModelInitiate(model_file_path, lm_file_path, lm_alpha, lm_beta, beam_width):
  model = Model(model_file_path)
  model.enableExternalScorer(lm_file_path)

  model.setScorerAlphaBeta(lm_alpha, lm_beta)
  model.setBeamWidth(beam_width)
  return model
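
A minimal usage sketch for the helper above; the file names and alpha/beta/beam values are placeholders rather than part of the original:

model = ModelInitiate("deepspeech-0.9.3-models.pbmm",   # hypothetical paths
                      "deepspeech-0.9.3-models.scorer",
                      lm_alpha=0.93, lm_beta=1.18, beam_width=500)
print(model.sampleRate())  # the released English models expect 16000 Hz audio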
Code Example #3
def create_deepspeech_model(args):
    ds = Model(args.model)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer),
              file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
              file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    return ds
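
Since create_deepspeech_model only reads attributes off args, it can also be driven without argparse; a minimal sketch with hypothetical paths and values (note the word:boost,word:boost format the hot-word parser expects):

from types import SimpleNamespace

args = SimpleNamespace(model="deepspeech-0.9.3-models.pbmm",
                       beam_width=500,
                       scorer="deepspeech-0.9.3-models.scorer",
                       lm_alpha=0.93, lm_beta=1.18,
                       hot_words="activate:10.0,lights:7.5")
ds = create_deepspeech_model(args)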
Code Example #4
def run():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='DeepSpeech Server')
    parser.add_argument('--port',
                        default=3337,
                        type=int,
                        help='Port to listen on')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer',
                        required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--beam_width',
                        type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument(
        '--lm_alpha',
        type=float,
        help=
        'Language model weight (lm_alpha). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--lm_beta',
        type=float,
        help=
        'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.'
    )
    parser.add_argument('--google_key',
                        help="Google Speech-Recognition API key.")
    args = parser.parse_args()

    ds = Model(args.model)
    if args.beam_width:
        ds.setBeamWidth(args.beam_width)
    if args.scorer:
        ds.enableExternalScorer(args.scorer)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)
    handler_class = ReqHandlerFactory(ds, args.google_key)

    server_address = ('', args.port)
    httpd = HTTPServer(server_address, handler_class)
    logging.info('Starting httpd...\n')
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    httpd.server_close()
    logging.info('Stopping httpd...\n')
Code Example #5
File: transcript.py Project: mhannani/Esup-Pod
def get_model(lang):
    ds_model = Model(DS_PARAM[lang]['model'])
    if DS_PARAM[lang].get('beam_width'):
        ds_model.setBeamWidth(DS_PARAM[lang]['beam_width'])
    if DS_PARAM[lang].get('scorer'):
        print('Loading scorer from files {}'.format(DS_PARAM[lang]['scorer']),
              file=sys.stderr)
        scorer_load_start = timer()
        ds_model.enableExternalScorer(DS_PARAM[lang]['scorer'])
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
              file=sys.stderr)
        if DS_PARAM[lang].get('lm_alpha') and DS_PARAM[lang].get('lm_beta'):
            ds_model.setScorerAlphaBeta(DS_PARAM[lang]['lm_alpha'],
                                        DS_PARAM[lang]['lm_beta'])
    return ds_model
Code Example #6
def get_model(lang):
    ds_model = Model(DS_PARAM[lang]["model"])
    if DS_PARAM[lang].get("beam_width"):
        ds_model.setBeamWidth(DS_PARAM[lang]["beam_width"])
    if DS_PARAM[lang].get("scorer"):
        print(
            "Loading scorer from files {}".format(DS_PARAM[lang]["scorer"]),
            file=sys.stderr,
        )
        scorer_load_start = timer()
        ds_model.enableExternalScorer(DS_PARAM[lang]["scorer"])
        scorer_load_end = timer() - scorer_load_start
        print("Loaded scorer in {:.3}s.".format(scorer_load_end),
              file=sys.stderr)
        if DS_PARAM[lang].get("lm_alpha") and DS_PARAM[lang].get("lm_beta"):
            ds_model.setScorerAlphaBeta(DS_PARAM[lang]["lm_alpha"],
                                        DS_PARAM[lang]["lm_beta"])
    return ds_model
Code Example #7
def stt(model_path,
        audio,
        beam_width=None,
        scorer_path=None,
        lm_alpha=None,
        lm_beta=None,
        hot_words=None):
    ds = Model(model_path)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer_path:
        ds.enableExternalScorer(scorer_path)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    # TODO
    # if hot_words:
    #     print('Adding hot-words', file=sys.stderr)
    #     for w in hot_words:
    #         ds.addHotWord(w, 6.2)

    fin = wave.open(audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            f'ERROR: original sample rate ({fs_orig}) is different than {desired_sample_rate}hz.',
            file=sys.stderr)
        sys.exit(1)

    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    fin.close()

    print('Running inference.', file=sys.stderr)
    res = ds.sttWithMetadata(audio, 1)
    res = postprocess_metadata(res)
    return res
Code Example #8
def predict_speech_to_text(stream_file):
    alpha = 0.85
    beta = 1.75
    beam_width = 500

    # Initialize the model
    speech_model = Model(MODEL_PATH)

    # Enable language scorer to improve the accuracy
    speech_model.enableExternalScorer(SCORER_PATH)

    # Set beam width. A larger beam width generates better results at the cost of decoding time.
    speech_model.setBeamWidth(beam_width)

    # Set the scorer's language model weight (alpha) and word insertion weight (beta)
    speech_model.setScorerAlphaBeta(alpha, beta)

    # Use scipy to convert the wav file into a numpy array
    _, audio = wav.read(stream_file)
    return speech_model.stt(audio)
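
wav.read returns a (sample_rate, samples) tuple, and the rate is simply discarded above. A slightly safer sketch (still assuming 16-bit PCM input) checks it against the model first:

rate, audio = wav.read(stream_file)
assert rate == speech_model.sampleRate(), "expected {} Hz input".format(
    speech_model.sampleRate())
text = speech_model.stt(audio)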
Code Example #9
File: speech_to_text.py Project: jonaslimads/robot
    def load_deepspeech_model(self):
        model = os.path.join(self.deepspeech_models_folder,
                             "deepspeech-0.9.3-models.pbmm")
        scorer = os.path.join(self.deepspeech_models_folder,
                              "deepspeech-0.9.3-models.scorer")
        lm_alpha = 0.93
        lm_beta = 1.18
        beam_width = 100

        model_load_start = timer()
        deepspeech_model = Model(model)
        model_load_end = timer() - model_load_start
        logger.debug("Loaded model in %0.3fs." % (model_load_end))
        scorer_load_start = timer()

        deepspeech_model.enableExternalScorer(scorer)
        deepspeech_model.setScorerAlphaBeta(lm_alpha, lm_beta)
        deepspeech_model.setBeamWidth(beam_width)

        scorer_load_end = timer() - scorer_load_start
        logger.debug("Loaded external scorer in %0.3fs." % (scorer_load_end))

        return deepspeech_model
Code Example #10
def load(model, scorer, verbose=True, beam_width="", lm_alpha="", lm_beta="", hot_words=""):
    """ Load models"""

    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    if verbose:
        print('\nLoading model from files {}'.format(model), file=sys.stderr)
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer:
        if verbose:
            print('Loading scorer from files {}'.format(scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer)
        scorer_load_end = timer() - scorer_load_start
        if verbose:
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    if hot_words:
        if verbose:
            print('Adding hot-words', file=sys.stderr)
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))
    return ds, desired_sample_rate
Code Example #11
def MozillaSTT(audio_path):

    # TODO: handle different rates (not implemented)
    fin = wave.open(audio_path, 'rb')
    output = ""
    # print("SS")
    ds = Model(model_file_path)
    # print("SS")
    ds.enableExternalScorer(scorer_file_path)
    # print("SS")

    lm_alpha = 0.75  # ??
    lm_beta = 1.85
    desired_sample_rate = ds.sampleRate()
    ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    fs_orig = fin.getframerate()
    # print("Desired Sampling Rate: %d", desired_sample_rate)
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. '
              'Resampling might produce erratic speech recognition.'.format(
                  fs_orig, desired_sample_rate),
              file=sys.stderr)
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()
    print('Running inference.', file=sys.stderr)
    # print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    # print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
    # print(ds.stt(audio))
    output += ds.stt(audio)
    output += '\n'
    output += metadata_json_output(ds.sttWithMetadata(audio, 3))
    return output
Code Example #12
    def __init__(self):

        print('Loading model from file {}'.format(args.model), file=sys.stderr)
        model_load_start = timer()
        # sphinx-doc: python_ref_model_start
        model_path = os.path.dirname(os.path.abspath(__file__))

        ds = Model(os.path.join(model_path, args.model))
        # sphinx-doc: python_ref_model_stop
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(model_load_end),
              file=sys.stderr)

        if args.beam_width:
            ds.setBeamWidth(args.beam_width)

        self.desired_sample_rate = ds.sampleRate()

        if args.scorer:
            print('Loading scorer from files {}'.format(args.scorer),
                  file=sys.stderr)
            scorer_load_start = timer()
            ds.enableExternalScorer(os.path.join(model_path, args.scorer))
            scorer_load_end = timer() - scorer_load_start
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
                  file=sys.stderr)

            if args.lm_alpha and args.lm_beta:
                ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

        if args.hot_words:
            print('Adding hot-words', file=sys.stderr)
            for word_boost in args.hot_words.split(','):
                word, boost = word_boost.split(':')
                ds.addHotWord(word, float(boost))
        self.ds = ds
Code Example #13
        def setup_model(model_path, scorer, beam_width):
            log("creating model {} with scorer {}...".format(model_path, scorer))
            model = Model(model_path)

            if scorer.scorer is not None:
                model.enableExternalScorer(scorer.scorer)
                if scorer.lm_alpha is not None and scorer.lm_beta is not None:
                    if model.setScorerAlphaBeta(scorer.lm_alpha, scorer.lm_beta) != 0:
                        raise RuntimeError("Unable to set scorer parameters")

            if beam_width is not None:
                if model.setBeamWidth(beam_width) != 0:
                    raise RuntimeError("Unable to set beam width")

            log("model is ready.")
            return model
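
setup_model above only assumes its scorer argument exposes .scorer, .lm_alpha and .lm_beta attributes; a minimal sketch of such a container (names and values hypothetical):

from collections import namedtuple

ScorerConfig = namedtuple("ScorerConfig", ["scorer", "lm_alpha", "lm_beta"])
cfg = ScorerConfig("deepspeech-0.9.3-models.scorer", 0.93, 1.18)
model = setup_model("deepspeech-0.9.3-models.pbmm", cfg, beam_width=500)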
Code Example #14
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(self,
                              frames: List[av.AudioFrame]) -> List[av.AudioFrame]:
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)

            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": True,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
Code Example #15
File: predict.py Project: acupofjose/tensorflow
def main():
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer',
                        required=False,
                        help='Path to the external scorer file')
    parser.add_argument(
        '--prediction_in',
        required=True,
        help='Path to the directory with sound files (mp3/ogg/wav) to analyze')
    parser.add_argument(
        '--prediction_out',
        required=True,
        help='Path to the directory for moving the processed sound files to')
    parser.add_argument(
        '--prediction_tmp',
        required=False,
        help=
        'Path to the temp directory for storing the predictions initially before moving them to "--prediction_out"'
    )
    parser.add_argument(
        '--continuous',
        action='store_true',
        help='Whether to continuously load sound files and perform prediction',
        required=False,
        default=False)
    parser.add_argument(
        '--delete_input',
        action='store_true',
        help=
        'Whether to delete the input files rather than move them to "--prediction_out" directory',
        required=False,
        default=False)
    parser.add_argument('--beam_width',
                        type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument(
        '--lm_alpha',
        type=float,
        help=
        'Language model weight (lm_alpha). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--lm_beta',
        type=float,
        help=
        'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--json',
        required=False,
        action='store_true',
        help='Output json from metadata with timestamp of each word')
    parser.add_argument(
        '--candidate_transcripts',
        type=int,
        default=3,
        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument(
        '--normalize',
        required=False,
        action='store_true',
        help='Whether to apply standard amplitude normalization')
    parsed = parser.parse_args()

    print('Loading model from file {}'.format(parsed.model))
    ds = Model(parsed.model)
    if parsed.beam_width:
        ds.setBeamWidth(parsed.beam_width)

    if parsed.scorer:
        print('Loading scorer from file {}'.format(parsed.scorer))
        ds.enableExternalScorer(parsed.scorer)
        if parsed.lm_alpha and parsed.lm_beta:
            ds.setScorerAlphaBeta(parsed.lm_alpha, parsed.lm_beta)

    process(model=ds,
            prediction_in=parsed.prediction_in,
            prediction_out=parsed.prediction_out,
            prediction_tmp=parsed.prediction_tmp,
            continuous=parsed.continuous,
            delete_input=parsed.delete_input,
            json=parsed.json,
            candidate_transcripts=parsed.candidate_transcripts,
            normalize=parsed.normalize)
Code Example #16
File: client.py Project: KScorcia/DeepSpeech
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

#     print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
#     print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
#         print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
#         print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
#         print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

#     print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: "+ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
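
Several of these clients call a convert_samplerate() helper that is not shown here. A sketch of it, modeled on the upstream DeepSpeech example client (SoX-based; treat the exact flags as an approximation):

import shlex
import subprocess
from shlex import quote

import numpy as np

def convert_samplerate(audio_path, desired_sample_rate):
    # Shell out to SoX for 16-bit mono signed PCM at the model's sample rate.
    sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {} '
               '--encoding signed-integer --endian little - ').format(
                   quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd),
                                         stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno,
                      'SoX not found, use {}hz files or install it: {}'.format(
                          desired_sample_rate, e.strerror))
    return desired_sample_rate, np.frombuffer(output, np.int16)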
Code Example #17
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float,
            beam: int):
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": False,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
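
The loop above only surfaces partial results via intermediateDecode(); when a capture session ends, the stream can be finalized to flush the decoder and obtain the final transcript:

text = stream.finishStream()  # consumes the stream; create a new one to continue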
Code Example #18
from deepspeech import Model
for i in range(5):
    ds = Model('/models/mozilla/deepspeech-0.7.3-models.pbmm')
    ds.enableExternalScorer('/models/mozilla/deepspeech-0.7.3-models.scorer')
    ds.setScorerAlphaBeta(0.75, 1.85)
    del ds  # drop the reference so the native model can be freed each iteration
Code Example #19
    def record_voice_and_predict_text(self):
        """Records the speech and predicts its text """
        #Recording the speech

        stream_file_name = 'AudioFile/speech_stream.wav'
        stream_format = pyaudio.paInt16  # Sampling size and format
        no_of_channels = 1  # Number of audio channels
        sampling_rate = 16000  # Sampling rate in Hertz
        frames_count = 1024  # Number of frames per buffer
        record_seconds = 5

        stream = pyaudio.PyAudio()

        stream_data = stream.open(format=stream_format,
                                  channels=no_of_channels,
                                  rate=sampling_rate,
                                  input=True,
                                  frames_per_buffer=frames_count)
        frames = [
            stream_data.read(frames_count)
            for i in range(0, int(sampling_rate / frames_count *
                                  record_seconds))
        ]
        stream_data.stop_stream()
        stream_data.close()
        stream.terminate()

        wave_file = wave.open(stream_file_name, 'wb')
        wave_file.setnchannels(no_of_channels)
        wave_file.setsampwidth(stream.get_sample_size(stream_format))
        wave_file.setframerate(sampling_rate)
        wave_file.writeframes(b''.join(frames))
        wave_file.close()

        try:
            self.label_info.setText('Recording completed.')
        except Exception:
            pass

        #Text prediction Part
        alpha = 0.75
        beta = 1.85
        beam_width = 500

        # Initialize the model
        speech_model = Model(MODEL_PATH)

        # set beam width. A larger beam width value generates better results at the cost of decoding time.
        speech_model.setBeamWidth(beam_width)

        # Enable language scorer to improve the accuracy
        speech_model.enableExternalScorer(SCORER_PATH)
        # You can play with setting the model Beam Width, Scorer language model weight and word insertion weight

        # Set hyperparameters alpha and beta of the external scorer.
        # alpha: Language model weight.
        # beta: Word insertion weight
        speech_model.setScorerAlphaBeta(alpha, beta)

        # Use scipy to convert the wav file into a numpy array
        _, audio = wav.read(stream_file_name)
        text = speech_model.stt(audio)
        try:
            self.text_pred.setText(text)
        except Exception:
            pass
        show_images(text)
Code Example #20
ファイル: client.py プロジェクト: TeHikuMedia/DeepSpeech
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))
        test = ds.createStream().sentencefit(audio, "ka arohia katoatia te hāhi me ōna whakapono e te hapū o ōtākou")
        for t in test.tokens:
            print(f"letter: {t.letter}, confidence: {t.confidence}, timestep: {t.timestep}")

    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Code Example #21
File: asr.py Project: shwetagargade216/ASR
model_load_start = timer()
ds = Model(model_path)
model_load_end = timer() - model_load_start
print('Loaded model in: ', model_load_end)

desired_sample_rate = ds.sampleRate()

if scorer:
    print('Loading scorer from files {}'.format(scorer), file=sys.stderr)
    scorer_load_start = timer()
    ds.enableExternalScorer(scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

    if lm_alpha and lm_beta:
        ds.setScorerAlphaBeta(lm_alpha, lm_beta)


def speech(audio):
    fin = wave.open(audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
            .format(fs_orig, desired_sample_rate),
            file=sys.stderr)
        fs_new, audio = convert_samplerate(audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
Code Example #22
from deepspeech import Model
import gradio as gr
import numpy as np

model_file_path = "deepspeech-0.8.2-models.pbmm"
lm_file_path = "deepspeech-0.8.2-models.scorer"
beam_width = 100
lm_alpha = 0.93
lm_beta = 1.18

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def reformat_freq(sr, y):
    if sr not in (
            48000,
            16000,
    ):  # DeepSpeech only supports 16k (we convert 48k -> 16k)
        raise ValueError("Unsupported rate", sr)
    if sr == 48000:
        y = (((y / max(np.max(y), 1)) * 32767).reshape(
            (-1, 3)).mean(axis=1).astype("int16"))
        sr = 16000
    return sr, y
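
A quick sanity check of the trick above: reshaping to (-1, 3) and averaging collapses every three 48 kHz samples into one, i.e. 48000 / 3 = 16000:

import numpy as np

y = np.arange(6, dtype="int16")       # six samples at 48 kHz
y3 = y.reshape((-1, 3)).mean(axis=1)  # -> [1., 4.], two samples at 16 kHz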


def transcribe(speech, stream):
    _, y = reformat_freq(*speech)