Example #1
def vad_segment_generator(wavFile, aggressiveness):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    # This variant never runs VAD: it simply cuts the audio into fixed
    # 20-second chunks (frame_generator takes its duration in milliseconds)
    # vad = webrtcvad.Vad(int(aggressiveness))
    frames = list(wavSplit.frame_generator(20000, audio, sample_rate))

    return frames
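
These examples all rely on a small wavSplit helper module (from the DeepSpeech examples, which in turn mirrors the py-webrtcvad example script). frame_generator itself is not shown in any of the snippets; the sketch below is modeled on the canonical py-webrtcvad version, so the Frame class and the 2-bytes-per-sample arithmetic are assumptions carried over from that upstream example rather than this repo's exact code.

class Frame(object):
    """A slice of PCM audio together with its start time and duration."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration

def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield successive fixed-duration Frames from 16-bit mono PCM bytes."""
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # 2 bytes/sample
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0  # frame length in seconds
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n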
Example #2
def vad_segment_generator(wavFile, aggressiveness):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = list(wavSplit.frame_generator(10, audio, sample_rate))
    # 10 ms frames, with 300 ms of padding used to decide segment boundaries
    segments = wavSplit.vad_collector(sample_rate, 10, 300, vad, frames)
    return segments, sample_rate, audio_length
Example #3
def vad_segment_generator(wavFile, aggressiveness, frame_duration_ms=30, padding_duration_ms=300):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(frame_duration_ms, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames)

    return list(segments), sample_rate, audio_length
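
Example #3 is the most flexible of these variants, since it exposes the frame and padding durations to the caller. A plausible invocation (the file name and parameter values here are made up for illustration):

import numpy as np

segments, sample_rate, audio_length = vad_segment_generator(
    "speech.wav", aggressiveness=2,
    frame_duration_ms=20, padding_duration_ms=200)
for segment in segments:
    pcm = np.frombuffer(segment, dtype=np.int16)  # one voiced chunk as 16-bit PCM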
Example #4
def vad_segment_generator(wavFile, aggressiveness):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)

    return segments, sample_rate, audio_length
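
vad_collector does the actual segmentation: it walks the 30 ms frames through the VAD and groups runs of voiced frames, using the 300 ms padding window to decide where speech starts and stops. The sketch below follows the canonical py-webrtcvad example that wavSplit mirrors; the 0.9 trigger ratio and the ring-buffer mechanics come from that upstream script and may differ slightly in any given fork.

import collections

def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Yield voiced audio segments as raw PCM bytes."""
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # trigger once 90% of the padding window is voiced
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                voiced_frames.extend(f for f, s in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # de-trigger once 90% of the padding window is unvoiced
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])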
Example #5
def vad_segment_generator(wavFile, aggressiveness, model_sample_rate):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    assert sample_rate == model_sample_rate, \
        "Audio sample rate must match sample rate of used model: {}Hz".format(model_sample_rate)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)

    return segments, sample_rate, audio_length
Example #6
def vad_segment_generator(wav_file, aggressiveness):
    """
    Generate VAD segments. Filters out non-voiced audio frames.
    :param wav_file: Input wav file to run VAD on.
    :param aggressiveness: How aggressively non-speech is filtered out (between 0 and 3)
    :return: Returns tuple of
        segments: a bytearray of multiple smaller audio frames
                  (the longer audio split into multiple smaller ones)
        sample_rate: Sample rate of the input audio file
        audio_length: Duration of the input audio file
    """
    logging.debug("Caught the wav file @: %s" % wav_file)
    audio, sample_rate, audio_length = wavSplit.read_wave(wav_file)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(30, audio, sample_rate)
    frames = list(frames)
    # note: this fork's vad_collector takes an extra argument (0.5), presumably
    # a voiced-frame ratio threshold that the canonical version hard-codes
    segments = wavSplit.vad_collector(sample_rate, 30, 300, 0.5, vad, frames)

    return segments, sample_rate, audio_length
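
read_wave is the remaining wavSplit helper these examples assume. A minimal sketch consistent with the three-value return (PCM bytes, sample rate, duration in seconds) used throughout, and with the mono 16-bit constraints of the py-webrtcvad example it derives from:

import contextlib
import wave

def read_wave(path):
    """Read a .wav file; return (PCM bytes, sample rate, duration in seconds)."""
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        assert wf.getnchannels() == 1, "Only mono audio is supported"
        assert wf.getsampwidth() == 2, "Only 16-bit audio is supported"
        sample_rate = wf.getframerate()
        num_frames = wf.getnframes()
        pcm_data = wf.readframes(num_frames)
        return pcm_data, sample_rate, num_frames / float(sample_rate)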
Example #7
    ds.enableExternalScorer(scorer)  # ds is the DeepSpeech model

    # load and pre-process audio
    audio_file = os.path.join(os.getcwd(),
                              "data/testing/audio/2830-3980-0043.wav")
    # audio_file = os.path.join(os.getcwd(),"data/testing/audio/my_name_is_jamie.wav")
    # audio_file = os.path.join(os.getcwd(),"data/testing/audio/hello_liv.wav")
    aggressiveness = 0

    print("Reading and processing: {}".format(audio_file))

    audio, sample_rate, audio_length = wavSplit.read_wave(audio_file)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)

    # we now have the data in the following segments
    # segments, sample_rate, audio_length
    print("we have {} frames".format(len(frames)))
    start = time.time()
    for i, segment in enumerate(segments):
        # Run DeepSpeech on the chunk that just completed VAD
        print("Processing chunk %002d" % (i,))
        audio = np.frombuffer(segment, dtype=np.int16)
        print('Running inference...')
        output = ds.stt(audio)
        print("Transcript: %s" % output)
Example #8
def transcribe_file(audio_path, ds, threadcount):

    audio, sample_rate, audio_length = wavSplit.read_wave(audio_path)
    # CHUNK_SIZE is a module-level constant giving the chunk duration in
    # milliseconds (the offset arithmetic below divides it by 1000)
    segments = list(wavSplit.frame_generator(CHUNK_SIZE, audio, sample_rate))

    print("Beginning to process generated file chunks")
    inference_time = time.time()

    # make a set of queues for upstream and downstream communication
    WriteQueue = queue.Queue()
    ReadQueue = queue.Queue()
    workers = []

    threadcount = int(threadcount)

    for i in range(threadcount):
        x = ChunkWorker(
            WriteQueue, ReadQueue,
            ds)  # all chunks get the same queues and inference model
        x.start()
        workers.append(x)

    if len(segments) <= 1:
        print("SINGLE SEGMENT LENGTH IDENTIFIED", file=sys.stderr, flush=True)
        # this can occur when the chunk size is larger than the audio file, resulting in no chunking
        # we need to do the transcription manually
        if len(segments) == 0:
            audio_file = wave.open(audio_path, 'rb')
            # read the raw frames and hand DeepSpeech a 16-bit sample buffer
            audio = np.frombuffer(audio_file.readframes(audio_file.getnframes()),
                                  dtype=np.int16)
            output = ds.sttWithMetadata(audio, 1)  # Run DeepSpeech on the whole file
            return json.dumps(output)

    print("Workers started...", file=sys.stderr, flush=True)
    for i, segment in enumerate(segments):
        print("Writing segment num {}".format(i), file=sys.stderr, flush=True)
        WriteQueue.put({'chunk': segment, 'index': i})

    print("All Chunks sent...", file=sys.stderr, flush=True)
    for i in range(threadcount):
        print("stopping worker {}".format(i), file=sys.stderr, flush=True)
        WriteQueue.put({"index": -1})  # send a sentinel value to all threads

    for i in range(threadcount):
        workers[i].join()  # wait for all threads to join
        print(" worker {} has joined".format(i), file=sys.stderr, flush=True)

    processed_chunks = []
    # all workers have joined, so it is safe to snapshot the queue's
    # internal deque instead of draining it with get()
    for ele in list(ReadQueue.queue):
        print(ele, file=sys.stderr, flush=True)
        processed_chunks.append(ele)

    # each thread will send both the inference result
    # as well as the chunk id which is used to sort
    # the ReadQueue, allowing for asynchronous processing
    processed_chunks.sort(key=lambda p: p.get('index'))

    # because each chunk is processed discretely, each word will have a time
    # value within the range of zero and CHUNK_SIZE in seconds represented by a float value.
    # To accurately associate each word with its proper time within the media, and not within
    # the range described above, we need to add an offset to all tokens in each segment
    # - excluding the zeroth segment as it needs no adjustment.
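    # Worked example (hypothetical numbers): with CHUNK_SIZE = 20000 ms the
    # per-segment offset is 20.0 s, so a word at time 1.5 s inside the third
    # chunk (index 2) lands at 2 * 20.0 + 1.5 = 41.5 s in the full recording.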

    i = 0
    adjusted_tokens = []
    offset = CHUNK_SIZE / 1000  # chunk length in seconds
    current_offset = 0
    if len(processed_chunks) > 1:
        while i < len(processed_chunks):
            current_item = processed_chunks[i].get('result')
            # a chunk with no speech in it (such as instrumental music) has an
            # empty result; skip its tokens but still advance the offset
            if len(current_item) != 0:
                j = 0
                while j < len(current_item):
                    current_word = current_item[j]
                    current_word['time'] = current_word['time'] + current_offset
                    j = j + 1
                    adjusted_tokens.append(current_word)
            # advance unconditionally so an empty chunk cannot stall the loop
            i = i + 1
            current_offset = current_offset + offset

    else:
        print("done with file; took{}".format(time.time() - inference_time),
              file=sys.stderr,
              flush=True)
        return json.dumps(processed_chunks[0].get('result'))

    print("done with file; took{}".format(time.time() - inference_time),
          file=sys.stderr,
          flush=True)

    return json.dumps(adjusted_tokens)
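
ChunkWorker is not defined in this snippet. The sketch below is a hypothetical reconstruction from its call sites: it assumes each job is a {'chunk': Frame, 'index': int} dict, that index -1 is the stop sentinel, and that each result is a list of {'word': ..., 'time': ...} tokens as the offset arithmetic above expects. The tokens_from_metadata helper is made up; how DeepSpeech metadata is flattened into word/time pairs depends on the version in use.

import threading
import numpy as np

class ChunkWorker(threading.Thread):
    """Hypothetical worker: drains jobs from write_queue, runs inference,
    and pushes {'index': ..., 'result': ...} dicts onto read_queue."""

    def __init__(self, write_queue, read_queue, model):
        super().__init__()
        self.write_queue = write_queue
        self.read_queue = read_queue
        self.model = model

    def run(self):
        while True:
            job = self.write_queue.get()
            if job['index'] == -1:  # sentinel: no more work
                break
            audio = np.frombuffer(job['chunk'].bytes, dtype=np.int16)
            metadata = self.model.sttWithMetadata(audio, 1)
            result = tokens_from_metadata(metadata)  # hypothetical helper
            self.read_queue.put({'index': job['index'], 'result': result})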