Example #1
    def client_reset(self):
        # Jack setup
        self.client = jack.Client('JackAudioSink')
        self.client.blocksize = self.block_size
        self.tmp_buf = np.zeros(self.block_size)
        self.tmp_buf_pos = 0
        self.is_active = False

        # Debug
        self.start_time = 0
        self.sample_count = 0
        self.xrun_count = 0

        self.port = self.client.outports.register('audio_out')

        # Make sure sample rate works and set multiplier
        if not self.allow_fractional_resample:
            if self.client.samplerate % self.orig_sample_rate != 0:
                raise ValueError(
                    "OS sample rate " + str(self.client.samplerate) +
                    " must be evenly divisible by given sample rate " +
                    str(self.orig_sample_rate))
            self.sample_multiplier = int(self.client.samplerate /
                                         self.orig_sample_rate)
        else:
            self.sample_multiplier = float(self.client.samplerate /
                                           self.orig_sample_rate)
        self.resampler = samplerate.Resampler('sinc_best', channels=1)

        # Callback setup
        self.client.set_process_callback(self.process)
        self.client.set_xrun_callback(self.xrun)
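
The process and xrun callbacks registered above are not shown in this example. A minimal sketch of what they might look like with the jack-client API; the buffer handling here is an assumption, not the original implementation:

    def process(self, frames):
        buf = self.port.get_array()  # numpy view of the JACK output buffer
        n = min(frames, self.tmp_buf_pos)
        buf[:n] = self.tmp_buf[:n]   # play whatever has been buffered
        buf[n:] = 0.0                # pad any underrun with silence
        self.tmp_buf_pos = 0
        self.sample_count += frames

    def xrun(self, delayed_usecs):
        self.xrun_count += 1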
Example #2
def resample(data,
             source_to_target_ratio,
             ZSCORE,
             resample_method='sinc_best',
             N_channels_max=128):

    ######################
    # If downsampling by an integer factor, could we just anti-alias and
    # subsample? (see the sketch after this example)
    ######################

    # 128 is the max for the underlying library
    N_channels_max = min(N_channels_max, 128)
    N_channels = data.shape[1]
    data_mat = None

    for i0 in np.arange(0, N_channels, N_channels_max):
        iF = np.min((i0 + N_channels_max, N_channels))
        resampler = samplerate.Resampler(resample_method, channels=iF - i0)
        data_chunk = resampler.process(data[:, i0:iF],
                                       1 / source_to_target_ratio,
                                       end_of_input=True)
        data_mat = (data_chunk if data_mat is None else np.concatenate(
            (data_mat, data_chunk), axis=1))
    if ZSCORE:
        data_mat = zscore(data_mat)

    return data_mat
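
The banner comment above asks whether an integer-factor downsample could skip libsamplerate entirely. One way to realize that shortcut, sketched with scipy.signal.decimate (an assumption; the example itself sticks to libsamplerate):

from scipy.signal import decimate

def downsample_integer(data, factor):
    # Anti-alias filter plus subsample along the time axis, for the
    # special case where source_to_target_ratio is a whole number
    return decimate(data, factor, axis=0, zero_phase=True)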
Example #3
def fn(config, inq, outq):
    import numpy
    import samplerate
    resampler = samplerate.Resampler('sinc_best', channels=1)
    ratio = config['ratio']  # resampling ratio (assumed to be supplied via config)
    while True:
        x = inq.get()
        y = resampler.process(x, ratio)
        z = numpy.array(y, dtype="<h")  # convert to little-endian int16
        outq.put(z)
Example #4
    def process_samples(self):
        resampler = samplerate.Resampler('sinc_fastest', channels=2)
        # Feed a block of silence through first to warm up the converter state
        resampler.process(np.zeros((2048, 2)),
                          self.sample_rate / self.client.samplerate)
        while True:
            self.output_data(
                resampler.process(self.sample_pipe_out.recv().T,
                                  self.sample_rate / self.client.samplerate) *
                self.gain)
Example #5
    def audio_runner(self):
        """Thread for getting data from the microphone"""

        # Find matching audio device
        p = pyaudio.PyAudio()
        self.api_id = None
        self.device_id = None
        num_apis = p.get_host_api_count()

        for j in range(0, num_apis):
            info = p.get_host_api_info_by_index(j)
            numdevices = info.get('deviceCount')
            for i in range(0, numdevices):
                if (p.get_device_info_by_host_api_device_index(j, i).get('maxInputChannels')) > 1:
                    device_name = p.get_device_info_by_host_api_device_index(j, i).get('name')
                    if self.device_name_filter in device_name:
                        self.api_id = j
                        self.device_id = i
                        print("Found device with id " + str(self.api_id) + "." + str(self.device_id) + ": " + device_name)
                        
        if self.device_id is None:
            print("No devices found that match filter. Device list:")
            for j in range(0, num_apis):
                info = p.get_host_api_info_by_index(j)
                numdevices = info.get('deviceCount')
                for i in range(0, numdevices):
                    dev_info = p.get_device_info_by_host_api_device_index(j, i)
                    #if (dev_info.get('maxInputChannels')) > 1:
                    device_name = dev_info.get('name')
                    supported_rates = self.test_sample_rates(p, i)
                    print("*", device_name, "channels:", dev_info.get('maxInputChannels'), "rates:", supported_rates)
            raise AssertionError("No device.")
        
        # Find the best sample rate (a sketch of test_sample_rates follows this example)
        self.sample_rate = min(self.test_sample_rates(p, self.device_id))
        resample_ratio = 16000.0 / self.sample_rate  # libsamplerate ratio is output/input
        resampler = samplerate.Resampler('sinc_fastest', channels=2)
        
        # Open stream
        stream = p.open(
            format=pyaudio.paInt16,
            channels=2,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=256,
            input_device_index=self.device_id,
        )

        # Record
        while not self.stop_process.value:
            data = stream.read(256, exception_on_overflow=False)
            samples = np.frombuffer(data, dtype='int16').astype('float').reshape(256, 2)
            samples_res = resampler.process(samples, resample_ratio)  # downsample to 16 kHz
            self.frame_pipe_in.send(samples_res)
        p.terminate()
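
test_sample_rates is called here but not shown. A plausible sketch built on PyAudio's is_format_supported; the candidate rate list is an assumption:

    def test_sample_rates(self, p, device_id,
                          candidates=(16000, 32000, 44100, 48000, 96000)):
        # Probe which input rates the device accepts; is_format_supported
        # raises ValueError for unsupported parameter combinations
        supported = []
        for rate in candidates:
            try:
                if p.is_format_supported(rate,
                                         input_device=device_id,
                                         input_channels=2,
                                         input_format=pyaudio.paInt16):
                    supported.append(rate)
            except ValueError:
                pass
        return supported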
Example #6
    def __init__(self, samplerate, channels,
                 master_settings: AudioOutputSettings):
        self.stream = sd.OutputStream(samplerate=samplerate,
                                      channels=channels,
                                      blocksize=0,
                                      dtype='float32')

        self.signals = AudioOutputSignals()
        self.settings = AudioOutputSettings()

        self.master_settings = master_settings

        self.resampler = sr.Resampler(channels=channels)
Example #7
def recognize_wav(model_name, wav_filename):
    """Генератор запросов на распознавание"""
    # Конфигурируем распознавание на модель model_name
    yield AsrService_pb2.RecognizeRequest(
        config=AsrService_pb2.RecognitionConfig(
            auth=AsrService_pb2.Auth(client_id=args.client_id,
                                     domain_id=args.domain_id,
                                     api_key=args.api_key),
            model=base_pb2.Model(id=model_name)))

    model_samplerate = (8000 if 'phone_call' in model_name.lower()
                        or 'ivr' in model_name.lower() else 16000)

    # Read the wav file in 0.2-second chunks
    with sf.SoundFile(wav_filename, mode='r') as wav:
        wav_samplerate = wav.samplerate
        samples_to_read = int(wav_samplerate / 5)

        # Resampling settings
        # Converter type:
        # sinc_best = 0, sinc_medium = 1, sinc_fastest = 2, zero_order_hold = 3, linear = 4
        # Type descriptions: http://www.mega-nerd.com/libsamplerate/api_misc.html

        resampler = sr.Resampler(converter_type="sinc_best")
        resampling_ratio = float(model_samplerate) / wav_samplerate
        print("resampling_ratio: {}\n".format(resampling_ratio))

        while wav.tell() < wav.frames:
            wave_data = wav.read(samples_to_read, dtype="int16")
            sound_for_recognition = wave_data

            # Only resample if the model and file sample rates differ
            if resampling_ratio != 1.0:
                resampled_data = resampler.process(
                    wave_data,
                    resampling_ratio,
                    end_of_input=(wav.tell() >= wav.frames)).astype(np.int16)
                sound_for_recognition = resampled_data

            # Send the audio for recognition
            yield AsrService_pb2.RecognizeRequest(sound=AsrService_pb2.Sound(
                samples=sound_for_recognition.tobytes()))

            time.sleep(0.2)

    # Finish recognition
    yield AsrService_pb2.RecognizeRequest(finish=base_pb2.Finish())
Example #8
def audio_player(speed_factor=1.0):
    # Perform late imports.
    # These can fail on Linux machines that don't have portaudio or libsamplerate installed.
    import samplerate
    import sounddevice

    resampler = samplerate.Resampler("linear", channels=2)
    audio_out = AudioOut(resampler, speed_factor)
    with sounddevice.OutputStream(
            samplerate=audio_out.output_rate,
            dtype="int16",
            channels=2,
            latency="low",
            blocksize=audio_out.buffer_size,
            callback=audio_out.stream_callback,
    ):
        yield audio_out
Example #9
    def _make_resampler(self, actual, preferred):
        try:
            import numpy
            import samplerate
        except ImportError:
            self._logger.warning(
                "samplerate not installed; expect glitches during playback")

            async def resample(input_queue, output_queue):
                while True:
                    data = await input_queue.get()
                    await output_queue.put(data)
                    if not data:
                        break

            return resample, actual

        resampler = samplerate.Resampler()

        def resample_worker(input_data, end):
            # Interpret the bytes as little-endian unsigned 16-bit samples,
            # center them around zero, and scale into [-1.0, 1.0)
            input_array = numpy.frombuffer(input_data, dtype="<u2")
            input_array = (input_array.astype(numpy.float32) - 32768) / 32768
            output_array = resampler.process(input_array,
                                             ratio=preferred / actual,
                                             end_of_input=end)
            # Convert back to unsigned 16-bit PCM bytes
            output_array = (output_array * 32768 + 32768).astype(numpy.uint16)
            return output_array.tobytes()

        async def resample(input_queue, output_queue):
            while True:
                input_data = await input_queue.get()
                output_data = await asyncio.get_running_loop().run_in_executor(
                    None, resample_worker, input_data, not input_data)
                if output_data:
                    await output_queue.put(output_data)
                if not input_data:
                    await output_queue.put(b"")
                    break

        return resample, preferred
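
A hedged sketch of how the coroutine returned by _make_resampler might be driven, using asyncio queues and the empty-bytes end-of-stream convention the code checks for (the demo input is made up):

import asyncio

async def demo(resample):
    inq, outq = asyncio.Queue(), asyncio.Queue()
    task = asyncio.create_task(resample(inq, outq))
    await inq.put(b"\x00\x80" * 256)  # one block of 16-bit silence
    await inq.put(b"")                # empty input signals end of stream
    while True:
        data = await outq.get()
        if not data:
            break
    await task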
Example #10
def recognize_from_micro(model_name, block_size_ms):
    # Configure recognition for the model model_name and its sample rate
    yield AsrService_pb2.RecognizeRequest(config=AsrService_pb2.RecognitionConfig(
        auth=AsrService_pb2.Auth(
            client_id=args.client_id,
            domain_id=args.domain_id,
            api_key=args.api_key
        ),
        model=base_pb2.Model(id=model_name)))

    model_samplerate = 8000 if 'phone_call' in model_name.lower() or 'ivr' in model_name.lower() else 16000

    # Get the sample rate of the default microphone
    microphone_samplerate = 0
    with create_default_input_stream(user_callback=test_callback) as input_device:
        microphone_samplerate = input_device.samplerate

    # Queue of audio data
    queue = Queue()

    # Callback that handles each chunk of audio arriving from the microphone
    def capture_sound_callback(indata, frames, time, status):
        if status:
            print(status, flush=True)
        queue.put(indata.copy())

    # How many samples to grab from the microphone per block
    bsize_micro = int(microphone_samplerate * block_size_ms / 1000.0)

    # For resampling the audio
    screen.addstr("microphone samplerate: {} \n".format(microphone_samplerate))
    screen.refresh()
    resampling_ratio = float(model_samplerate) / microphone_samplerate
    resampler = sr.Resampler(converter_type="sinc_best")

    screen.addstr("Press Q for quit\n\n")
    screen.refresh()

    # Also write to a file in parallel, to verify that audio capture works
    with sf.SoundFile("recorded.wav", mode="w", samplerate=int(model_samplerate),
                      channels=1, subtype="PCM_16") as file:

        with create_default_input_stream(user_blocksize=bsize_micro, user_callback=capture_sound_callback):
            while True:
                raw_sound_from_microphone = queue.get()

                # Pick a single channel and resample to the speech model's target rate
                _, cols = raw_sound_from_microphone.shape
                raw_sound = raw_sound_from_microphone[:, :1] if cols > 1 else raw_sound_from_microphone
                sound_for_recognition = \
                    resampler.process(raw_sound, resampling_ratio).astype(np.int16)

                # Send the audio for recognition
                yield AsrService_pb2.RecognizeRequest(
                    sound=AsrService_pb2.Sound(samples=sound_for_recognition.tobytes()))

                file.write(sound_for_recognition)

                # Stop recording when the key is pressed
                if keyboard.is_pressed("q"):
                    break

    # Finish recognition
    yield AsrService_pb2.RecognizeRequest(finish=base_pb2.Finish())
Example #11
def main():
    parser = argparse.ArgumentParser(
        description='Rearrange a sound file to match another')
    parser.add_argument('target', type=str, help='the sound file to recreate')
    parser.add_argument('palette',
                        type=str,
                        help='the sound file to recreate it from')
    parser.add_argument('out', type=str, help='the sound file to output')
    parser.add_argument(
        '--chunk-length',
        type=int,
        help='the length in milliseconds of each chunk the sound files are divided into',
        default=5)
    parser.add_argument('--seed',
                        type=int,
                        help='the seed of the random number generator')
    parser.add_argument(
        '--max-swap-fails',
        type=int,
        default=250,
        help='the maximum number of failed attempts to improve the quality')
    args = parser.parse_args(sys.argv[1:])

    random.seed(args.seed)

    target_data, target_samplerate = soundfile.read(args.target,
                                                    always_2d=True,
                                                    dtype='float32')
    palette_data, palette_samplerate = soundfile.read(args.palette,
                                                      always_2d=True,
                                                      dtype='float32')

    if palette_samplerate != target_samplerate:
        print('Resampling palette...')
        progress_bar = progressbar.ProgressBarThread()
        progress_bar.start()

        resampler = libsamplerate.Resampler(
            libsamplerate.converters.ConverterType.sinc_best, channels=2)
        palette_data = resampler.process(
            palette_data, target_samplerate / palette_samplerate,
            end_of_input=True)

        progress_bar.stop()

    sample_rate = target_samplerate
    del target_samplerate
    del palette_samplerate

    print('Chopping up sound files...')
    progress_bar = progressbar.ProgressBarThread()
    progress_bar.start()

    samples_per_chunk = sample_rate * args.chunk_length // 1000
    target_chunks = divide_sound_file(target_data, samples_per_chunk)
    palette_chunks = divide_sound_file(palette_data, samples_per_chunk)

    progress_bar.stop()

    print('Moulding palette...')
    progress_bar = progressbar.ProgressBarThread()
    progress_bar.start()

    # occ = {}
    result_chunks = stretch_palette(palette_chunks, len(target_chunks))
    assert len(result_chunks) == len(target_chunks)

    progress_bar.stop()

    print('Normalizing chunks...')
    progress_bar = progressbar.ProgressBarThread(2 * len(target_chunks))
    progress_bar.start()

    for i, chunk in enumerate(target_chunks):
        length, num_channels = chunk.shape
        if length < samples_per_chunk:
            target_chunks[i] = numpy.append(chunk, [[0, 0]] *
                                            (samples_per_chunk - length),
                                            axis=0)
            assert target_chunks[i].shape == (samples_per_chunk, 2)
        progress_bar.progress(i)

    for i, chunk in enumerate(result_chunks):
        length, num_channels = chunk.shape
        if length < samples_per_chunk:
            result_chunks[i] = numpy.append(chunk, [[0, 0]] *
                                            (samples_per_chunk - length),
                                            axis=0)
            assert result_chunks[i].shape == (samples_per_chunk, 2)
        progress_bar.progress(i)

    progress_bar.stop()

    print('Maximizing sound quality...')
    progress_bar = progressbar.ProgressBarThread(args.max_swap_fails)
    progress_bar.start()

    num_failed_swaps = 0
    num_failed_swaps_hist = [num_failed_swaps]
    total_swaps = 0
    total_iters = 0
    while num_failed_swaps < args.max_swap_fails:
        i, k = random.sample(range(0, len(target_chunks)), k=2)
        cdiff_i = ((result_chunks[i] - target_chunks[i])**2).mean()
        cdiff_k = ((result_chunks[k] - target_chunks[k])**2).mean()
        cdiff = cdiff_i + cdiff_k
        ndiff_i = ((result_chunks[k] - target_chunks[i])**2).mean()
        ndiff_k = ((result_chunks[i] - target_chunks[k])**2).mean()
        ndiff = ndiff_i + ndiff_k
        if ndiff < cdiff:
            num_failed_swaps_hist = num_failed_swaps_hist[-args.max_swap_fails:]
            num_failed_swaps_hist.append(num_failed_swaps)
            num_failed_swaps = 0
            total_swaps += 1
            result_chunks[i], result_chunks[k] = result_chunks[k], result_chunks[i]
        else:
            num_failed_swaps += 1
        progress_bar.progress(numpy.average(num_failed_swaps_hist),
                              's: {}, t: {}'.format(total_swaps, total_iters))
        total_iters += 1

    progress_bar.stop()

    soundfile.write(args.out, numpy.concatenate(result_chunks), sample_rate)
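
divide_sound_file and stretch_palette are used above but not shown. A plausible sketch of the former, inferred from how the chunks are padded and compared later (an assumption, not the original code):

def divide_sound_file(data, samples_per_chunk):
    # Split a (samples, channels) array into consecutive time chunks;
    # the last chunk may be shorter and is zero-padded by the caller
    return [data[i:i + samples_per_chunk]
            for i in range(0, len(data), samples_per_chunk)]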
Example #12
    def __init__(self, input_device):
        logger.debug('MicrophoneStream INIT')
        self._audio_interface = pyaudio.PyAudio()
        self._inputDeviceName = input_device
        self._inputDeviceIndex = None
        self._num_channels = 1
        self._format = pyaudio.paInt16
        self._outputSampleRate = SAMPLERATE

        self.meterQueue = None
        self.meter_peak_np = np.empty(0, dtype=np.int16)
        self.meter_time = float(0)

        self.recordingFilename = None

        numdevices = self._audio_interface.get_default_host_api_info().get('deviceCount')
        defaultHostAPIindex = self._audio_interface.get_default_host_api_info().get('index')
        defaultInputDeviceIndex = self._audio_interface.get_default_input_device_info().get('index')
        defaultInputDeviceName = self._audio_interface.get_default_input_device_info().get('name')

        for i in range(0, numdevices):
            inputDevice = self._audio_interface.get_device_info_by_host_api_device_index(
                defaultHostAPIindex, i)
            if self._inputDeviceName == inputDevice.get('name'):
                self._inputDeviceIndex = inputDevice.get('index')
                break

        if self._inputDeviceIndex is None:
            self._inputDeviceName = defaultInputDeviceName
            self._inputDeviceIndex = defaultInputDeviceIndex

        deviceInfo = self._audio_interface.get_device_info_by_index(
            self._inputDeviceIndex)
        self._rate = int(deviceInfo.get('defaultSampleRate'))

        try:
            if self._audio_interface.is_format_supported(
                    self._outputSampleRate,
                    input_device=self._inputDeviceIndex,
                    input_channels=self._num_channels,
                    input_format=self._format):
                self._rate = self._outputSampleRate
        except ValueError:
            pass

        self.resampler = sr.Resampler()
        self.resampler_ratio = self._outputSampleRate / self._rate

        self._chunk_size = int(self._rate / 10)

        self._wavfile = None

        # Create a thread-safe buffer of audio data
        self._streamBuff = queue.Queue()
        self._recordingBuff = queue.Queue()
        self.closed = True

        # 2 bytes per sample for 16-bit audio
        self._bytes_per_sample = 2 * self._num_channels
        self._bytes_per_second = self._rate * self._bytes_per_sample

        self._bytes_per_chunk = self._chunk_size * self._bytes_per_sample
        self._chunks_per_second = self._bytes_per_second // self._bytes_per_chunk
Example #13
File: listen.py Project: Anfa-am/q
import asyncio
import noisereduce as nr
import numpy as np
import samplerate as sr
from pyaudio import PyAudio, Stream, paInt16
from contextlib import asynccontextmanager, contextmanager, AsyncExitStack
from typing import AsyncGenerator, Generator
from threading import Thread
import engines.blockout

server = 'localhost:2700'
loop = asyncio.get_event_loop()
listening = True
recording = True

denoiser = engines.blockout.RNNoise()
resampler = sr.Resampler()


def listen1():
    global recording
    while True:
        try:
            user_input = input("Type something to begin...\n")

            if user_input == 'start':
                recording = True
                start()

            if user_input == 'stop':
                print('stopping')
                recording = False
Example #14
def test_match(data, converter_type, ratio=2.0):
    num_channels, input_data = data
    output_simple = samplerate.resample(input_data, ratio, converter_type)
    resampler = samplerate.Resampler(converter_type, channels=num_channels)
    output_full = resampler.process(input_data, ratio, end_of_input=True)
    assert np.allclose(output_simple, output_full)
Example #15
def test_process(data, converter_type, ratio=2.0):
    num_channels, input_data = data
    src = samplerate.Resampler(converter_type, num_channels)
    src.process(input_data, ratio)
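
The two tests above exercise the simple API and the full API; python-samplerate also provides a callback API. A minimal sketch of the same round trip through it (the producer logic is an assumption):

def test_callback(data, converter_type, ratio=2.0):
    num_channels, input_data = data
    done = [False]

    def producer():
        # Hand over the whole buffer once, then signal end of input
        if done[0]:
            return None
        done[0] = True
        return input_data

    with samplerate.CallbackResampler(producer, ratio, converter_type,
                                      channels=num_channels) as src:
        output = src.read(int(ratio * len(input_data)))
    assert len(output) > 0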
Example #16
def decode_chunked_partial_endpointing_mic(
        asr,
        feat_info,
        decodable_opts,
        paudio,
        input_microphone_id,
        channels=1,
        samp_freq=16000,
        record_samplerate=16000,
        chunk_size=1024,
        wait_for_start_command=False,
        record_message_history=False,
        compute_confidences=True,
        asr_client=None,
        speaker_str="Speaker",
        resample_algorithm="sinc_best",
        save_debug_wav=False,
        use_threads=False,
        minimum_num_frames_decoded_per_speaker=5,
        mic_vol_cutoff=0.5,
        use_local_mic=True,
        decode_control_channel='asr_control',
        audio_data_channel='asr_audio'):

    # Subscribe to command and control redis channel
    p = red.pubsub()
    p.subscribe(decode_control_channel)

    if not use_local_mic:
        pa = red.pubsub()
        pa.subscribe(audio_data_channel)

    # Figure out if we need to resample (TODO: channels does not seem to work)
    need_resample = False
    if record_samplerate != samp_freq:
        print(
            "Activating resampler since record and decode samplerate are different:",
            record_samplerate, "->", samp_freq)
        resampler = samplerate.Resampler(resample_algorithm, channels=channels)
        need_resample = True
        ratio = samp_freq / record_samplerate
        print("Resample ratio:", ratio)

    # Initialize Python/Kaldi bridge
    print("Constructing decoding pipeline")
    adaptation_state = OnlineIvectorExtractorAdaptationState.from_info(
        feat_info.ivector_extractor_info)
    key = 'mic' + str(input_microphone_id)
    feat_pipeline, sil_weighting = initNnetFeatPipeline(
        adaptation_state, asr, decodable_opts, feat_info)
    print("Done")

    speaker = speaker_str.replace("#c#", "0")
    last_chunk = False
    utt, part = 1, 1
    prev_num_frames_decoded, offset_complete = 0, 0
    chunks_decoded = 0
    num_chunks = 0
    blocks = []
    rawblocks = []

    if use_local_mic:
        # Open microphone channel
        print("Open microphone stream with id" + str(input_microphone_id) +
              "...")
        stream = paudio.open(format=pyaudio.paInt16,
                             channels=channels,
                             rate=record_samplerate,
                             input=True,
                             frames_per_buffer=chunk_size,
                             input_device_index=input_microphone_id)
        print("Done!")

    do_decode = not wait_for_start_command
    need_finalize = False
    resend_previous_waveform = False
    block, previous_block = None, None
    decode_future = None

    # Send event (with redis) to the front that ASR session is ready
    asr_client.asr_ready(speaker=speaker)

    # Initialize a ThreadPoolExecutor.
    # Note that we create the thread executor regardless of whether we actually use it later (the -t option).
    # At the end of this loop there are two code paths, one that uses a computation future (with -t) and one without it.
    with ThreadPoolExecutor(max_workers=1) as executor:
        while not last_chunk:
            # Check if there is a message from the redis server first (non-blocking!); if there is no new message, msg is simply None.
            msg = p.get_message()

            # Check for externally sent control commands
            if msg is not None:
                print('msg:', msg)
                if msg['data'] == b"start":
                    print('Start command received!')
                    do_decode = True
                    asr_client.sendstatus(isDecoding=do_decode)

                elif msg['data'] == b"stop":
                    print('Stop command received!')
                    if do_decode and prev_num_frames_decoded > 0:
                        need_finalize = True
                    do_decode = False
                    asr_client.sendstatus(isDecoding=do_decode)

                elif msg['data'] == b"shutdown":
                    print('Shutdown command received!')
                    last_chunk = True

                elif msg['data'] == b"status":
                    print('Status command received!')
                    asr_client.sendstatus(isDecoding=do_decode)

                elif msg['data'] == b"reset_timer":
                    print('Reset timer command received!')
                    asr_client.resetTimer()

            if use_local_mic:
                # We always consume from the microphone stream, even if we do not decode
                block_raw = stream.read(chunk_size,
                                        exception_on_overflow=False)
                npblock = np.frombuffer(block_raw, dtype=np.int16)
            else:
                block_audio_redis_msg = next(pa.listen())
                if (block_audio_redis_msg['type'] == "subscribe"
                        and block_audio_redis_msg["data"] == 1):
                    print('audio msg:', block_audio_redis_msg)
                    print("Successfully connected to redis audio stream!")
                    continue
                else:
                    npblock = np.frombuffer(block_audio_redis_msg['data'],
                                            dtype=np.int16)
                    # print("audio data: ", npblock)

            # Resample the block if necessary, e.g. 48kHz -> 16kHz
            if need_resample:
                block = resampler.process(np.array(npblock, copy=True), ratio)
                block = np.array(block, dtype=np.int16)
            else:
                block = npblock

            # Only save the wav if save_debug_wav is enabled (TODO: does not seem to work with multiple channels)
            if save_debug_wav:
                blocks.append(block)
                rawblocks.append(npblock)

            # Block on the result of the decode if one is pending
            if use_threads and do_decode and block is not None and decode_future is not None:

                # This call blocks until the result is ready
                need_endpoint_finalize, prev_num_frames_decoded, part, utt = \
                    decode_future.result()

                # Check if we need to finalize, disallow endpoint without a single decoded frame
                if need_endpoint_finalize and prev_num_frames_decoded > 0:
                    need_finalize = True
                    resend_previous_waveform = True
                    print("prev_num_frames_decoded:", prev_num_frames_decoded)

                if need_endpoint_finalize and prev_num_frames_decoded == 0:
                    print(
                        "WARN need_endpoint_finalize and prev_num_frames_decoded == 0"
                    )

            # Finalize the decoding here if endpointing signaled that we should start a new utterance.
            # We may also need to finalize when switching from do_decode=True to do_decode=False (the user starts/stops decoding from the frontend).
            if need_finalize and block is not None and prev_num_frames_decoded > 0:
                print("prev_num_frames_decoded:", prev_num_frames_decoded)
                out, confd = finalize_decode(asr, asr_client, key, part,
                                             speaker, utt)
                feat_pipeline, sil_weighting = reinitialize_asr(
                    adaptation_state, asr, feat_info, feat_pipeline)
                utt += 1
                part = 1

                if resend_previous_waveform and previous_block is not None:
                    # We always resend the last block for the new utterance (we only know that the endpoint is inside of a chunk, but not where exactly)
                    feat_pipeline.accept_waveform(samp_freq,
                                                  Vector(previous_block))
                    resend_previous_waveform = False

                need_finalize = False

                prev_num_frames_decoded = 0

            # If we operate on multichannel data, select the channel here that has the highest volume
            # (with some added heuristic, only change the speaker if the previous speaker was active for minimum_num_frames_decoded_per_speaker many frames)
            if channels > 1:
                block = np.reshape(block, (-1, channels))

                # Select loudest channel
                volume_norms = []
                for i in range(channels):
                    # We use a simplified notion of loudness: the L2 norm of the chunk interpreted as a vector (sqrt of the sum of squares).
                    # This has nothing to do with physical loudness.

                    volume_norms.append(
                        np.linalg.norm(block[:, i] / 65536.0) * 10.0)
                    #print("|" * int(volume_norm))

                #print('vols:', volume_norms)

                volume_norms = [
                    0.0 if elem < mic_vol_cutoff else elem
                    for elem in volume_norms
                ]

                volume_norm = max(volume_norms)
                max_channel = volume_norms.index(volume_norm)
                block = block[:, max_channel]

                new_speaker = speaker_str.replace("#c#", str(max_channel))

                #print('vols:',volume_norms, 'max:',max_channel, 'value:',volume_norm)

                if sum(volume_norms) > 1e-10 and new_speaker != speaker \
                        and prev_num_frames_decoded >= minimum_num_frames_decoded_per_speaker:
                    print(
                        "Speaker change! Number of frames decoded for previous speaker:",
                        str(prev_num_frames_decoded))

                    speaker = new_speaker

                    need_finalize = True
                    resend_previous_waveform = True

                    #prev_num_frames_decoded = 0
            else:
                volume_norm = np.linalg.norm(block / 65536.0) * 10.0

            num_chunks += 1

            # Send a status beacon periodically (to the frontend, so it knows we are alive)
            if num_chunks % 50 == 0:
                asr_client.sendstatus(isDecoding=do_decode)

            if do_decode:
                # In unthreaded mode, we block on the computation here in this loop
                if not use_threads:
                    need_endpoint_finalize, prev_num_frames_decoded, part, utt = advance_mic_decoding(
                        adaptation_state, asr, asr_client, block,
                        chunks_decoded, feat_info, feat_pipeline, key,
                        last_chunk, part, prev_num_frames_decoded, samp_freq,
                        sil_weighting, speaker, utt)
                    # Check if we need to finalize, disallow endpoint without a single decoded frame
                    if need_endpoint_finalize and prev_num_frames_decoded > 0:
                        need_finalize = True
                        resend_previous_waveform = True
                        print("prev_num_frames_decoded:",
                              prev_num_frames_decoded)

                else:
                    # In threaded mode, we submit a non blocking computation request to the thread executor
                    decode_future = executor.submit(
                        advance_mic_decoding, adaptation_state, asr,
                        asr_client, block, chunks_decoded, feat_info,
                        feat_pipeline, key, last_chunk, part,
                        prev_num_frames_decoded, samp_freq, sil_weighting,
                        speaker, utt)
            else:
                time.sleep(0.001)

            previous_block = block

    # Record the message history as a self-contained Python file that can be used as a standalone replay
    if record_message_history:
        with open('message_history_replay.py', 'w') as message_history_out:
            message_history_out.write(asr_client.message_trace)
    else:
        print(
            "Not writing record message history since --record_message_history is not set."
        )

    # Write debug wav as output file (will only be executed after shutdown)
    if save_debug_wav:
        print("Saving debug output...")
        wavefile.write("debug.wav", samp_freq, np.concatenate(blocks,
                                                              axis=None))
        wavefile.write("debugraw.wav", record_samplerate,
                       np.concatenate(rawblocks, axis=None))
    else:
        print(
            "Not writing debug wav output since --save_debug_wav is not set.")

    # Now shut down the pipeline: compute MBR for the final utterance and complete it.
    print("Shutdown: finalizing ASR output...")
    asr.finalize_decoding()
    out = asr.get_output()
    mbr = MinimumBayesRisk(out["lattice"])
    confd = mbr.get_one_best_confidences()
    print(out)
    # print(key + "-utt%d-final" % utt, out["text"], flush=True)
    if asr_client is not None:
        asr_client.completeUtterance(utterance=out["text"],
                                     key=key + "-utt%d-part%d" % (utt, part),
                                     confidences=confd,
                                     speaker=speaker)
        asr_client.sendstatus(isDecoding=False, shutdown=True)
    print("Done, will exit now.")
Example #17
sample_format = pyaudio.paInt16  # assumed; defined earlier in the original script
channels = 2                     # assumed channel count
samp_rate = 48000                # assumed capture rate (Hz)
target_rate = 16000              # assumed output rate (Hz)
chunk = 4096                     # assumed frames per buffer
record_secs = 5                  # assumed recording length (seconds)
dev_index = 2 # device index found by p.get_device_info_by_index(ii)
wav_output = 'test1.wav' # name of .wav file
converter = 'sinc_best'  # or 'sinc_fastest', ...
audio = pyaudio.PyAudio() # create pyaudio instantiation

# create pyaudio stream
stream = audio.open(format=sample_format,
                    channels=channels,
                    rate=samp_rate,
                    frames_per_buffer=chunk,
                    input_device_index=dev_index,
                    input=True)
print("recording")

resampler = sr.Resampler(converter, channels)
ratio = target_rate / samp_rate

raw_frames = []
frames = []
# loop through stream and append audio chunks to frame array
for i in range(0, int(samp_rate / chunk * record_secs)):
    raw_data = stream.read(chunk, exception_on_overflow=False)
    #raw_frames.append(raw_data)
    frames.append(raw_data)
print("finished recording")
# stop the stream, close it, and terminate the pyaudio instantiation
stream.stop_stream()
stream.close()
audio.terminate()
#print("start encoding")