def __init__(self, user_connection):
    self.__resampler = AudioResampler(format='s16', layout='mono', rate=48000)
    self.__pc = user_connection
    self.__audio_task = None
    self.__track = None
    self.__channel = None
    self.__recognizer = KaldiRecognizer(model, 48000)
Example #2
    def __init__(self, user_connection, kaldi_server):
        self.__resampler = AudioResampler(format='s16',
                                          layout='mono',
                                          rate=kaldi_server.samplerate)

        self.__pc = user_connection
        self.__audio_task = None
        self.__text_task = None

        self.__ks = kaldi_server
        self.__kaldi_reader = None
        self.__kaldi_writer = None

        self.__channel = None
Example #3
class KaldiTask:
    def __init__(self, user_connection):
        self.__resampler = AudioResampler(format='s16', layout='mono', rate=8000)
        self.__pc = user_connection
        self.__audio_task = None
        self.__track = None
        self.__channel = None
        self.__recognizer = KaldiRecognizer(model, 8000)  # rate must match the resampler output

    async def set_audio_track(self, track):
        self.__track = track

    async def set_text_channel(self, channel):
        self.__channel = channel

    async def start(self):
        self.__audio_task = asyncio.create_task(self.__run_audio_xfer())

    async def stop(self):
        if self.__audio_task is not None:
            self.__audio_task.cancel()
            self.__audio_task = None

    async def __run_audio_xfer(self):
        while True:
            frame = await self.__track.recv()
            print ("Got frame", frame)
            frame = self.__resampler.resample(frame)
            data = frame.to_ndarray()
            response = await loop.run_in_executor(pool, process_chunk, self.__recognizer, data.tobytes())
            print ("Sending response", response)
            self.__channel.send(response)
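
The coroutine above relies on module-level helpers (loop, pool, process_chunk) that this excerpt does not show. A minimal sketch of what process_chunk could look like, assuming the Vosk-style KaldiRecognizer API (AcceptWaveform, Result, PartialResult):

def process_chunk(rec, message):
    # Feed raw 16-bit PCM bytes to the recognizer; return the final result
    # once an utterance is complete, otherwise the partial hypothesis.
    if rec.AcceptWaveform(message):
        return rec.Result()
    return rec.PartialResult()

Running it through loop.run_in_executor(pool, ...) keeps this blocking recognizer call off the event loop.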
Example #4
def play(self, address, sender=None):
    container = av.open(address)
    stream = next((stream for stream in container.streams
                   if stream.type == 'audio'), None)
    if stream is None:
        return
    self._stream = (container, stream)
    self._resampler = AudioResampler('s16p', 1, 48000)
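
play() only selects the audio stream and builds the resampler; the decode loop itself is not shown. A minimal sketch of how the frames could then be pulled and resampled with PyAV (the _read_frames name is hypothetical, and newer PyAV releases return a list from resample()):

def _read_frames(self):
    container, stream = self._stream
    for frame in container.decode(stream):
        # resample() returns a single frame in older PyAV and a list in newer releases
        resampled = self._resampler.resample(frame)
        for out in (resampled if isinstance(resampled, list) else [resampled]):
            yield out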
Example #5
    def __init__(self, user_connection, kaldi_server):
        self.__resampler = AudioResampler(format='s16',
                                          layout='mono',
                                          rate=kaldi_server.samplerate)

        self.__pc = user_connection
        self.__audio_task = None
        self.__text_task = None

        self.__ks = kaldi_server
        self.__kaldi_reader = None
        self.__kaldi_writer = None

        self.__channel = None

        self.__command = list()
        with open('/server/command.txt', 'r') as file:
            for line in file:
                self.__command.append(line.strip().split())
        print(self.__command)
Example #6
class KaldiTask:
    def __init__(self, user_connection):
        self.__resampler = AudioResampler(format='s16', layout='mono', rate=48000)
        self.__pc = user_connection
        self.__audio_task = None
        self.__track = None
        self.__channel = None
        self.__recognizer = KaldiRecognizer(model, 48000)


    async def set_audio_track(self, track):
        self.__track = track

    async def set_text_channel(self, channel):
        self.__channel = channel

    async def start(self):
        self.__audio_task = asyncio.create_task(self.__run_audio_xfer())

    async def stop(self):
        if self.__audio_task is not None:
            self.__audio_task.cancel()
            self.__audio_task = None

    async def __run_audio_xfer(self):
        dataframes = bytearray(b"")
        while True:
            frame = await self.__track.recv()
            frame = self.__resampler.resample(frame)
            max_frames_len = 8000
            message = frame.planes[0].to_bytes()
            recv_frames = bytearray(message)
            dataframes += recv_frames
            if len(dataframes) > max_frames_len:
                wave_bytes = bytes(dataframes)
                response = await loop.run_in_executor(pool, process_chunk, self.__recognizer, wave_bytes)
                print(response)
                self.__channel.send(response)
                dataframes = bytearray(b"")
Example #7
    def audio_encoding(self, codec):

        if codec not in av.codec.codecs_availible:
            raise SkipTest()

        encoder = av.Encoder(codec)
        if encoder.codec.experimental:
            raise SkipTest()

        sample_fmt = encoder.codec.audio_formats[-1].name

        sample_rate = 48000
        channel_layout = "stereo"
        channels = 2
        encoder.time_base = sample_rate
        encoder.sample_rate = sample_rate
        encoder.sample_fmt = sample_fmt
        encoder.channels = channels
        encoder.open()

        resampler = AudioResampler(sample_fmt, channel_layout, sample_rate)

        container = av.open(fate_suite('audio-reference/chorusnoise_2ch_44kHz_s16.wav'))
        audio_stream = next(s for s in container.streams if s.type == 'audio')
        path = self.sandboxed('encoder.%s' % codec)

        samples = 0
        packet_sizes = []

        test_bad = True

        with open(path, 'w') as f:
            for frame in iter_frames(container, audio_stream):

                if test_bad:
                    bad_resampler = AudioResampler(sample_fmt, "mono", sample_rate)
                    bad_frame = bad_resampler.resample(frame)
                    with self.assertRaises(ValueError):
                        next(encoder.encode(bad_frame))

                    bad_resampler = AudioResampler(sample_fmt, channel_layout, 3000)
                    bad_frame = bad_resampler.resample(frame)

                    with self.assertRaises(ValueError):
                        next(encoder.encode(bad_frame))

                    bad_resampler = AudioResampler('u8', channel_layout, 3000)
                    bad_frame = bad_resampler.resample(frame)

                    with self.assertRaises(ValueError):
                        next(encoder.encode(bad_frame))

                    test_bad = False

                resampled_frame = resampler.resample(frame)
                samples += resampled_frame.samples
                for new_packet in encoder.encode(resampled_frame):
                    # bytearray because Python can
                    # freak out if the first byte is NULL
                    f.write(bytearray(new_packet))
                    packet_sizes.append(new_packet.size)

            for new_packet in encoder.flush():
                packet_sizes.append(new_packet.size)
                f.write(bytearray(new_packet))

        decoder = av.Decoder(codec)
        decoder.time_base = sample_rate
        decoder.sample_rate = sample_rate
        decoder.sample_fmt = sample_fmt
        decoder.channels = channels
        decoder.open()

        result_samples = 0

        # should have more asserts, but not sure what to check:
        # libav and ffmpeg give different results,
        # so we can't really use checksums
        for frame in iter_raw_frames(path, packet_sizes, decoder):
            result_samples += frame.samples
            self.assertEqual(frame.rate, sample_rate)
            self.assertEqual(len(frame.layout.channels), channels)
Example #8
    def audio_encoding(self, codec_name):

        try:
            codec = Codec(codec_name, 'w')
        except UnknownCodecError:
            raise SkipTest()

        ctx = codec.create()
        if ctx.codec.experimental:
            raise SkipTest()

        sample_fmt = ctx.codec.audio_formats[-1].name
        sample_rate = 48000
        channel_layout = "stereo"
        channels = 2

        ctx.time_base = Fraction(1) / sample_rate
        ctx.sample_rate = sample_rate
        ctx.format = sample_fmt
        ctx.layout = channel_layout
        ctx.channels = channels

        ctx.open()

        resampler = AudioResampler(sample_fmt, channel_layout, sample_rate)

        container = av.open(
            fate_suite('audio-reference/chorusnoise_2ch_44kHz_s16.wav'))
        audio_stream = container.streams.audio[0]

        path = self.sandboxed('encoder.%s' % codec_name)

        samples = 0
        packet_sizes = []

        test_bad = False

        with open(path, 'wb') as f:
            for frame in iter_frames(container, audio_stream):

                # We need to let the encoder retime.
                frame.pts = None

                if test_bad:

                    bad_resampler = AudioResampler(sample_fmt, "mono",
                                                   sample_rate)
                    bad_frame = bad_resampler.resample(frame)
                    with self.assertRaises(ValueError):
                        next(ctx.encode(bad_frame))

                    bad_resampler = AudioResampler(sample_fmt, channel_layout,
                                                   3000)
                    bad_frame = bad_resampler.resample(frame)

                    with self.assertRaises(ValueError):
                        next(ctx.encode(bad_frame))

                    bad_resampler = AudioResampler('u8', channel_layout, 3000)
                    bad_frame = bad_resampler.resample(frame)

                    with self.assertRaises(ValueError):
                        next(ctx.encode(bad_frame))

                    test_bad = False

                resampled_frame = resampler.resample(frame)
                samples += resampled_frame.samples

                for packet in ctx.encode(resampled_frame):
                    # bytearray because Python can
                    # freak out if the first byte is NULL
                    f.write(bytearray(packet))
                    packet_sizes.append(packet.size)

            for packet in ctx.encode(None):
                packet_sizes.append(packet.size)
                f.write(bytearray(packet))

        ctx = Codec(codec_name, 'r').create()
        ctx.time_base = Fraction(1) / sample_rate
        ctx.sample_rate = sample_rate
        ctx.format = sample_fmt
        ctx.layout = channel_layout
        ctx.channels = channels
        ctx.open()

        result_samples = 0

        # should have more asserts, but not sure what to check:
        # libav and ffmpeg give different results,
        # so we can't really use checksums
        for frame in iter_raw_frames(path, packet_sizes, ctx):
            result_samples += frame.samples
            self.assertEqual(frame.rate, sample_rate)
            self.assertEqual(len(frame.layout.channels), channels)
Example #9
class KaldiSink:
    """
    This class is a proxy between the client browser (aka peer connection) and the Kaldi server.

    It creates 2 tasks that transfer data between the two:
    1. __run_audio_xfer transfers audio from the browser (mic) to the Kaldi server
    2. __run_text_xfer transfers text from the Kaldi server to the browser
    """
    def __init__(self, user_connection, kaldi_server):
        self.__resampler = AudioResampler(format='s16',
                                          layout='mono',
                                          rate=kaldi_server.samplerate)

        self.__pc = user_connection
        self.__audio_task = None
        self.__text_task = None

        self.__ks = kaldi_server
        self.__kaldi_reader = None
        self.__kaldi_writer = None

        self.__channel = None

    async def set_audio_track(self, track):
        self.__track = track

    async def set_text_channel(self, channel):
        self.__channel = channel

    async def start(self):
        try:
            self.__kaldi_reader, self.__kaldi_writer = await open_connection(
                host=self.__ks.host, port=self.__ks.port)
        except Exception:
            log.exception("Error opening connection to Kaldi server")
            self.__pc.close()
            await self.__ks.free()
            return
        log.info(
            f'Connected to Kaldi server {self.__ks.host}:{self.__ks.port}...')
        self.__audio_task = create_task(self.__run_audio_xfer())
        self.__text_task = create_task(self.__run_text_xfer())

    async def stop(self):
        if self.__audio_task is not None:
            self.__audio_task.cancel()
            self.__audio_task = None
        if self.__text_task is not None:
            self.__text_task.cancel()
            self.__text_task = None
        if self.__kaldi_writer:
            self.__kaldi_writer.close()
            self.__kaldi_writer = None
            await self.__ks.free()

    async def __run_audio_xfer(self):
        while True:
            try:
                frame = await self.__track.recv()
                frame = self.__resampler.resample(frame)
                data = frame.to_ndarray()
                self.__kaldi_writer.write(data.tobytes())
                # without drain() we won't catch any write exceptions
                await self.__kaldi_writer.drain()
            except Exception:
                self.__kaldi_writer.close()
                await self.__ks.free()
                return

    async def __run_text_xfer(self):
        await sleep(1)  # give the data channel a moment to open
        # The '<s>\r' token only informs the web UI that we are ready to send data;
        # since it doesn't end with '\n' it will be erased once Kaldi recognizes something.
        self.__channel.send('<s>\r')
        while True:
            a = await self.__kaldi_reader.read(256)
            self.__channel.send(str(a, encoding='utf-8'))
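
The docstring describes KaldiSink as a proxy between the browser's peer connection and the Kaldi server, but the wiring itself is outside this excerpt. A minimal sketch of how such a sink might be attached to an aiortc RTCPeerConnection (attach_kaldi_sink and the handler bodies are assumptions, not part of the example):

async def attach_kaldi_sink(pc, kaldi_server):
    # Hypothetical glue code: create the sink and hook it to peer-connection events.
    sink = KaldiSink(pc, kaldi_server)

    @pc.on('datachannel')
    async def on_datachannel(channel):
        # The browser-opened data channel carries the recognized text back.
        await sink.set_text_channel(channel)

    @pc.on('track')
    async def on_track(track):
        if track.kind == 'audio':
            await sink.set_audio_track(track)
            await sink.start()

    return sink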
Example #10
class KaldiSink:
    """
    This class is a proxy between the client browser (aka peer connection) and the Kaldi server.

    It creates 2 tasks that transfer data between the two:
    1. __run_audio_xfer transfers audio from the browser (mic) to the Kaldi server
    2. __run_text_xfer transfers text from the Kaldi server to the browser
    """
    def __init__(self, user_connection, kaldi_server):
        self.__resampler = AudioResampler(format='s16',
                                          layout='mono',
                                          rate=kaldi_server.samplerate)

        self.__pc = user_connection
        self.__audio_task = None
        self.__text_task = None

        self.__ks = kaldi_server
        self.__kaldi_reader = None
        self.__kaldi_writer = None

        self.__channel = None

        self.__command = list()
        with open('/server/command.txt', 'r') as file:
            for line in file:
                self.__command.append(line.strip().split())
        print(self.__command)

    async def set_audio_track(self, track):
        self.__track = track

    async def set_text_channel(self, channel):
        self.__channel = channel

    async def start(self):
        try:
            self.__kaldi_reader, self.__kaldi_writer = await open_connection(
                host=self.__ks.host, port=self.__ks.port)
        except Exception:
            log.exception("Error opening connection to Kaldi server")
            self.__pc.close()
            await self.__ks.free()
            return
        log.info(
            f'Connected to Kaldi server {self.__ks.host}:{self.__ks.port}...')
        self.__audio_task = create_task(self.__run_audio_xfer())
        self.__text_task = create_task(self.__run_text_xfer())

    async def stop(self):
        if self.__audio_task is not None:
            self.__audio_task.cancel()
            self.__audio_task = None
        if self.__text_task is not None:
            self.__text_task.cancel()
            self.__text_task = None
        if self.__kaldi_writer:
            self.__kaldi_writer.close()
            self.__kaldi_writer = None
            await self.__ks.free()

    async def __run_audio_xfer(self):
        while True:
            try:
                frame = await self.__track.recv()
                frame = self.__resampler.resample(frame)
                data = frame.to_ndarray()
                data2 = data.tobytes()
                self.__kaldi_writer.write(data2)
                # without drain() we won't catch any write exceptions
                await self.__kaldi_writer.drain()
            except Exception:
                self.__kaldi_writer.close()
                await self.__ks.free()
                return

    @staticmethod
    def dist(x, y):
        if x == y:
            return 0
        elif x[0] == y[0]:
            return abs(len(x) - len(y)) / (len(x) + len(y)) * 0.25 + 0.5
        else:
            return abs(len(x) - len(y)) / (len(x) + len(y)) * 0.5 + 0.5

    @staticmethod
    def DTW(s1, s2):
        arr = np.zeros((len(s1) + 1, len(s2) + 1))
        arr[:, :] = np.inf
        for i in range(1, len(s1) + 1):
            for j in range(1, len(s2) + 1):
                row = i - 1
                col = j - 1
                if i == 1 and j == 1:
                    arr[i, j] = KaldiSink.dist(s1[row], s2[col])
                    continue

                arr[i, j] = min(arr[i, j - 1] + KaldiSink.dist(s1[row], s2[col]),
                                arr[i - 1, j - 1] + 2 * KaldiSink.dist(s1[row], s2[col]),
                                arr[i - 1, j] + KaldiSink.dist(s1[row], s2[col]))

        return arr[-1, -1]

    @staticmethod
    def __find_best_match(test, all_command):
        mn = np.inf
        best_match = ''
        for cmd in all_command:
            print('test', cmd)
            distance = KaldiSink.DTW(cmd, test)
            if mn > distance:
                mn = distance
                best_match = cmd
        return best_match, mn

    async def __run_text_xfer(self):
        await sleep(1)  # give the data channel a moment to open
        # The '<s>\r' token only informs the web UI that we are ready to send data;
        # since it doesn't end with '\n' it will be erased once Kaldi recognizes something.
        self.__channel.send('<s>\r')
        while True:
            a = await self.__kaldi_reader.read(256)

            print(a)
            print('kaldi', str(a, encoding='utf-8'))
            t = str(a, encoding='utf-8').split(' ')

            print('after split', t)

            # b, d = self.__find_best_match(t, self.__command)
            # print('kaldi res', (' ').join(b))
            self.__channel.send(str(a, encoding='utf-8'))
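
The DTW matcher above scores a recognized token sequence against each loaded command. A small standalone usage sketch, with made-up commands (the real ones come from /server/command.txt):

commands = [['turn', 'on', 'the', 'light'],
            ['turn', 'off', 'the', 'light'],
            ['play', 'some', 'music']]
recognized = 'turn of the light'.split(' ')

# Lower DTW distance means a closer match.
best = min(commands, key=lambda cmd: KaldiSink.DTW(cmd, recognized))
print('best match:', ' '.join(best))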
Example #11
    def audio_encoding(self, codec_name):

        try:
            codec = Codec(codec_name, 'w')
        except UnknownCodecError:
            raise SkipTest()

        ctx = codec.create()
        if ctx.codec.experimental:
            raise SkipTest()

        sample_fmt = ctx.codec.audio_formats[-1].name
        sample_rate = 48000
        channel_layout = "stereo"
        channels = 2

        ctx.time_base = Fraction(1) / sample_rate
        ctx.sample_rate = sample_rate
        ctx.format = sample_fmt
        ctx.layout = channel_layout
        ctx.channels = channels

        ctx.open()

        resampler = AudioResampler(sample_fmt, channel_layout, sample_rate)

        container = av.open(fate_suite('audio-reference/chorusnoise_2ch_44kHz_s16.wav'))
        audio_stream = container.streams.audio[0]

        path = self.sandboxed('encoder.%s' % codec_name)

        samples = 0
        packet_sizes = []

        test_bad = False

        with open(path, 'wb') as f:
            for frame in iter_frames(container, audio_stream):

                # We need to let the encoder retime.
                frame.pts = None

                if test_bad:

                    bad_resampler = AudioResampler(sample_fmt, "mono", sample_rate)
                    bad_frame = bad_resampler.resample(frame)
                    with self.assertRaises(ValueError):
                        next(ctx.encode(bad_frame))

                    bad_resampler = AudioResampler(sample_fmt, channel_layout, 3000)
                    bad_frame = bad_resampler.resample(frame)

                    with self.assertRaises(ValueError):
                        next(ctx.encode(bad_frame))

                    bad_resampler = AudioResampler('u8', channel_layout, 3000)
                    bad_frame = bad_resampler.resample(frame)

                    with self.assertRaises(ValueError):
                        next(ctx.encode(bad_frame))

                    test_bad = False

                resampled_frame = resampler.resample(frame)
                samples += resampled_frame.samples

                for packet in ctx.encode(resampled_frame):
                    # bytearray because Python can
                    # freak out if the first byte is NULL
                    f.write(bytearray(packet))
                    packet_sizes.append(packet.size)

            for packet in ctx.encode(None):
                packet_sizes.append(packet.size)
                f.write(bytearray(packet))

        ctx = Codec(codec_name, 'r').create()
        ctx.time_base = Fraction(1) / sample_rate
        ctx.sample_rate = sample_rate
        ctx.format = sample_fmt
        ctx.layout = channel_layout
        ctx.channels = channels
        ctx.open()

        result_samples = 0

        # should have more asserts, but not sure what to check:
        # libav and ffmpeg give different results,
        # so we can't really use checksums
        for frame in iter_raw_frames(path, packet_sizes, ctx):
            result_samples += frame.samples
            self.assertEqual(frame.rate, sample_rate)
            self.assertEqual(len(frame.layout.channels), channels)
Example #12
# Imports needed by the code below (logging, aiohttp's web, and the PyAV audio classes).
import logging

from aiohttp import web
from av.audio.format import AudioFormat
from av.audio.layout import AudioLayout
from av.audio.resampler import AudioResampler

import udp_ep
import replier

addr = '0.0.0.0'
port = 8080

logger = logging.getLogger('http_ep')
pcs = set()
datagram_endpoint = udp_ep.Endpoint()

target_audio_format = AudioFormat('s16')
target_audio_layout = AudioLayout('mono')
target_sample_rate = 44100
resampler = AudioResampler(target_audio_format, target_audio_layout,
                           target_sample_rate)


async def handler(request):
    uri = str(request.rel_url)
    logger.info('Request: {}'.format(uri))
    if uri == '/start':
        udp_ep.status = "start"
        return web.Response(text=udp_ep.status)
    elif uri == '/stop':
        udp_ep.status = "stop"
        return web.Response(text=udp_ep.status)
    elif uri == '/status':
        return web.Response(text=udp_ep.status)
    elif uri == '/offer':