def __init__(self, user_connection):
    """Set up the per-connection recognition state.

    Keeps a reference to the peer connection, builds a resampler that
    outputs mono ``s16`` audio at 48000 Hz and a ``KaldiRecognizer``
    configured for the same rate. The transfer task, audio track and text
    channel all start out unset.
    """
    sample_rate = 48000  # used by both the resampler and the recognizer

    self.__resampler = AudioResampler(
        format='s16',
        layout='mono',
        rate=sample_rate,
    )
    self.__pc = user_connection
    self.__audio_task = self.__track = self.__channel = None
    self.__recognizer = KaldiRecognizer(model, sample_rate)
def __init__(self, user_connection, kaldi_server):
    """Remember both endpoints and prepare the audio resampler.

    The resampler converts incoming audio to mono ``s16`` at the sample
    rate advertised by ``kaldi_server.samplerate``. The transfer tasks, the
    Kaldi stream reader/writer and the text channel are left as ``None``.
    """
    target_rate = kaldi_server.samplerate
    self.__resampler = AudioResampler(
        format='s16',
        layout='mono',
        rate=target_rate,
    )

    # The two endpoints this object sits between.
    self.__pc = user_connection
    self.__ks = kaldi_server

    # Nothing is running or connected yet.
    self.__audio_task = self.__text_task = None
    self.__kaldi_reader = self.__kaldi_writer = None
    self.__channel = None
class KaldiTask:
    """Feed audio received from a peer connection track to a Kaldi recognizer.

    A background task receives frames from the audio track, resamples them,
    runs ``process_chunk`` on the recognizer in an executor and sends every
    response over the text channel.
    """

    # Single source of truth for the audio rate. The resampler output rate
    # and the rate the recognizer is told to expect must be identical;
    # previously the resampler produced 8000 Hz audio while the recognizer
    # was configured for 48000 Hz.
    SAMPLE_RATE = 48000

    def __init__(self, user_connection):
        """Create the resampler and recognizer for *user_connection*."""
        self.__resampler = AudioResampler(format='s16', layout='mono',
                                          rate=self.SAMPLE_RATE)
        self.__pc = user_connection
        self.__audio_task = None
        self.__track = None
        self.__channel = None
        self.__recognizer = KaldiRecognizer(model, self.SAMPLE_RATE)

    async def set_audio_track(self, track):
        """Set the track that audio frames are received from."""
        self.__track = track

    async def set_text_channel(self, channel):
        """Set the channel that recognizer responses are sent to."""
        self.__channel = channel

    async def start(self):
        """Start the background audio transfer task."""
        self.__audio_task = asyncio.create_task(self.__run_audio_xfer())

    async def stop(self):
        """Cancel the background audio transfer task, if one is running."""
        if self.__audio_task is not None:
            self.__audio_task.cancel()
            self.__audio_task = None

    async def __run_audio_xfer(self):
        """Receive, resample and recognize frames until cancelled."""
        while True:
            frame = await self.__track.recv()
            print("Got frame", frame)
            frame = self.__resampler.resample(frame)
            data = frame.to_ndarray()
            # Recognition runs in the executor pool rather than inline in
            # this coroutine.
            response = await loop.run_in_executor(
                pool, process_chunk, self.__recognizer, data.tobytes())
            print("Sending response", response)
            self.__channel.send(response)
def play(self, address, sender=None):
    """Open *address* and prepare its first audio stream for playback.

    On success ``self._stream`` is set to ``(container, stream)`` and
    ``self._resampler`` to a new ``AudioResampler('s16p', 1, 48000)``.
    If the input has no audio stream the container is closed and nothing
    is changed.

    ``sender`` is accepted for interface compatibility and is not used here.
    """
    container = av.open(address)

    # next() needs an explicit default: without it an input that has no
    # audio stream raises StopIteration and the guard below never runs.
    stream = next((s for s in container.streams if s.type == 'audio'), None)
    if stream is None:
        # Do not leak the opened container when there is nothing to play.
        container.close()
        return

    self._stream = (container, stream)
    self._resampler = AudioResampler('s16p', 1, 48000)
def __init__(self, user_connection, kaldi_server,
             command_file='/server/command.txt'):
    """Remember both endpoints, prepare the resampler and load the commands.

    Args:
        user_connection: The peer connection this object serves.
        kaldi_server: Object describing the Kaldi server; its ``samplerate``
            attribute is used as the resampler output rate.
        command_file: Path of a text file with one command per line. Each
            line is split on whitespace into a list of words. Defaults to
            the previously hard-coded location.

    Raises:
        OSError: If *command_file* cannot be opened.
    """
    self.__resampler = AudioResampler(format='s16', layout='mono',
                                      rate=kaldi_server.samplerate)
    self.__pc = user_connection
    self.__audio_task = None
    self.__text_task = None
    self.__ks = kaldi_server
    self.__kaldi_reader = None
    self.__kaldi_writer = None
    self.__channel = None

    # Read with an explicit encoding so the result does not depend on the
    # locale of the machine the server runs on.
    with open(command_file, 'r', encoding='utf-8') as file:
        self.__command = [line.strip().split() for line in file]
    print(self.__command)
class KaldiTask:
    """Buffer audio from a track and pass it to a Kaldi recognizer in chunks.

    Frames are resampled to mono ``s16`` at 48000 Hz and accumulated; once
    more than 8000 bytes are buffered the buffer is handed to
    ``process_chunk`` in an executor and the response is sent over the text
    channel.
    """

    def __init__(self, user_connection):
        """Create the resampler and recognizer for *user_connection*."""
        self.__resampler = AudioResampler(
            format='s16',
            layout='mono',
            rate=48000,
        )
        self.__pc = user_connection
        self.__audio_task = self.__track = self.__channel = None
        self.__recognizer = KaldiRecognizer(model, 48000)

    async def set_audio_track(self, track):
        """Set the track that audio frames are received from."""
        self.__track = track

    async def set_text_channel(self, channel):
        """Set the channel that recognizer responses are sent to."""
        self.__channel = channel

    async def start(self):
        """Start the background audio transfer task."""
        self.__audio_task = asyncio.create_task(self.__run_audio_xfer())

    async def stop(self):
        """Cancel the background audio transfer task, if one is running."""
        running = self.__audio_task
        if running is None:
            return
        running.cancel()
        self.__audio_task = None

    async def __run_audio_xfer(self):
        """Accumulate resampled audio and recognize it chunk by chunk."""
        threshold = 8000  # bytes that must be exceeded before recognizing
        pending = bytearray()
        while True:
            received = await self.__track.recv()
            converted = self.__resampler.resample(received)
            pending.extend(converted.planes[0].to_bytes())

            if len(pending) <= threshold:
                continue

            chunk = bytes(pending)
            response = await loop.run_in_executor(
                pool, process_chunk, self.__recognizer, chunk)
            print(response)
            self.__channel.send(response)
            pending = bytearray()
def audio_encoding(self, codec):
    """Encode the reference WAV with *codec*, decode it again and check it.

    The test is skipped when the codec is not available or is experimental.
    The input is resampled to 48000 Hz stereo in the last sample format the
    codec lists. On the first frame, frames with the wrong layout, rate or
    format are fed to the encoder and must raise ``ValueError``. Every
    decoded frame is checked for the expected rate and channel count.
    """
    if codec not in av.codec.codecs_availible:
        raise SkipTest()

    encoder = av.Encoder(codec)
    if encoder.codec.experimental:
        raise SkipTest()

    sample_fmt = encoder.codec.audio_formats[-1].name
    sample_rate = 48000
    channel_layout = "stereo"
    channels = 2

    encoder.time_base = sample_rate
    encoder.sample_rate = sample_rate
    encoder.sample_fmt = sample_fmt
    encoder.channels = channels
    encoder.open()

    resampler = AudioResampler(sample_fmt, channel_layout, sample_rate)

    container = av.open(fate_suite('audio-reference/chorusnoise_2ch_44kHz_s16.wav'))
    audio_stream = next(s for s in container.streams if s.type == 'audio')

    path = self.sandboxed('encoder.%s' % codec)

    samples = 0
    packet_sizes = []

    test_bad = True

    # Encoded packets are raw bytes, so the file must be opened in binary
    # mode; text mode rejects bytes on Python 3 and may translate newlines.
    with open(path, 'wb') as f:
        for frame in iter_frames(container, audio_stream):

            if test_bad:
                # Only done once: frames that do not match the encoder
                # configuration have to be rejected.
                bad_resampler = AudioResampler(sample_fmt, "mono", sample_rate)
                bad_frame = bad_resampler.resample(frame)
                with self.assertRaises(ValueError):
                    next(encoder.encode(bad_frame))

                bad_resampler = AudioResampler(sample_fmt, channel_layout, 3000)
                bad_frame = bad_resampler.resample(frame)
                with self.assertRaises(ValueError):
                    next(encoder.encode(bad_frame))

                bad_resampler = AudioResampler('u8', channel_layout, 3000)
                bad_frame = bad_resampler.resample(frame)
                with self.assertRaises(ValueError):
                    next(encoder.encode(bad_frame))

                test_bad = False

            resampled_frame = resampler.resample(frame)
            samples += resampled_frame.samples
            for new_packet in encoder.encode(resampled_frame):
                # bytearray because Python can freak out
                # if the first byte is NULL
                f.write(bytearray(new_packet))
                packet_sizes.append(new_packet.size)

        for new_packet in encoder.flush():
            packet_sizes.append(new_packet.size)
            f.write(bytearray(new_packet))

    decoder = av.Decoder(codec)
    decoder.time_base = sample_rate
    decoder.sample_rate = sample_rate
    decoder.sample_fmt = sample_fmt
    decoder.channels = channels
    decoder.open()

    result_samples = 0

    # Should have more asserts, but it is not clear what to check:
    # libav and ffmpeg give different results,
    # so we can't really use checksums.
    for frame in iter_raw_frames(path, packet_sizes, decoder):
        result_samples += frame.samples
        self.assertEqual(frame.rate, sample_rate)
        self.assertEqual(len(frame.layout.channels), channels)
def audio_encoding(self, codec_name):
    """Encode the reference WAV with *codec_name*, decode it and check it.

    The test is skipped when the codec is unknown or experimental. The
    input is resampled to 48000 Hz stereo in the last sample format the
    codec lists, encoded to a sandboxed file and decoded again; every
    decoded frame is checked for the expected rate and channel count.

    The ``test_bad`` branch (currently disabled) feeds frames with the wrong
    layout, rate or format to the encoder and expects ``ValueError``.
    """
    try:
        codec = Codec(codec_name, 'w')
    except UnknownCodecError:
        raise SkipTest()

    ctx = codec.create()
    if ctx.codec.experimental:
        raise SkipTest()

    sample_fmt = ctx.codec.audio_formats[-1].name
    sample_rate = 48000
    channel_layout = "stereo"
    channels = 2

    ctx.time_base = Fraction(1) / sample_rate
    ctx.sample_rate = sample_rate
    ctx.format = sample_fmt
    ctx.layout = channel_layout
    ctx.channels = channels
    ctx.open()

    resampler = AudioResampler(sample_fmt, channel_layout, sample_rate)

    container = av.open(
        fate_suite('audio-reference/chorusnoise_2ch_44kHz_s16.wav'))
    audio_stream = container.streams.audio[0]

    path = self.sandboxed('encoder.%s' % codec_name)

    samples = 0
    packet_sizes = []

    test_bad = False

    with open(path, 'wb') as f:
        for frame in iter_frames(container, audio_stream):

            # We need to let the encoder retime.
            frame.pts = None

            if test_bad:
                # This branch used to reference an undefined ``encoder``;
                # the encoding context in this function is ``ctx``.
                # list() drains the result whether encode() returns a list
                # or a generator.
                bad_resampler = AudioResampler(sample_fmt, "mono", sample_rate)
                bad_frame = bad_resampler.resample(frame)
                with self.assertRaises(ValueError):
                    list(ctx.encode(bad_frame))

                bad_resampler = AudioResampler(sample_fmt, channel_layout, 3000)
                bad_frame = bad_resampler.resample(frame)
                with self.assertRaises(ValueError):
                    list(ctx.encode(bad_frame))

                bad_resampler = AudioResampler('u8', channel_layout, 3000)
                bad_frame = bad_resampler.resample(frame)
                with self.assertRaises(ValueError):
                    list(ctx.encode(bad_frame))

                test_bad = False

            resampled_frame = resampler.resample(frame)
            samples += resampled_frame.samples
            for packet in ctx.encode(resampled_frame):
                # bytearray because Python can freak out
                # if the first byte is NULL
                f.write(bytearray(packet))
                packet_sizes.append(packet.size)

        # Passing None flushes the packets still buffered in the encoder.
        for packet in ctx.encode(None):
            packet_sizes.append(packet.size)
            f.write(bytearray(packet))

    ctx = Codec(codec_name, 'r').create()
    ctx.time_base = Fraction(1) / sample_rate
    ctx.sample_rate = sample_rate
    ctx.format = sample_fmt
    ctx.layout = channel_layout
    ctx.channels = channels
    ctx.open()

    result_samples = 0

    # Should have more asserts, but it is not clear what to check:
    # libav and ffmpeg give different results,
    # so we can't really use checksums.
    for frame in iter_raw_frames(path, packet_sizes, ctx):
        result_samples += frame.samples
        self.assertEqual(frame.rate, sample_rate)
        self.assertEqual(len(frame.layout.channels), channels)
class KaldiSink:
    """
    This class is a proxy between the client browser (aka peer connection) and the Kaldi server. It creates 2 tasks
    that transfer data between the two:
        1. __run_audio_xfer transfers audio from the browser (mic) to the Kaldi server
        2. __run_text_xfer transfers text from the Kaldi server to the browser

    The Kaldi server handed to the constructor is released through its
    ``free()`` coroutine exactly once, whichever of start(), stop() or the
    audio task notices first that the session is over.
    """

    def __init__(self, user_connection, kaldi_server):
        """Remember both endpoints and prepare the audio resampler."""
        self.__resampler = AudioResampler(format='s16', layout='mono',
                                          rate=kaldi_server.samplerate)
        self.__pc = user_connection
        self.__audio_task = None
        self.__text_task = None
        self.__ks = kaldi_server
        self.__kaldi_reader = None
        self.__kaldi_writer = None
        # Initialized here so the attribute exists even when
        # set_audio_track() has not been called yet.
        self.__track = None
        self.__channel = None
        # Guards against calling free() on the server more than once.
        self.__released = False

    async def set_audio_track(self, track):
        """Set the track that audio frames are received from."""
        self.__track = track

    async def set_text_channel(self, channel):
        """Set the channel that recognized text is sent to."""
        self.__channel = channel

    async def start(self):
        """Connect to the Kaldi server and start both transfer tasks.

        If the connection cannot be opened the error is logged, the peer
        connection is closed, the server is released and no task is started.
        """
        try:
            self.__kaldi_reader, self.__kaldi_writer = await open_connection(
                host=self.__ks.host, port=self.__ks.port)
        except Exception:
            log.exception("Error opening conenction to Kaldi server")
            # NOTE(review): if close() is a coroutine function on this peer
            # connection type, the call below has to be awaited - confirm.
            self.__pc.close()
            await self.__release()
            return
        log.info(
            f'Connected to Kaldi server {self.__ks.host}:{self.__ks.port}...')
        self.__audio_task = create_task(self.__run_audio_xfer())
        self.__text_task = create_task(self.__run_text_xfer())

    async def stop(self):
        """Cancel both transfer tasks and release the Kaldi server."""
        if self.__audio_task is not None:
            self.__audio_task.cancel()
            self.__audio_task = None
        if self.__text_task is not None:
            self.__text_task.cancel()
            self.__text_task = None
        await self.__release()

    async def __release(self):
        """Close the Kaldi connection and free the server at most once."""
        if self.__kaldi_writer is not None:
            self.__kaldi_writer.close()
            self.__kaldi_writer = None
        if not self.__released:
            self.__released = True
            await self.__ks.free()

    async def __run_audio_xfer(self):
        """Forward resampled audio frames to the Kaldi server until failure."""
        from asyncio import CancelledError

        try:
            while True:
                frame = await self.__track.recv()
                frame = self.__resampler.resample(frame)
                data = frame.to_ndarray()
                self.__kaldi_writer.write(data.tobytes())
                # without this we won't catch any write exceptions
                await self.__kaldi_writer.drain()
        except CancelledError:
            # Cancellation comes from stop(), which does the cleanup itself.
            # Swallowing it here used to run the cleanup a second time,
            # against a writer that stop() had already reset to None.
            raise
        except Exception as exc:
            log.warning('Audio transfer to Kaldi server stopped: %r', exc)
            await self.__release()

    async def __run_text_xfer(self):
        """Forward text produced by the Kaldi server to the text channel."""
        import codecs

        # NOTE(review): the original comment on this delay is cut off
        # ("this is useful to"); presumably it gives the channel time to
        # become usable - confirm.
        await sleep(1)
        # this is only sent to inform the web UI we are ready to send data
        # since the token doesn't end with \n it will be erased once Kaldi
        # recognizes something
        self.__channel.send('<s>\r')

        # A 256-byte read can end in the middle of a multibyte UTF-8
        # character; the incremental decoder keeps the partial bytes until
        # the rest arrives instead of failing on them.
        decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
        while True:
            chunk = await self.__kaldi_reader.read(256)
            if not chunk:
                # An empty read means the server closed the connection;
                # looping on would spin forever sending empty strings.
                break
            text = decoder.decode(chunk)
            if text:
                self.__channel.send(text)
class KaldiSink:
    """
    This class is a proxy between the client browser (aka peer connection) and the Kaldi server. It creates 2 tasks
    that transfer data between the two:
        1. __run_audio_xfer transfers audio from the browser (mic) to the Kaldi server
        2. __run_text_xfer transfers text from the Kaldi server to the browser

    It also loads a list of known commands and provides helpers (dist, DTW,
    __find_best_match) for finding the command closest to a word sequence.
    The Kaldi server handed to the constructor is released through its
    ``free()`` coroutine exactly once.
    """

    def __init__(self, user_connection, kaldi_server,
                 command_file='/server/command.txt'):
        """Remember both endpoints, prepare the resampler, load the commands.

        Args:
            user_connection: The peer connection this object serves.
            kaldi_server: Object describing the Kaldi server; its
                ``samplerate`` is used as the resampler output rate.
            command_file: Path of a text file with one command per line;
                each line is split on whitespace into a list of words.

        Raises:
            OSError: If *command_file* cannot be opened.
        """
        self.__resampler = AudioResampler(format='s16', layout='mono',
                                          rate=kaldi_server.samplerate)
        self.__pc = user_connection
        self.__audio_task = None
        self.__text_task = None
        self.__ks = kaldi_server
        self.__kaldi_reader = None
        self.__kaldi_writer = None
        # Initialized here so the attribute exists even when
        # set_audio_track() has not been called yet.
        self.__track = None
        self.__channel = None
        # Guards against calling free() on the server more than once.
        self.__released = False

        # Explicit encoding keeps the result independent of the locale.
        with open(command_file, 'r', encoding='utf-8') as file:
            self.__command = [line.strip().split() for line in file]
        print(self.__command)

    async def set_audio_track(self, track):
        """Set the track that audio frames are received from."""
        self.__track = track

    async def set_text_channel(self, channel):
        """Set the channel that recognized text is sent to."""
        self.__channel = channel

    async def start(self):
        """Connect to the Kaldi server and start both transfer tasks.

        If the connection cannot be opened the error is logged, the peer
        connection is closed, the server is released and no task is started.
        """
        try:
            self.__kaldi_reader, self.__kaldi_writer = await open_connection(
                host=self.__ks.host, port=self.__ks.port)
        except Exception:
            log.exception("Error opening conenction to Kaldi server")
            # NOTE(review): if close() is a coroutine function on this peer
            # connection type, the call below has to be awaited - confirm.
            self.__pc.close()
            await self.__release()
            return
        log.info(
            f'Connected to Kaldi server {self.__ks.host}:{self.__ks.port}...')
        self.__audio_task = create_task(self.__run_audio_xfer())
        self.__text_task = create_task(self.__run_text_xfer())

    async def stop(self):
        """Cancel both transfer tasks and release the Kaldi server."""
        if self.__audio_task is not None:
            self.__audio_task.cancel()
            self.__audio_task = None
        if self.__text_task is not None:
            self.__text_task.cancel()
            self.__text_task = None
        await self.__release()

    async def __release(self):
        """Close the Kaldi connection and free the server at most once."""
        if self.__kaldi_writer is not None:
            self.__kaldi_writer.close()
            self.__kaldi_writer = None
        if not self.__released:
            self.__released = True
            await self.__ks.free()

    async def __run_audio_xfer(self):
        """Forward resampled audio frames to the Kaldi server until failure."""
        from asyncio import CancelledError

        try:
            while True:
                frame = await self.__track.recv()
                frame = self.__resampler.resample(frame)
                data = frame.to_ndarray()
                self.__kaldi_writer.write(data.tobytes())
                # without this we won't catch any write exceptions
                await self.__kaldi_writer.drain()
        except CancelledError:
            # Cancellation comes from stop(), which does the cleanup itself.
            # Swallowing it here used to run the cleanup a second time,
            # against a writer that stop() had already reset to None.
            raise
        except Exception as exc:
            log.warning('Audio transfer to Kaldi server stopped: %r', exc)
            await self.__release()

    @staticmethod
    def dist(x, y):
        """Return the distance between two words.

        Equal words have distance 0. Otherwise the distance is 0.5 plus a
        penalty proportional to the relative length difference; the penalty
        is halved when both words start with the same character.
        """
        if x == y:
            return 0
        length_ratio = abs(len(x) - len(y)) / (len(x) + len(y))
        # Empty tokens (e.g. from splitting on consecutive spaces) have no
        # first character, so they can only take the "different" branch.
        if x and y and x[0] == y[0]:
            return length_ratio * 0.25 + 0.5
        return length_ratio * 0.5 + 0.5

    @staticmethod
    def DTW(s1, s2):
        """Return the dynamic-time-warping cost between two word sequences.

        The result is ``inf`` when either sequence is empty.
        """
        arr = np.full((len(s1) + 1, len(s2) + 1), np.inf)
        for i in range(1, len(s1) + 1):
            for j in range(1, len(s2) + 1):
                # Static methods are not visible as bare names inside the
                # class body, so they are called through the class.
                cost = KaldiSink.dist(s1[i - 1], s2[j - 1])
                if i == 1 and j == 1:
                    arr[i, j] = cost
                    continue
                arr[i, j] = min(arr[i, j - 1] + cost,
                                arr[i - 1, j - 1] + 2 * cost,
                                arr[i - 1, j] + cost)
        return arr[-1, -1]

    @staticmethod
    def __find_best_match(test, all_command):
        """Return ``(command, distance)`` for the command closest to *test*.

        Returns ``('', inf)`` when *all_command* is empty or no command has
        a finite distance.
        """
        mn = np.inf
        best_match = ''
        for cmd in all_command:
            distance = KaldiSink.DTW(cmd, test)
            if distance < mn:
                mn = distance
                best_match = cmd
        # Return the best distance, not the distance of the last command.
        return best_match, mn

    async def __run_text_xfer(self):
        """Forward text produced by the Kaldi server to the text channel."""
        import codecs

        # NOTE(review): the original comment on this delay is cut off
        # ("this is useful to"); presumably it gives the channel time to
        # become usable - confirm.
        await sleep(1)
        # this is only sent to inform the web UI we are ready to send data
        # since the token doesn't end with \n it will be erased once Kaldi
        # recognizes something
        self.__channel.send('<s>\r')

        # A 256-byte read can end in the middle of a multibyte UTF-8
        # character; the incremental decoder keeps the partial bytes until
        # the rest arrives instead of failing on them.
        decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
        while True:
            chunk = await self.__kaldi_reader.read(256)
            if not chunk:
                # An empty read means the server closed the connection;
                # looping on would spin forever sending empty strings.
                break
            text = decoder.decode(chunk)
            if not text:
                continue
            log.debug('kaldi %s', text)
            # NOTE: matching the text against the loaded commands with
            # __find_best_match is not wired in; the raw text is forwarded.
            self.__channel.send(text)
def audio_encoding(self, codec_name):
    """Encode the reference WAV with *codec_name*, decode it and check it.

    The test is skipped when the codec is unknown or experimental. The
    input is resampled to 48000 Hz stereo in the last sample format the
    codec lists, encoded to a sandboxed file and decoded again; every
    decoded frame is checked for the expected rate and channel count.

    The ``test_bad`` branch (currently disabled) feeds frames with the wrong
    layout, rate or format to the encoder and expects ``ValueError``.
    """
    try:
        codec = Codec(codec_name, 'w')
    except UnknownCodecError:
        raise SkipTest()

    ctx = codec.create()
    if ctx.codec.experimental:
        raise SkipTest()

    sample_fmt = ctx.codec.audio_formats[-1].name
    sample_rate = 48000
    channel_layout = "stereo"
    channels = 2

    ctx.time_base = Fraction(1) / sample_rate
    ctx.sample_rate = sample_rate
    ctx.format = sample_fmt
    ctx.layout = channel_layout
    ctx.channels = channels
    ctx.open()

    resampler = AudioResampler(sample_fmt, channel_layout, sample_rate)

    container = av.open(fate_suite('audio-reference/chorusnoise_2ch_44kHz_s16.wav'))
    audio_stream = container.streams.audio[0]

    path = self.sandboxed('encoder.%s' % codec_name)

    samples = 0
    packet_sizes = []

    test_bad = False

    # Resampler arguments that each produce a frame the encoder must reject:
    # wrong layout, wrong rate, wrong format and rate.
    bad_configs = (
        (sample_fmt, "mono", sample_rate),
        (sample_fmt, channel_layout, 3000),
        ('u8', channel_layout, 3000),
    )

    with open(path, 'wb') as f:
        for frame in iter_frames(container, audio_stream):

            # We need to let the encoder retime.
            frame.pts = None

            if test_bad:
                # This branch used to reference an undefined ``encoder``;
                # the encoding context in this function is ``ctx``.
                for bad_args in bad_configs:
                    bad_frame = AudioResampler(*bad_args).resample(frame)
                    with self.assertRaises(ValueError):
                        # list() drains the result whether encode()
                        # returns a list or a generator.
                        list(ctx.encode(bad_frame))

                test_bad = False

            resampled_frame = resampler.resample(frame)
            samples += resampled_frame.samples
            for packet in ctx.encode(resampled_frame):
                # bytearray because Python can freak out
                # if the first byte is NULL
                f.write(bytearray(packet))
                packet_sizes.append(packet.size)

        # Passing None flushes the packets still buffered in the encoder.
        for packet in ctx.encode(None):
            packet_sizes.append(packet.size)
            f.write(bytearray(packet))

    ctx = Codec(codec_name, 'r').create()
    ctx.time_base = Fraction(1) / sample_rate
    ctx.sample_rate = sample_rate
    ctx.format = sample_fmt
    ctx.layout = channel_layout
    ctx.channels = channels
    ctx.open()

    result_samples = 0

    # Should have more asserts, but it is not clear what to check:
    # libav and ffmpeg give different results,
    # so we can't really use checksums.
    for frame in iter_raw_frames(path, packet_sizes, ctx):
        result_samples += frame.samples
        self.assertEqual(frame.rate, sample_rate)
        self.assertEqual(len(frame.layout.channels), channels)
from av.audio.resampler import AudioResampler import udp_ep import replier addr = '0.0.0.0' port = 8080 logger = logging.getLogger('http_ep') pcs = set() datagram_endpoint = udp_ep.Endpoint() target_audio_format = AudioFormat('s16') target_audio_layout = AudioLayout('mono') target_sample_rate = 44100 resampler = AudioResampler(target_audio_format, target_audio_layout, target_sample_rate) async def handler(request): uri = str(request.rel_url) logger.info('Request: {}'.format(uri)) if uri == '/start': udp_ep.status = "start" return web.Response(text=udp_ep.status) elif uri == '/stop': udp_ep.status = "stop" return web.Response(text=udp_ep.status) elif uri == '/status': return web.Response(text=udp_ep.status) elif uri == '/offer':