def test_iterator():
    """Check decoded chunks from a valid file (iteration) and a truncated file (read).

    Every chunk must be ``bytes`` whose length is a multiple of 4. The
    fixture files are opened in ``with`` blocks so the handles are closed
    even when an assertion fails.
    """
    # valid file
    with open('tests/streamp3/data/stereo.mp3', 'rb') as stream:
        decoder = MP3Decoder(stream)
        for chunk in decoder:
            assert isinstance(chunk, bytes)
            assert len(chunk) % 4 == 0

    # truncated file
    with open('tests/streamp3/data/truncated.mp3', 'rb') as stream:
        decoder = MP3Decoder(stream)
        while True:
            chunk = decoder.read()
            if not chunk:
                # a falsy chunk marks the end of the decodable data
                break
            assert isinstance(chunk, bytes)
            assert len(chunk) % 4 == 0
def synthesize(
    self,
    utterance: str,
    mode: str = "text",
    voice: str = "demo-male",
    profile: str = "default",
) -> None:
    """Render an utterance as speech and write the audio to the output.

    The utterance may be plain text (`mode="text"`), SSML (`mode="ssml"`),
    or Speech Markdown (`mode="markdown"`). The `profile` argument is
    forwarded to the client together with the other arguments. How the
    returned stream is handled depends on this instance's configured
    format: MP3 streams are decoded frame by frame, PCM16 streams are
    written as raw bytes. Any other format writes nothing.

    Args:
        utterance (str): string that needs to be rendered as speech.
        mode (str): synthesis mode to use with utterance.
            text, ssml, markdown, etc.
        voice (str): name of the tts voice.
        profile (str): name of the audio profile used to create the
            resulting stream.

    """
    response = self._client.synthesize(utterance, mode, voice, profile)

    if self._format == FORMAT_MP3:
        # wrap the response so the decoder can consume it as a stream
        decoder = MP3Decoder(SequenceIO(response))
        for pcm in decoder:
            self._output.write(pcm)
        return

    if self._format == FORMAT_PCM16:
        # the audio is already raw; pass each frame through as bytes
        for samples in response:
            self._output.write(samples.tobytes())
def test_read():
    """Drain a stereo and a mono file with ``read()`` and check each chunk.

    Stereo chunks must have a length divisible by 4 and mono chunks by 2.
    The fixture files are opened in ``with`` blocks so the handles are
    closed even when an assertion fails.
    """
    # stereo read
    with open('tests/streamp3/data/stereo.mp3', 'rb') as stream:
        decoder = MP3Decoder(stream)
        while True:
            chunk = decoder.read()
            if not chunk:
                break
            assert isinstance(chunk, bytes)
            assert len(chunk) % 4 == 0

    # mono read
    with open('tests/streamp3/data/mono.mp3', 'rb') as stream:
        decoder = MP3Decoder(stream)
        while True:
            chunk = decoder.read()
            if not chunk:
                break
            assert isinstance(chunk, bytes)
            assert len(chunk) % 2 == 0
def test_properties():
    """Check bit rate, sample rate and channel count for CBR, VBR and mono files.

    The fixture files are opened in ``with`` blocks so the handles are
    closed even when an assertion fails.
    """
    # constant bit rate
    with open('tests/streamp3/data/cbr.mp3', 'rb') as stream:
        decoder = MP3Decoder(stream)
        assert decoder.bit_rate == 128000
        assert decoder.sample_rate == 44100
        assert decoder.num_channels == 2

    # variable bit rate
    with open('tests/streamp3/data/vbr.mp3', 'rb') as stream:
        decoder = MP3Decoder(stream)
        assert decoder.bit_rate == 128000
        # the reported bit rate is expected to differ after one read
        decoder.read()
        assert decoder.bit_rate == 32000
        assert decoder.sample_rate == 44100
        assert decoder.num_channels == 2

    # mono
    with open('tests/streamp3/data/mono.mp3', 'rb') as stream:
        decoder = MP3Decoder(stream)
        assert decoder.bit_rate == 32000
        assert decoder.sample_rate == 16000
        assert decoder.num_channels == 1
def test_iterator_with_copy():
    """Check that ``provide_copy=True`` yields the raw input alongside each chunk.

    Iterating the decoder must give ``(chunk, raw)`` pairs, and the
    concatenated ``raw`` parts must equal the source file byte for byte.
    Raw parts are collected in a list and joined once rather than grown
    with repeated ``bytes`` concatenation, and the file is opened in
    ``with`` blocks so the handles are closed.
    """
    path = 'tests/streamp3/data/stereo.mp3'

    raw_parts = []
    with open(path, 'rb') as stream:
        decoder = MP3Decoder(stream, provide_copy=True)
        for chunk, raw in decoder:
            raw_parts.append(raw)
            assert isinstance(chunk, bytes)
            assert len(chunk) % 4 == 0
    raw_data = b''.join(raw_parts)

    with open(path, 'rb') as stream:
        ref_data = stream.read()

    # compare lengths first so a size mismatch gives a short failure message
    assert len(raw_data) == len(ref_data)
    assert raw_data == ref_data
def synthesize(self, utterance: str, mode: str, voice: str) -> None:
    """Render an utterance as speech and write the decoded audio to the output.

    The client's response is wrapped in a `SequenceIO`, decoded with
    `MP3Decoder`, and every decoded frame is written to the output.

    Args:
        utterance (str): string that needs to be rendered as speech.
        mode (str): synthesis mode to use with utterance.
            text, ssml, markdown, etc.
        voice (str): name of the tts voice.

    """
    response = self._client.synthesize(utterance, mode, voice)
    decoder = MP3Decoder(SequenceIO(response))
    for pcm in decoder:
        self._output.write(pcm)
def test_construction():
    """Check that invalid inputs are rejected and valid inputs are accepted.

    Fixture files are opened in ``with`` blocks so the handles are closed.
    The ID3-only fixture is opened *outside* ``pytest.raises`` so that a
    missing file fails the test instead of counting as the expected error.
    """
    # invalid stream
    with pytest.raises(Exception):
        MP3Decoder(BytesIO(b''))
    with pytest.raises(Exception):
        MP3Decoder(b'')
    with pytest.raises(Exception):
        MP3Decoder(b'invalid')
    with open('tests/streamp3/data/id3only.mp3', 'rb') as stream:
        with pytest.raises(Exception):
            MP3Decoder(stream)
    with pytest.raises(Exception):
        MP3Decoder(bytes([0xFF, 0xFF, 0xFF, 0xFF]))

    # valid bytes
    with open('tests/streamp3/data/noid3.mp3', 'rb') as stream:
        MP3Decoder(stream.read())

    # valid stream, no ID3
    with open('tests/streamp3/data/noid3.mp3', 'rb') as stream:
        MP3Decoder(stream)

    # valid stream with ID3
    with open('tests/streamp3/data/withid3.mp3', 'rb') as stream:
        MP3Decoder(stream)
def construct_pcm(audio):
    """Construct PCM data, appropriately spaced in time, given the provided
    audio object.

    The MP3 bytes are scanned for frame headers, decoded with MP3Decoder,
    and the decoded chunks are concatenated. Before each chunk, zero
    samples are inserted when the chunk's recorded time is later than the
    running end time.

    Args:
        audio: A dictionary with fields 'data' (mp3 data received; used as
            a list of byte values, since it is concatenated with lists
            below), 'data2time' (which maps the time of message receipt
            and data messages to each other; column 0 is checked against
            the detected header offsets and column 1 is compared with the
            running end time) and 'start' (initial value of the running
            end time).

    Returns:
        A list of ints, one per pair of consecutive output bytes, each
        decoded as a signed little-endian value.

    Raises:
        RuntimeError: if any value in column 0 of 'data2time' is not the
            offset of a detected header.
        AssertionError: if the decoder reports a sample rate other than
            16000 or a channel count other than 1.
    """
    # find headers to be sure that a header is valid, one must check
    # the rest of the frame. OMG, who designed MP3? why is there no
    # reserved word? Like limit the data to only allow 0x1*bit_depth at
    # the start of the header. Dumb
    data_arr = np.array(audio['data'])
    # A header candidate is a 4-byte window: 0xFF, 0xF3, a third byte whose
    # high nibble is not 0xF and whose bits 0x0C equal 0x08, then 0xC4.
    # `headers` holds the offset of the first byte of each match.
    headers = np.where((data_arr[0:-3] == 255) *
                       (data_arr[1:-2] == 243) *
                       (np.right_shift(data_arr[2:-1], 4) != 0x0F) *
                       (np.bitwise_and(data_arr[2:-1], 0x0C) == 0x08) *
                       (data_arr[3:] == 196))[0]
    # NOTE(review): the mask yields 0 or 2 (not 0 or 1); if this is meant
    # to be a one-byte padding flag it may need a shift -- confirm.
    padding = (data_arr[headers+2] & 0b10)
    data2time = np.array(audio['data2time'])
    if not np.isin(data2time[:, 0], headers).all():
        raise RuntimeError(
            'Some messages did not start with well formed headers')
    # TODO: handle messed up messages gracefully
    # TODO: check frame lengths
    # Per-header frame size, indexed by the high nibble of the third header
    # byte; presumably BITRATES is an array of bit rates in kbit/s -- verify
    # against its definition.
    frame_size = ((576 / 8 * (BITRATES[data_arr[headers+2] >> 4]))/16)+padding
    # always 576 samples/frame for V2 Layer III stereo and 1152 for mono?

    # PyDub
    # For whatever reason FFMPEG expects frames of framesize + header size.
    # I think it should just be frame size, I could pad it or something, but
    # that would change the underlying compression output:
    # ipdb> AudioSegment.from_file(io.BytesIO(bytes(audio['data'][0:432+4])))

    # streamp3
    # creates chunks of 16 bit PCM, but ends up missing two chunks?
    # trying to read with only first frame or only last frame just gives
    # nothing back.

    # `padding` is rebound here: it is now a dummy frame built from the first
    # detected header (4 bytes) followed by zeros up to the first frame size.
    padding = audio['data'][headers[0]:headers[0]+4]+[0]*int(frame_size[0]-4)
    # padding is needed to make the underlying stuff works. Here is an idea
    # as to why that might be: https://thebreakfastpost.com/2016/11/26/
    # mp3-decoding-with-the-mad-library-weve-all-been-doing-it-wrong/
    mp3_decoder = MP3Decoder(bytes(padding+audio['data']+padding))
    all_chunks = list(mp3_decoder)
    # the first frame will just be the padding coming back (doesn't make
    # sense to me either..)
    all_chunks = all_chunks[1:]
    #
    # int_chunks = [[(chunks_arr_elem[idx*2] << 8) | chunks_arr_elem[idx*2+1]
    #                for idx in range(int(len(chunks_arr_elem)/2))]
    #               for chunks_arr_elem in all_chunks]
    assert mp3_decoder.sample_rate == 16000
    assert mp3_decoder.num_channels == 1
    # assert (~(np.array([len(chunk) for chunk in int_chunks]) != 576)).all()
    LOGGER.info(
        'bit rate: %i, sample rate: %i, num channels: %i',
        mp3_decoder.bit_rate, mp3_decoder.sample_rate,
        mp3_decoder.num_channels)
    LOGGER.info(
        '%i valid headers found, but LAME only reads %i chunks',
        len(headers), len(all_chunks))
    # filled_data is a list of byte values: `+=` with a bytes object extends
    # the list with its individual ints.
    filled_data = []
    end_time = audio['start']
    for frame_idx, this_chunk in enumerate(all_chunks):
        # `data2time` is rebound here from the numpy array above to a single
        # row of audio['data2time'].
        # NOTE(review): indexing by chunk index assumes 'data2time' has at
        # least as many rows as there are decoded chunks -- confirm.
        data2time = audio['data2time'][frame_idx]
        diff = data2time[1] - end_time
        if diff > 0:
            # fill the gap with zero samples (two zero bytes per sample);
            # presumably times are in seconds, given the 16000 factor
            add_samples = int(diff*16000)
            end_time += add_samples/16000
            filled_data += (bytes([0, 0]*add_samples))
        filled_data += (this_chunk)
        # advance the running end time by 576 samples at 16000 Hz per chunk
        end_time += 576/16000
    # length = len(filled_data)/mp3_decoder.sample_rate
    return [int.from_bytes(bytes(filled_data[idx*2:idx*2+2]),
                           byteorder='little', signed=True)
            for idx in range(int(len(filled_data)/2))]