def testFloatWavDataToSamples(self):
  y = audio_io.wav_data_to_samples(self.wav_data, sample_rate=16000)
  wav_io = io.BytesIO()
  scipy.io.wavfile.write(wav_io, 16000, y)
  y_from_float = audio_io.wav_data_to_samples(
      wav_io.getvalue(), sample_rate=16000)
  np.testing.assert_array_equal(y, y_from_float)
def testWavDataToSamples(self):
  w = wave.open(self.wav_filename, 'rb')
  w_mono = wave.open(self.wav_filename_mono, 'rb')

  # Check content size.
  y = audio_io.wav_data_to_samples(self.wav_data, sample_rate=16000)
  y_mono = audio_io.wav_data_to_samples(self.wav_data_mono, sample_rate=22050)
  self.assertEqual(
      round(16000.0 * w.getnframes() / w.getframerate()), y.shape[0])
  self.assertEqual(
      round(22050.0 * w_mono.getnframes() / w_mono.getframerate()),
      y_mono.shape[0])

  # Check a few obvious failure modes.
  self.assertLess(0.01, y.std())
  self.assertLess(0.01, y_mono.std())
  self.assertGreater(-0.1, y.min())
  self.assertGreater(-0.1, y_mono.min())
  self.assertLess(0.1, y.max())
  self.assertLess(0.1, y_mono.max())
def _wav_to_mel(wav_audio, hparams):
  """Transforms the contents of a wav file into a series of mel spec frames."""
  y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

  mel = librosa.feature.melspectrogram(
      y,
      hparams.sample_rate,
      hop_length=hparams.spec_hop_length,
      fmin=hparams.spec_fmin,
      n_mels=hparams.spec_n_bins,
      htk=hparams.spec_mel_htk).astype(np.float32)

  # Transpose so that the data is in [frame, bins] format.
  mel = mel.T
  return mel
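
# Illustrative usage sketch for _wav_to_mel (an addition, not part of the
# original module). The hparams stand-in below uses assumed values, not this
# project's defaults; any HParams-like object with these attribute names
# works. With librosa's default centered framing, the output has
# 1 + len(samples) // hop_length rows, each with spec_n_bins mel bins.
def _example_wav_to_mel(wav_path):
  import collections
  FakeHParams = collections.namedtuple(
      'FakeHParams',
      ['sample_rate', 'spec_hop_length', 'spec_fmin', 'spec_n_bins',
       'spec_mel_htk'])
  hparams = FakeHParams(
      sample_rate=16000, spec_hop_length=512, spec_fmin=30.0,
      spec_n_bins=229, spec_mel_htk=True)
  with open(wav_path, 'rb') as f:
    mel = _wav_to_mel(f.read(), hparams)
  # mel.shape == (n_frames, hparams.spec_n_bins)
  return mel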
def _wav_to_cqt(wav_audio, hparams):
  """Transforms the contents of a wav file into a series of CQT frames."""
  y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

  cqt = np.abs(
      librosa.core.cqt(
          y,
          hparams.sample_rate,
          hop_length=hparams.spec_hop_length,
          fmin=hparams.spec_fmin,
          n_bins=hparams.spec_n_bins,
          bins_per_octave=hparams.cqt_bins_per_octave),
      dtype=np.float32)

  # Transpose so that the data is in [frame, bins] format.
  cqt = cqt.T
  return cqt
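
# Illustrative usage sketch for _wav_to_cqt (an addition, not part of the
# original module). The values below are assumptions, not this project's
# defaults; they are chosen so the top CQT bin,
# fmin * 2**(n_bins / bins_per_octave), stays below the Nyquist frequency,
# and so the hop length is divisible by 2**(n_octaves - 1) as librosa's
# multi-octave downsampling requires.
def _example_wav_to_cqt(wav_path):
  import collections
  FakeHParams = collections.namedtuple(
      'FakeHParams',
      ['sample_rate', 'spec_hop_length', 'spec_fmin', 'spec_n_bins',
       'cqt_bins_per_octave'])
  hparams = FakeHParams(
      sample_rate=16000, spec_hop_length=512, spec_fmin=32.7,
      spec_n_bins=84, cqt_bins_per_octave=12)
  with open(wav_path, 'rb') as f:
    cqt = _wav_to_cqt(f.read(), hparams)
  # cqt.shape == (n_frames, hparams.spec_n_bins)
  return cqt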
def _wav_to_framed_samples(wav_audio, hparams):
  """Transforms the contents of a wav file into a series of framed samples."""
  y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

  hl = hparams.spec_hop_length
  n_frames = int(np.ceil(y.shape[0] / hl))
  frames = np.zeros((n_frames, hl), dtype=np.float32)

  # Fill in everything but the last frame, which may not be the full length.
  cutoff = (n_frames - 1) * hl
  frames[:n_frames - 1, :] = np.reshape(y[:cutoff], (n_frames - 1, hl))
  # Fill the last frame.
  remain_len = len(y[cutoff:])
  frames[n_frames - 1, :remain_len] = y[cutoff:]

  return frames
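
# Illustrative sketch for _wav_to_framed_samples (an addition, not part of the
# original module): checks the framing invariant that every input sample lands
# in exactly one frame and the final frame is zero-padded to the hop length.
# The hparams stand-in uses assumed values; the WAV round trip quantizes the
# samples to 16-bit but preserves their count.
def _example_framed_samples_shape():
  import collections
  FakeHParams = collections.namedtuple(
      'FakeHParams', ['sample_rate', 'spec_hop_length'])
  hparams = FakeHParams(sample_rate=16000, spec_hop_length=512)
  y = np.random.uniform(-1.0, 1.0, size=16000 * 2 + 100).astype(np.float32)
  wav_audio = audio_io.samples_to_wav_data(y, hparams.sample_rate)
  frames = _wav_to_framed_samples(wav_audio, hparams)
  assert frames.shape == (int(np.ceil(len(y) / hparams.spec_hop_length)),
                          hparams.spec_hop_length)
  return frames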
def mix_examples(mixid_exs, sample_rate, load_audio_with_librosa):
  """Mix several Examples together to create a new example."""
  mixid, exs = mixid_exs
  del mixid

  example_samples = []
  example_sequences = []
  for ex_str in exs:
    ex = tf.train.Example.FromString(ex_str)
    wav_data = ex.features.feature['audio'].bytes_list.value[0]
    if load_audio_with_librosa:
      samples = audio_io.wav_data_to_samples_librosa(wav_data, sample_rate)
    else:
      samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
    example_samples.append(samples)
    ns = music_pb2.NoteSequence.FromString(
        ex.features.feature['sequence'].bytes_list.value[0])
    example_sequences.append(ns)

  mixed_samples, mixed_sequence = audio_label_data_utils.mix_sequences(
      individual_samples=example_samples,
      sample_rate=sample_rate,
      individual_sequences=example_sequences)

  mixed_wav_data = audio_io.samples_to_wav_data(mixed_samples, sample_rate)

  mixed_id = '::'.join(['mixed'] + [ns.id for ns in example_sequences])
  mixed_sequence.id = mixed_id
  mixed_filename = '::'.join(
      ['mixed'] + [ns.filename for ns in example_sequences])
  mixed_sequence.filename = mixed_filename

  examples = list(
      audio_label_data_utils.process_record(
          mixed_wav_data,
          mixed_sequence,
          mixed_id,
          min_length=0,
          max_length=-1,
          sample_rate=sample_rate))
  assert len(examples) == 1
  return examples[0]
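
# Illustrative usage sketch for mix_examples (an addition, not part of the
# original module). mix_examples expects a (mix id, iterable of serialized
# Example protos) pair, such as one element of the output of grouping
# (mix id, example) pairs by key in a pipeline. The helper and its names
# below are assumptions for demonstration only.
def _example_mix_two(example_a, example_b, sample_rate=16000):
  mixid_exs = ('mix-0',
               [example_a.SerializeToString(), example_b.SerializeToString()])
  return mix_examples(mixid_exs, sample_rate, load_audio_with_librosa=False)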
def testSplitAudioLabelData(self):
  wav_data, sequence = self._CreateSyntheticExample()
  records = audio_label_data_utils.process_record(
      wav_data, sequence, 'test', sample_rate=SAMPLE_RATE)

  for record in records:
    audio = record.features.feature['audio'].bytes_list.value[0]
    velocity_range = music_pb2.VelocityRange.FromString(
        record.features.feature['velocity_range'].bytes_list.value[0])
    note_sequence = music_pb2.NoteSequence.FromString(
        record.features.feature['sequence'].bytes_list.value[0])

    expected_samples = np.zeros(10 * SAMPLE_RATE)
    np.testing.assert_array_equal(
        expected_samples,
        audio_io.wav_data_to_samples(audio, sample_rate=SAMPLE_RATE))
    self.assertEqual(velocity_range.min, 20)
    self.assertEqual(velocity_range.max, 80)
    self.assertEqual(note_sequence.notes[0].velocity, 20)
    self.assertEqual(note_sequence.notes[0].end_time, 5.)
    self.assertEqual(note_sequence.notes[1].velocity, 80)
    self.assertEqual(note_sequence.notes[1].end_time, 10.)
def process_record(wav_data,
                   ns,
                   example_id,
                   min_length=5,
                   max_length=20,
                   sample_rate=16000,
                   allow_empty_notesequence=False,
                   load_audio_with_librosa=False):
  """Split a record into chunks and create an example proto.

  To use the full length audio and notesequence, set min_length=0 and
  max_length=-1.

  Args:
    wav_data: audio data in WAV format.
    ns: corresponding NoteSequence.
    example_id: id for the example proto.
    min_length: minimum length in seconds for audio chunks.
    max_length: maximum length in seconds for audio chunks.
    sample_rate: desired audio sample rate.
    allow_empty_notesequence: whether an empty NoteSequence is allowed.
    load_audio_with_librosa: Use librosa for sampling. Works with 24-bit wavs.

  Yields:
    Example protos.
  """
  try:
    if load_audio_with_librosa:
      samples = audio_io.wav_data_to_samples_librosa(wav_data, sample_rate)
    else:
      samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
  except audio_io.AudioIOReadError as e:
    print('Exception %s' % e)
    return
  samples = librosa.util.normalize(samples, norm=np.inf)

  # Add padding to samples if notesequence is longer.
  pad_to_samples = int(math.ceil(ns.total_time * sample_rate))
  padding_needed = pad_to_samples - samples.shape[0]
  if padding_needed > 5 * sample_rate:
    raise ValueError(
        'Would have padded {} more than 5 seconds to match note sequence total '
        'time. ({} original samples, {} sample rate, {} sample seconds, '
        '{} sequence seconds) This likely indicates a problem with the source '
        'data.'.format(
            example_id, samples.shape[0], sample_rate,
            samples.shape[0] / sample_rate, ns.total_time))
  samples = np.pad(samples, (0, max(0, padding_needed)), 'constant')

  if max_length == min_length:
    splits = np.arange(0, ns.total_time, max_length)
  elif max_length > 0:
    splits = find_split_points(ns, samples, sample_rate, min_length,
                               max_length)
  else:
    splits = [0, ns.total_time]

  velocity_range = velocity_range_from_sequence(ns)

  for start, end in zip(splits[:-1], splits[1:]):
    if end - start < min_length:
      continue

    if start == 0 and end == ns.total_time:
      new_ns = ns
    else:
      new_ns = sequences_lib.extract_subsequence(ns, start, end)

    if not new_ns.notes and not allow_empty_notesequence:
      tf.logging.warning('skipping empty sequence')
      continue

    if start == 0 and end == ns.total_time:
      new_samples = samples
    else:
      # The resampling that happens in crop_wav_data is really slow, and we've
      # already done it once, so avoid doing it twice.
      new_samples = audio_io.crop_samples(samples, sample_rate, start,
                                          end - start)
    new_wav_data = audio_io.samples_to_wav_data(new_samples, sample_rate)
    yield create_example(
        example_id, new_ns, new_wav_data, velocity_range=velocity_range)
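
# Illustrative usage sketch for process_record (an addition, not part of the
# original module): converts one full-length record into a single example by
# disabling chunking with min_length=0 and max_length=-1, as the docstring
# describes. The wav_path argument and helper name are hypothetical.
def _example_process_full_length(wav_path, ns):
  with open(wav_path, 'rb') as f:
    wav_data = f.read()
  return list(
      process_record(
          wav_data, ns, example_id=ns.id, min_length=0, max_length=-1,
          sample_rate=16000))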