def process_record(wav_data, ns, example_id, min_length=5, max_length=20,
                   sample_rate=16000, allow_empty_notesequence=False):
  """Split a record into chunks and create an example proto.

  To use the full length audio and notesequence, set min_length=0 and
  max_length=-1.

  Args:
    wav_data: audio data in WAV format.
    ns: corresponding NoteSequence.
    example_id: id for the example proto
    min_length: minimum length in seconds for audio chunks.
    max_length: maximum length in seconds for audio chunks.
    sample_rate: desired audio sample rate.
    allow_empty_notesequence: whether an empty NoteSequence is allowed.

  Yields:
    Example protos.
  """
  audio = librosa.util.normalize(
      audio_io.wav_data_to_samples(wav_data, sample_rate), norm=np.inf)

  # Choose chunk boundaries: fixed grid, note-aware split points, or the
  # whole record when max_length is non-positive.
  if max_length == min_length:
    boundaries = np.arange(0, ns.total_time, max_length)
  elif max_length > 0:
    boundaries = find_split_points(ns, audio, sample_rate, min_length,
                                   max_length)
  else:
    boundaries = [0, ns.total_time]

  note_velocities = [note.velocity for note in ns.notes]
  velocity_range = music_pb2.VelocityRange(
      min=np.min(note_velocities) if note_velocities else 0,
      max=np.max(note_velocities) if note_velocities else 0)

  for chunk_start, chunk_end in zip(boundaries[:-1], boundaries[1:]):
    if chunk_end - chunk_start < min_length:
      continue

    covers_whole_record = chunk_start == 0 and chunk_end == ns.total_time
    if covers_whole_record:
      chunk_ns = ns
    else:
      chunk_ns = sequences_lib.extract_subsequence(ns, chunk_start, chunk_end)

    if not chunk_ns.notes and not allow_empty_notesequence:
      tf.logging.warning('skipping empty sequence')
      continue

    if covers_whole_record:
      chunk_samples = audio
    else:
      # Crop the already-decoded samples directly: crop_wav_data would
      # repeat the slow resampling we have already done once.
      chunk_samples = audio_io.crop_samples(audio, sample_rate, chunk_start,
                                            chunk_end - chunk_start)
    chunk_wav = audio_io.samples_to_wav_data(chunk_samples, sample_rate)
    yield create_example(
        example_id, chunk_ns, chunk_wav, velocity_range=velocity_range)
def generate_test_set():
  """Generate the test TFRecord."""
  test_file_pairs = []
  for directory in test_dirs:
    pattern = os.path.join(FLAGS.input_dir, directory, '*.wav')
    # Pair every wav with its like-named .mid file.
    for wav_file in glob.glob(pattern):
      root, _ = os.path.splitext(wav_file)
      test_file_pairs.append((wav_file, root + '.mid'))

  test_output_name = os.path.join(FLAGS.output_dir,
                                  'maps_config2_test.tfrecord')
  total = len(test_file_pairs)
  with tf.python_io.TFRecordWriter(test_output_name) as writer:
    for idx, (wav_path, mid_path) in enumerate(test_file_pairs):
      print('{} of {}: {}'.format(idx, total, wav_path))
      # Load and resample the audio, then round-trip it back to WAV bytes.
      samples = audio_io.load_audio(wav_path, FLAGS.sample_rate)
      wav_data = audio_io.samples_to_wav_data(samples, FLAGS.sample_rate)
      # Convert the paired MIDI file into the NoteSequence label.
      ns = midi_io.midi_file_to_note_sequence(mid_path)
      example = split_audio_and_label_data.create_example(
          wav_path, ns, wav_data)
      writer.write(example.SerializeToString())

  return [filename_to_id(wav) for wav, _ in test_file_pairs]
def _ValidateProvideBatchMemory(self, truncated_length, batch_size, lengths,
                                expected_num_inputs):
  """Validate batching of in-memory serialized examples.

  Builds one synthetic example per entry in `lengths` (silent audio plus a
  matching synthetic NoteSequence), then checks that providing a batch from
  the serialized examples yields `expected_num_inputs` inputs.
  """
  hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
  examples = []
  expected_inputs = []
  for i, length in enumerate(lengths):
    # Fix: `np.int` was deprecated and removed in NumPy 1.24; the builtin
    # int performs the identical truncation here.
    num_samples = int((length / data.hparams_frames_per_second(hparams)) *
                      hparams.sample_rate)
    wav_samples = np.zeros((num_samples, 1), np.float32)
    wav_data = audio_io.samples_to_wav_data(wav_samples, hparams.sample_rate)
    num_frames = data.wav_to_num_frames(
        wav_data, frames_per_second=data.hparams_frames_per_second(hparams))
    seq = self._SyntheticSequence(
        num_frames / data.hparams_frames_per_second(hparams),
        i + constants.MIN_MIDI_PITCH)
    examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
    expected_inputs += self._ExampleToInputs(examples[-1], truncated_length)
  self.assertEqual(expected_num_inputs, len(expected_inputs))
  self._ValidateProvideBatch(
      [e.SerializeToString() for e in examples],
      truncated_length, batch_size, expected_inputs)
def _CreateExamplesAndExpectedInputs(self, truncated_length, lengths,
                                     expected_num_inputs):
  """Build synthetic examples and the inputs a provider should emit.

  Returns:
    A tuple (examples, expected_inputs) where `examples` holds one filled
    Example per entry in `lengths` and `expected_inputs` holds the inputs
    expected after truncation to `truncated_length`.
  """
  hparams = copy.deepcopy(configs.DEFAULT_HPARAMS)
  examples = []
  expected_inputs = []
  for i, length in enumerate(lengths):
    # Fix: `np.int` was deprecated and removed in NumPy 1.24; the builtin
    # int performs the identical truncation here.
    num_samples = int((length / data.hparams_frames_per_second(hparams)) *
                      hparams.sample_rate)
    wav_samples = np.zeros((num_samples, 1), np.float32)
    wav_data = audio_io.samples_to_wav_data(wav_samples, hparams.sample_rate)
    num_frames = data.wav_to_num_frames(
        wav_data, frames_per_second=data.hparams_frames_per_second(hparams))
    seq = self._SyntheticSequence(
        num_frames / data.hparams_frames_per_second(hparams),
        i + constants.MIN_MIDI_PITCH)
    examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
    expected_inputs += self._ExampleToInputs(examples[-1], truncated_length)
  self.assertEqual(expected_num_inputs, len(expected_inputs))
  return examples, expected_inputs
def create_example(filename, hparams):
  """Processes an audio file into an Example proto.

  Args:
    filename: path of the audio file to load.
    hparams: hyperparameters providing `sample_rate` and `normalize_audio`.

  Returns:
    A serialized tf.train.Example containing the audio, the filename as id,
    and empty NoteSequence / VelocityRange placeholders.
  """
  wav_data = librosa.core.load(filename, sr=hparams.sample_rate)[0]
  if hparams.normalize_audio:
    # Bug fix: the return value was previously discarded, so normalization
    # never took effect. Keep the normalized samples.
    wav_data = audio_io.normalize_wav_data(wav_data, hparams.sample_rate)
  wav_data = audio_io.samples_to_wav_data(wav_data, hparams.sample_rate)
  example = tf.train.Example(features=tf.train.Features(feature={
      'id':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[filename.encode('utf-8')])),
      'sequence':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[music_pb2.NoteSequence().SerializeToString()])),
      'audio':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[wav_data])),
      'velocity_range':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[music_pb2.VelocityRange().SerializeToString()])),
  }))
  return example.SerializeToString()
def create_example(self, filename, hparams):
  """Processes an audio file into an Example proto.

  Args:
    filename: path of the audio file to load.
    hparams: hyperparameters providing `sample_rate` and `normalize_audio`.

  Returns:
    A serialized tf.train.Example containing the audio, the filename as id,
    and empty NoteSequence / VelocityRange placeholders.
  """
  wav_data = librosa.core.load(filename, sr=hparams.sample_rate)[0]
  if hparams.normalize_audio:
    # Bug fix: the return value was previously discarded, so normalization
    # never took effect. Keep the normalized samples.
    wav_data = audio_io.normalize_wav_data(wav_data, hparams.sample_rate)
  wav_data = audio_io.samples_to_wav_data(wav_data, hparams.sample_rate)
  example = tf.train.Example(features=tf.train.Features(
      feature={
          'id': tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[filename.encode('utf-8')])),
          'sequence': tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[music_pb2.NoteSequence().SerializeToString()])),
          'audio': tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[wav_data])),
          'velocity_range': tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[music_pb2.VelocityRange().SerializeToString()])),
      }))
  return example.SerializeToString()
def _ValidateProvideBatchMemory(self, truncated_length, batch_size, lengths,
                                expected_num_inputs):
  """Validate batching of in-memory serialized examples.

  Builds one synthetic silent-audio example per entry in `lengths` and
  checks the provider emits `expected_num_inputs` inputs for them.
  """
  hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
  examples = []
  expected_inputs = []
  for i, length in enumerate(lengths):
    # Fix: `np.int` was deprecated and removed in NumPy 1.24; the builtin
    # int performs the identical truncation here.
    num_samples = int((length / data.hparams_frames_per_second(hparams)) *
                      hparams.sample_rate)
    wav_samples = np.zeros((num_samples, 1), np.float32)
    wav_data = audio_io.samples_to_wav_data(wav_samples, hparams.sample_rate)
    num_frames = data.wav_to_num_frames(
        wav_data, frames_per_second=data.hparams_frames_per_second(hparams))
    seq = self._SyntheticSequence(
        num_frames / data.hparams_frames_per_second(hparams),
        i + constants.MIN_MIDI_PITCH)
    examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
    expected_inputs += self._ExampleToInputs(examples[-1], truncated_length)
  self.assertEqual(expected_num_inputs, len(expected_inputs))
  self._ValidateProvideBatch([e.SerializeToString() for e in examples],
                             truncated_length, batch_size, expected_inputs)
def validateProvideBatch_TFRecord(self, truncated_length, batch_size, lengths,
                                  expected_num_inputs):
  """Validate batching from a temporary TFRecord file.

  Builds synthetic examples, serializes them into a TFRecord, then checks
  the provider yields `expected_num_inputs` inputs from that file.
  """
  hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
  examples = []
  expected_inputs = []
  for i, length in enumerate(lengths):
    # Fix: `np.int` was deprecated and removed in NumPy 1.24; the builtin
    # int performs the identical truncation here.
    num_samples = int((length / data.hparams_frames_per_second(hparams)) *
                      constants.DEFAULT_SAMPLE_RATE)
    wav_samples = np.zeros((num_samples, 1), np.float32)
    wav_data = audio_io.samples_to_wav_data(wav_samples,
                                            constants.DEFAULT_SAMPLE_RATE)
    num_frames = data.wav_to_num_frames(
        wav_data, frames_per_second=data.hparams_frames_per_second(hparams))
    seq = self._SyntheticSequence(
        num_frames / data.hparams_frames_per_second(hparams),
        i + constants.MIN_MIDI_PITCH)
    examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
    expected_inputs += self._ExampleToInputs(examples[-1], truncated_length)
  self.assertEqual(expected_num_inputs, len(expected_inputs))
  with tempfile.NamedTemporaryFile() as temp_rio:
    with tf.python_io.TFRecordWriter(temp_rio.name) as writer:
      for ex in examples:
        writer.write(ex.SerializeToString())
    self.validateProvideBatch(temp_rio.name, truncated_length, batch_size,
                              expected_inputs)
def generate_test_set():
  """Generate the test TFRecord."""
  test_file_pairs = []
  for directory in test_dirs:
    wav_glob = os.path.join(FLAGS.input_dir, directory, '*.wav')
    # find matching mid files
    for wav_file in glob.glob(wav_glob):
      stem, _ = os.path.splitext(wav_file)
      test_file_pairs.append((wav_file, stem + '.mid'))

  test_output_name = os.path.join(FLAGS.output_dir,
                                  'maps_config2_test.tfrecord')
  pair_count = len(test_file_pairs)
  with tf.python_io.TFRecordWriter(test_output_name) as writer:
    for idx, (wav_filename, mid_filename) in enumerate(test_file_pairs):
      print('{} of {}: {}'.format(idx, pair_count, wav_filename))
      # load the wav data and resample it.
      samples = audio_io.load_audio(wav_filename, FLAGS.sample_rate)
      wav_data = audio_io.samples_to_wav_data(samples, FLAGS.sample_rate)
      # load the midi data and convert to a notesequence
      ns = midi_io.midi_file_to_note_sequence(mid_filename)
      writer.write(
          audio_label_data_utils.create_example(
              wav_filename, ns, wav_data).SerializeToString())

  return [filename_to_id(wav) for wav, _ in test_file_pairs]
def generate_test_set():
  """Generate the test TFRecord.

  Returns:
    The ids of all wav files written, for excluding from the train set.
  """
  test_file_pairs = []
  for directory in test_dirs:
    path = os.path.join(FLAGS.input_dir, directory, '*.wav')
    # find matching mid files
    for wav_file in glob.glob(path):
      base_name_root, _ = os.path.splitext(wav_file)
      test_file_pairs.append((wav_file, base_name_root + '.mid'))

  test_output_name = os.path.join(FLAGS.output_dir,
                                  'maps_config2_test.tfrecord')
  with tf.python_io.TFRecordWriter(test_output_name) as writer:
    for pair in test_file_pairs:
      print(pair)
      # load the wav data and resample it.
      samples = audio_io.load_audio(pair[0], FLAGS.sample_rate)
      wav_data = audio_io.samples_to_wav_data(samples, FLAGS.sample_rate)
      # load the midi data and convert to a notesequence
      midi_data = tf.gfile.Open(pair[1]).read()
      ns = midi_io.midi_to_sequence_proto(midi_data)
      velocities = [note.velocity for note in ns.notes]
      # Bug fix: np.max/np.min raise on an empty list; fall back to 0 for a
      # NoteSequence with no notes.
      velocity_max = np.max(velocities) if velocities else 0
      velocity_min = np.min(velocities) if velocities else 0
      new_velocity_tuple = music_pb2.VelocityRange(
          min=velocity_min, max=velocity_max)
      example = tf.train.Example(features=tf.train.Features(feature={
          # Bug fix: BytesList requires bytes; a str path raises TypeError
          # on Python 3, so encode the filename.
          'id': tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[pair[0].encode('utf-8')])),
          'sequence': tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[ns.SerializeToString()])),
          'audio': tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[wav_data])),
          'velocity_range': tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[new_velocity_tuple.SerializeToString()])),
      }))
      writer.write(example.SerializeToString())

  return [filename_to_id(wav) for wav, _ in test_file_pairs]
def process(self, paths):
  """Turn one (wav_path, midi_path) pair into serialized Example protos."""
  wav_path, midi_path = paths

  # Build the label NoteSequence; an empty sequence stands in when there is
  # no MIDI file for this recording.
  if midi_path:
    if FLAGS.use_midi_stems:
      base_ns = note_sequence_from_directory(os.path.dirname(midi_path))
    else:
      base_ns = midi_io.midi_file_to_note_sequence(midi_path)
    base_ns.filename = midi_path
  else:
    base_ns = music_pb2.NoteSequence()

  logging.info('Creating Example %s:%s', midi_path, wav_path)
  if FLAGS.convert_flac:
    samples, sr = librosa.load(wav_path, FLAGS.sample_rate)
    wav_data = audio_io.samples_to_wav_data(samples, sr)
  else:
    wav_data = tf.io.gfile.GFile(wav_path, 'rb').read()

  ns = copy.deepcopy(base_ns)
  # Use base names.
  ns.id = '%s:%s' % (wav_path, midi_path)
  Metrics.counter('create_example', 'read_midi_wav').inc()

  if FLAGS.max_length > 0:
    chunks = audio_label_data_utils.process_record(
        wav_data,
        ns,
        ns.id,
        min_length=FLAGS.min_length,
        max_length=FLAGS.max_length,
        sample_rate=FLAGS.sample_rate,
        load_audio_with_librosa=False)
    for chunk in chunks:
      Metrics.counter('split_wav', 'split_example').inc()
      yield chunk
  else:
    whole = audio_label_data_utils.create_example(ns.id, ns, wav_data)
    Metrics.counter('create_example', 'created_example').inc()
    yield whole
def _ValidateProvideBatchTFRecord(self, truncated_length, batch_size, lengths,
                                  expected_num_inputs, crop_sequence_secs=0):
  """Validate batching from a temporary TFRecord file.

  Args:
    truncated_length: length to truncate each example to.
    batch_size: batch size to request from the provider.
    lengths: per-example synthetic audio lengths (in frames).
    expected_num_inputs: total number of inputs expected.
    crop_sequence_secs: seconds cropped from each end of the sequence; when
      positive, crop-to-notes behavior is also validated.
  """
  hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
  examples = []
  expected_inputs = []
  for i, length in enumerate(lengths):
    # Fix: `np.int` was deprecated and removed in NumPy 1.24; the builtin
    # int performs the identical truncation here.
    num_samples = int((length / data.hparams_frames_per_second(hparams)) *
                      hparams.sample_rate)
    wav_samples = np.zeros((num_samples, 1), np.float32)
    wav_data = audio_io.samples_to_wav_data(wav_samples, hparams.sample_rate)
    num_frames = data.wav_to_num_frames(
        wav_data, frames_per_second=data.hparams_frames_per_second(hparams))
    seq = self._SyntheticSequence(
        num_frames / data.hparams_frames_per_second(hparams) -
        crop_sequence_secs * 2,  # crop from both ends.
        i + constants.MIN_MIDI_PITCH,
        start_time=crop_sequence_secs)
    examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
    expected_inputs += self._ExampleToInputs(
        examples[-1],
        truncated_length,
        crop_training_sequence_to_notes=crop_sequence_secs > 0)
  self.assertEqual(expected_num_inputs, len(expected_inputs))
  with tempfile.NamedTemporaryFile() as temp_tfr:
    with tf.python_io.TFRecordWriter(temp_tfr.name) as writer:
      for ex in examples:
        writer.write(ex.SerializeToString())
    self._ValidateProvideBatch(
        temp_tfr.name,
        truncated_length,
        batch_size,
        expected_inputs,
        crop_training_sequence_to_notes=crop_sequence_secs > 0)
def mix_examples(mixid_exs, sample_rate, load_audio_with_librosa):
  """Mix several Examples together to create a new example."""
  _, exs = mixid_exs

  example_samples = []
  example_sequences = []
  for ex_str in exs:
    ex = tf.train.Example.FromString(ex_str)
    wav_data = ex.features.feature['audio'].bytes_list.value[0]
    # Decode the WAV payload; librosa handles 24-bit files.
    if load_audio_with_librosa:
      decoded = audio_io.wav_data_to_samples_librosa(wav_data, sample_rate)
    else:
      decoded = audio_io.wav_data_to_samples(wav_data, sample_rate)
    example_samples.append(decoded)
    example_sequences.append(
        music_pb2.NoteSequence.FromString(
            ex.features.feature['sequence'].bytes_list.value[0]))

  mixed_samples, mixed_sequence = audio_label_data_utils.mix_sequences(
      individual_samples=example_samples,
      sample_rate=sample_rate,
      individual_sequences=example_sequences)
  mixed_wav_data = audio_io.samples_to_wav_data(mixed_samples, sample_rate)

  # Stamp the mixed sequence with id/filename derived from its sources.
  mixed_id = '::'.join(['mixed'] + [ns.id for ns in example_sequences])
  mixed_sequence.id = mixed_id
  mixed_sequence.filename = '::'.join(
      ['mixed'] + [ns.filename for ns in example_sequences])

  # min_length=0 / max_length=-1 keeps the full-length record, so exactly
  # one example comes back.
  examples = list(
      audio_label_data_utils.process_record(
          mixed_wav_data,
          mixed_sequence,
          mixed_id,
          min_length=0,
          max_length=-1,
          sample_rate=sample_rate))
  assert len(examples) == 1
  return examples[0]
def validateProvideBatch_TFRecord(self, truncated_length, batch_size, lengths,
                                  expected_num_inputs):
  """Validate batching from a temporary TFRecord file.

  Builds synthetic silent-audio examples, writes them to a TFRecord, then
  checks the provider yields `expected_num_inputs` inputs from that file.
  """
  hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
  examples = []
  expected_inputs = []
  for i, length in enumerate(lengths):
    # Fix: `np.int` was deprecated and removed in NumPy 1.24; the builtin
    # int performs the identical truncation here.
    num_samples = int((length / data.hparams_frames_per_second(hparams)) *
                      constants.DEFAULT_SAMPLE_RATE)
    wav_samples = np.zeros((num_samples, 1), np.float32)
    wav_data = audio_io.samples_to_wav_data(wav_samples,
                                            constants.DEFAULT_SAMPLE_RATE)
    num_frames = data.wav_to_num_frames(
        wav_data, frames_per_second=data.hparams_frames_per_second(hparams))
    seq = self._SyntheticSequence(
        num_frames / data.hparams_frames_per_second(hparams),
        i + constants.MIN_MIDI_PITCH)
    examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
    expected_inputs += self._ExampleToInputs(examples[-1], truncated_length)
  self.assertEqual(expected_num_inputs, len(expected_inputs))
  with tempfile.NamedTemporaryFile() as temp_rio:
    with tf.python_io.TFRecordWriter(temp_rio.name) as writer:
      for ex in examples:
        writer.write(ex.SerializeToString())
    self.validateProvideBatch(temp_rio.name, truncated_length, batch_size,
                              expected_inputs)
def reduce_audio_in_batch(tensor, hparams=None, is_training=True):
  """Mix per-instrument audio clips into one combined training example.

  Takes `hparams.timbre_training_max_instruments` clips from `tensor`
  (presumably a batched dataset element with sparse 'audio' and dense
  'pitch' / 'instrument_family' entries — TODO confirm against the input
  pipeline), offsets each clip by a random start time, sums them into a
  single fixed-length audio signal, and returns the mixed WAV bytes plus
  per-note cropping and instrument-family labels.

  NOTE(review): `is_training` is accepted but never read in this body.
  """
  instrument_count = hparams.timbre_training_max_instruments
  note_croppping_list = []     # one NoteCropping(pitch, start_idx, end_idx) per clip
  instrument_family_list = []  # one-hot family vector per clip
  samples_list = []
  max_length = 0
  for i in range(instrument_count):
    pitch = tensor['pitch'][i]
    # Move the audio so there are different attack times.
    start_idx = tf.random.uniform((), minval=0,
                                  maxval=hparams.timbre_max_start_offset,
                                  dtype='int64')
    samples = K.concatenate(
        [tf.zeros(start_idx), tf.sparse.to_dense(tensor['audio'])[i]])
    # Approximate where the note ends so it can be cropped later.
    end_idx = (
        start_idx
        + tf.py_function(_get_approx_note_length,
                         [tf.sparse.to_dense(tensor['audio'])[i]],
                         tf.int64))
    # Truncate clips that would run past the configured maximum length.
    if hparams.timbre_max_len and end_idx > hparams.timbre_max_len:
      samples = tf.slice(samples, begin=[0], size=[hparams.timbre_max_len])
      end_idx = hparams.timbre_max_len
    if len(samples) > max_length:
      max_length = len(samples)
    samples_list.append(samples)
    instrument_family = tensor['instrument_family'][i]
    note_croppping_list.append(
        timbre_dataset_util.NoteCropping(pitch=pitch,
                                         start_idx=start_idx,
                                         end_idx=end_idx))
    instrument_family_list.append(
        tf.one_hot(tf.cast(instrument_family, tf.int32),
                   hparams.timbre_num_classes))
  # Pad the end of the shorter audio clips.
  samples_list = list(
      map(lambda x: tf.pad(x, [[0, max_length - len(x)]]), samples_list))
  # Average the clips so the mix amplitude is independent of the count.
  combined_samples = (
      tf.reduce_sum(tf.convert_to_tensor(samples_list), axis=0)
      / instrument_count)
  # Ensure all audios in batches are the same length.
  if hparams.timbre_max_len:
    pad_length = hparams.timbre_max_len
  else:
    pad_length = hparams.timbre_max_start_offset + 5 * hparams.sample_rate
  combined_samples = tf.pad(
      combined_samples,
      [[0, pad_length - tf.shape(combined_samples)[0]]])
  note_croppings = tf.convert_to_tensor(note_croppping_list, dtype=tf.int32)
  instrument_families = tf.convert_to_tensor(instrument_family_list,
                                             dtype=tf.int32)
  # Serialize the mixed audio back to WAV bytes inside the graph.
  wav_data = tf.py_function(
      lambda x: audio_io.samples_to_wav_data(
          x.numpy(), sample_rate=hparams.sample_rate),
      [combined_samples],
      tf.string)
  return dict(
      audio=wav_data,
      note_croppings=note_croppings,
      instrument_families=instrument_families,
  )
def _CreateSyntheticExample(self):
  """Return (wav_data, sequence): 9 seconds of silence plus a synthetic sequence."""
  sequence = self._CreateSyntheticSequence()
  silence = np.zeros(9 * SAMPLE_RATE, np.float32)
  return audio_io.samples_to_wav_data(silence, SAMPLE_RATE), sequence
filename = 'richard.wav'
file_dir = './data/wav_format/' + filename
# Read the wav bytes; the context manager closes the handle (the original
# leaked it).
with open(file_dir, 'rb') as content:
  uploaded = {file_dir: content.read()}

to_process = []
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  # Bug fix: the payload is bytes, so the copy must be opened in binary
  # mode ('wb'); text mode ('w') raises TypeError on Python 3. Use a
  # context manager so the handle is flushed and closed before reloading.
  with open(fn, 'wb') as out_file:
    out_file.write(uploaded[fn])
  # Load, resample, and peak-normalize the audio, then re-encode as WAV.
  wav_data = audio_io.samples_to_wav_data(
      librosa.util.normalize(
          librosa.core.load(fn, sr=hparams.sample_rate)[0]),
      hparams.sample_rate)
  example = tf.train.Example(features=tf.train.Features(
      feature={
          'id':
              tf.train.Feature(bytes_list=tf.train.BytesList(
                  value=[fn.encode('utf-8')])),
          'sequence':
              tf.train.Feature(bytes_list=tf.train.BytesList(
                  value=[music_pb2.NoteSequence().SerializeToString()])),
          'audio':
              tf.train.Feature(bytes_list=tf.train.BytesList(
                  value=[wav_data])),
      }))
  to_process.append(example.SerializeToString())
  print('Processing complete for', fn)
def _CreateSyntheticExample(self):
  """Return (wav_data, sequence): 2 seconds of silence plus a synthetic sequence."""
  sequence = self._CreateSyntheticSequence()
  silence = np.zeros(2 * SAMPLE_RATE, np.float32)
  return audio_io.samples_to_wav_data(silence, SAMPLE_RATE), sequence
def process_record(wav_data,
                   ns,
                   example_id,
                   min_length=5,
                   max_length=20,
                   sample_rate=16000,
                   allow_empty_notesequence=False,
                   load_audio_with_librosa=False):
  """Split a record into chunks and create an example proto.

  To use the full length audio and notesequence, set min_length=0 and
  max_length=-1.

  Args:
    wav_data: audio data in WAV format.
    ns: corresponding NoteSequence.
    example_id: id for the example proto
    min_length: minimum length in seconds for audio chunks.
    max_length: maximum length in seconds for audio chunks.
    sample_rate: desired audio sample rate.
    allow_empty_notesequence: whether an empty NoteSequence is allowed.
    load_audio_with_librosa: Use librosa for sampling. Works with 24-bit wavs.

  Yields:
    Example protos.

  Raises:
    ValueError: if the audio would need more than 5 seconds of padding to
      match the NoteSequence's total time.
  """
  try:
    if load_audio_with_librosa:
      samples = audio_io.wav_data_to_samples_librosa(wav_data, sample_rate)
    else:
      samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
  except audio_io.AudioIOReadError as e:
    # Bug fix: print() does not do logging-style lazy %-interpolation, so
    # the exception was never rendered into the message.
    print('Exception %s' % e)
    return
  samples = librosa.util.normalize(samples, norm=np.inf)

  # Add padding to samples if notesequence is longer.
  pad_to_samples = int(math.ceil(ns.total_time * sample_rate))
  padding_needed = pad_to_samples - samples.shape[0]
  if padding_needed > 5 * sample_rate:
    raise ValueError(
        'Would have padded {} more than 5 seconds to match note sequence total '
        'time. ({} original samples, {} sample rate, {} sample seconds, '
        '{} sequence seconds) This likely indicates a problem with the source '
        'data.'.format(example_id, samples.shape[0], sample_rate,
                       samples.shape[0] / sample_rate, ns.total_time))
  samples = np.pad(samples, (0, max(0, padding_needed)), 'constant')

  if max_length == min_length:
    splits = np.arange(0, ns.total_time, max_length)
  elif max_length > 0:
    splits = find_split_points(ns, samples, sample_rate, min_length,
                               max_length)
  else:
    splits = [0, ns.total_time]

  velocities = [note.velocity for note in ns.notes]
  velocity_max = np.max(velocities) if velocities else 0
  velocity_min = np.min(velocities) if velocities else 0
  velocity_range = music_pb2.VelocityRange(min=velocity_min, max=velocity_max)

  for start, end in zip(splits[:-1], splits[1:]):
    if end - start < min_length:
      continue

    if start == 0 and end == ns.total_time:
      new_ns = ns
    else:
      new_ns = sequences_lib.extract_subsequence(ns, start, end)

    if not new_ns.notes and not allow_empty_notesequence:
      tf.logging.warning('skipping empty sequence')
      continue

    if start == 0 and end == ns.total_time:
      new_samples = samples
    else:
      # the resampling that happen in crop_wav_data is really slow
      # and we've already done it once, avoid doing it twice
      new_samples = audio_io.crop_samples(samples, sample_rate, start,
                                          end - start)
    new_wav_data = audio_io.samples_to_wav_data(new_samples, sample_rate)
    yield create_example(
        example_id, new_ns, new_wav_data, velocity_range=velocity_range)
def generate_train_set(exclude_ids):
  """Generate the train TFRecord."""
  train_file_pairs = []
  for directory in TRAIN_DIRS:
    pattern = os.path.join(FLAGS.input_dir, directory, '*.wav')
    # find matching mid files
    for wav_file in glob.glob(pattern):
      root, _ = os.path.splitext(wav_file)
      if filename_to_id(wav_file) not in exclude_ids:
        train_file_pairs.append((wav_file, root + '.mid'))

  train_output_name = os.path.join(FLAGS.output_dir,
                                   'maps_config2_train.tfrecord')
  with tf.python_io.TFRecordWriter(train_output_name) as writer:
    for idx, (wav_path, mid_path) in enumerate(train_file_pairs):
      print("{} of {}: {}".format(idx, len(train_file_pairs), wav_path))
      # load the wav data
      wav_data = tf.gfile.Open(wav_path, 'rb').read()
      samples = audio_io.wav_data_to_samples(wav_data, FLAGS.sample_rate)
      norm_samples = librosa.util.normalize(samples, norm=np.inf)
      # load the midi data and convert to a notesequence
      ns = midi_io.midi_file_to_note_sequence(mid_path)
      # Split points are computed on the normalized audio.
      splits = create_dataset_util.find_split_points(
          ns, norm_samples, FLAGS.sample_rate, FLAGS.min_length,
          FLAGS.max_length)

      velocities = [note.velocity for note in ns.notes]
      new_velocity_tuple = music_pb2.VelocityRange(
          min=np.min(velocities), max=np.max(velocities))

      for start, end in zip(splits[:-1], splits[1:]):
        if end - start < FLAGS.min_length:
          continue
        new_ns = sequences_lib.extract_subsequence(ns, start, end)
        # Chunks are cut from the un-normalized samples.
        first = int(start * FLAGS.sample_rate)
        last = first + int((end - start) * FLAGS.sample_rate)
        new_wav_data = audio_io.samples_to_wav_data(samples[first:last],
                                                    FLAGS.sample_rate)
        feature = {
            'id':
                tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[wav_path.encode()])),
            'sequence':
                tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[new_ns.SerializeToString()])),
            'audio':
                tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[new_wav_data])),
            'velocity_range':
                tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[new_velocity_tuple.SerializeToString()])),
        }
        example = tf.train.Example(
            features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())