def process_record(wav_data, ns, example_id, min_length=5, max_length=20,
                   sample_rate=16000, allow_empty_notesequence=False):
    """Chunk one (audio, NoteSequence) record and yield Example protos.

    Setting min_length=0 and max_length=-1 keeps the full-length audio and
    notesequence together as a single chunk.

    Args:
      wav_data: audio data in WAV format.
      ns: corresponding NoteSequence.
      example_id: id for the example proto.
      min_length: minimum length in seconds for audio chunks.
      max_length: maximum length in seconds for audio chunks.
      sample_rate: desired audio sample rate.
      allow_empty_notesequence: whether an empty NoteSequence is allowed.

    Yields:
      Example protos, one per emitted chunk.
    """
    samples = librosa.util.normalize(
        audio_io.wav_data_to_samples(wav_data, sample_rate), norm=np.inf)

    # Choose the cut points: a fixed-size grid, note-aware split points, or
    # no splitting at all (max_length <= 0 and != min_length).
    if max_length == min_length:
        splits = np.arange(0, ns.total_time, max_length)
    elif max_length > 0:
        splits = find_split_points(ns, samples, sample_rate, min_length,
                                   max_length)
    else:
        splits = [0, ns.total_time]

    # Velocity range is computed over the whole (un-split) sequence so every
    # chunk shares the same normalization reference.
    if ns.notes:
        note_velocities = [note.velocity for note in ns.notes]
        velocity_range = music_pb2.VelocityRange(
            min=min(note_velocities), max=max(note_velocities))
    else:
        velocity_range = music_pb2.VelocityRange(min=0, max=0)

    for chunk_start, chunk_end in zip(splits[:-1], splits[1:]):
        if chunk_end - chunk_start < min_length:
            continue

        covers_whole_record = chunk_start == 0 and chunk_end == ns.total_time
        if covers_whole_record:
            new_ns = ns
        else:
            new_ns = sequences_lib.extract_subsequence(ns, chunk_start,
                                                       chunk_end)

        if not new_ns.notes and not allow_empty_notesequence:
            tf.logging.warning('skipping empty sequence')
            continue

        if covers_whole_record:
            new_samples = samples
        else:
            # crop_samples avoids the slow resampling done by crop_wav_data;
            # the resample already happened once above.
            new_samples = audio_io.crop_samples(samples, sample_rate,
                                                chunk_start,
                                                chunk_end - chunk_start)
        new_wav_data = audio_io.samples_to_wav_data(new_samples, sample_rate)
        yield create_example(example_id, new_ns, new_wav_data,
                             velocity_range=velocity_range)
def split2batch(audio, sequence):
    """Cut an (audio, NoteSequence) pair into padded fixed-length chunks.

    Args:
      audio: 1-D sample array for the whole recording.
      sequence: the NoteSequence matching `audio`.

    Returns:
      A list of (chunk_audio, chunk_sequence) tuples, where each chunk's
      audio is zero-padded out to cfg.MAX_SPLIT_LENGTH seconds.
    """
    from magenta.models.onsets_frames_transcription.audio_label_data_utils import find_split_points

    def _pad_to(samples, target_len):
        # Right-pad `samples` with zeros up to `target_len`; no-op when the
        # array is already long enough.
        shortfall = target_len - samples.shape[0]
        if shortfall <= 0:
            return samples
        return np.concatenate(
            (samples, np.zeros(shortfall, dtype=samples.dtype)))

    # Extend the waveform if the notesequence outlasts the audio.
    audio = _pad_to(audio,
                    int(math.ceil(sequence.total_time * cfg.SAMPLE_RATE)))

    if cfg.MAX_SPLIT_LENGTH == 0:
        splits = [0, sequence.total_time]
    else:
        splits = find_split_points(sequence, audio, cfg.SAMPLE_RATE,
                                   cfg.MIN_SPLIT_LENGTH, cfg.MAX_SPLIT_LENGTH)

    batch = []
    chunk_target_len = int(math.ceil(cfg.MAX_SPLIT_LENGTH * cfg.SAMPLE_RATE))
    for start, end in zip(splits[:-1], splits[1:]):
        if end - start < cfg.MIN_SPLIT_LENGTH:
            continue
        if start == 0 and end == sequence.total_time:
            chunk_audio, chunk_seq = audio, sequence
        else:
            chunk_seq = sequences_lib.extract_subsequence(sequence, start, end)
            chunk_audio = audio_io.crop_samples(audio, cfg.SAMPLE_RATE, start,
                                                end - start)
        batch.append((_pad_to(chunk_audio, chunk_target_len), chunk_seq))
    return batch
def process_record(wav_data, ns, example_id, min_length=5, max_length=20,
                   sample_rate=16000, allow_empty_notesequence=False,
                   load_audio_with_librosa=False):
    """Split a record into chunks and create an example proto.

    To use the full length audio and notesequence, set min_length=0 and
    max_length=-1.

    Args:
      wav_data: audio data in WAV format.
      ns: corresponding NoteSequence.
      example_id: id for the example proto
      min_length: minimum length in seconds for audio chunks.
      max_length: maximum length in seconds for audio chunks.
      sample_rate: desired audio sample rate.
      allow_empty_notesequence: whether an empty NoteSequence is allowed.
      load_audio_with_librosa: Use librosa for sampling. Works with 24-bit
        wavs.

    Yields:
      Example protos.

    Raises:
      ValueError: if the audio would need more than 5 seconds of padding to
        match the notesequence's total time.
    """
    try:
        if load_audio_with_librosa:
            samples = audio_io.wav_data_to_samples_librosa(wav_data,
                                                           sample_rate)
        else:
            samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
    except audio_io.AudioIOReadError as e:
        # Bug fix: was print('Exception %s', e), which printed the literal
        # '%s' placeholder. tf.logging does lazy %-formatting of its args, so
        # this now renders the exception properly and goes to the log like
        # the other warnings in this function.
        tf.logging.warning('Exception %s', e)
        return
    samples = librosa.util.normalize(samples, norm=np.inf)

    # Add padding to samples if notesequence is longer.
    pad_to_samples = int(math.ceil(ns.total_time * sample_rate))
    padding_needed = pad_to_samples - samples.shape[0]
    if padding_needed > 5 * sample_rate:
        raise ValueError(
            'Would have padded {} more than 5 seconds to match note sequence '
            'total time. ({} original samples, {} sample rate, {} sample '
            'seconds, {} sequence seconds) This likely indicates a problem '
            'with the source data.'.format(
                example_id, samples.shape[0], sample_rate,
                samples.shape[0] / sample_rate, ns.total_time))
    samples = np.pad(samples, (0, max(0, padding_needed)), 'constant')

    # Choose the cut points: a fixed-size grid, note-aware split points, or
    # no splitting at all.
    if max_length == min_length:
        splits = np.arange(0, ns.total_time, max_length)
    elif max_length > 0:
        splits = find_split_points(ns, samples, sample_rate, min_length,
                                   max_length)
    else:
        splits = [0, ns.total_time]

    # Velocity range over the whole (un-split) sequence, shared by all chunks.
    velocities = [note.velocity for note in ns.notes]
    velocity_max = np.max(velocities) if velocities else 0
    velocity_min = np.min(velocities) if velocities else 0
    velocity_range = music_pb2.VelocityRange(min=velocity_min,
                                             max=velocity_max)

    for start, end in zip(splits[:-1], splits[1:]):
        if end - start < min_length:
            continue

        if start == 0 and end == ns.total_time:
            new_ns = ns
        else:
            new_ns = sequences_lib.extract_subsequence(ns, start, end)

        if not new_ns.notes and not allow_empty_notesequence:
            tf.logging.warning('skipping empty sequence')
            continue

        if start == 0 and end == ns.total_time:
            new_samples = samples
        else:
            # The resampling that happens in crop_wav_data is really slow,
            # and we've already done it once; avoid doing it twice.
            new_samples = audio_io.crop_samples(samples, sample_rate, start,
                                                end - start)
        new_wav_data = audio_io.samples_to_wav_data(new_samples, sample_rate)
        yield create_example(example_id, new_ns, new_wav_data,
                             velocity_range=velocity_range)