Example #1
 def testFloatWavDataToSamples(self):
     y = audio_io.wav_data_to_samples(self.wav_data, sample_rate=16000)
     wav_io = six.BytesIO()
     scipy.io.wavfile.write(wav_io, 16000, y)
     y_from_float = audio_io.wav_data_to_samples(wav_io.getvalue(),
                                                 sample_rate=16000)
     np.testing.assert_array_equal(y, y_from_float)
Example #2
 def testFloatWavDataToSamples(self):
   y = audio_io.wav_data_to_samples(self.wav_data, sample_rate=16000)
   wav_io = six.BytesIO()
   scipy.io.wavfile.write(wav_io, 16000, y)
   y_from_float = audio_io.wav_data_to_samples(
       wav_io.getvalue(), sample_rate=16000)
   np.testing.assert_array_equal(y, y_from_float)
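Examples #1 and #2 both round-trip float samples through an in-memory WAV file. A minimal standalone sketch of the same idea, assuming audio_io comes from the note_seq package (older Magenta code imports it as magenta.music.audio_io):

import io

import numpy as np
import scipy.io.wavfile
from note_seq import audio_io  # Assumed import path.

sample_rate = 16000
# One second of a 440 Hz sine wave as float32 samples.
samples = np.sin(
    2 * np.pi * 440 * np.arange(sample_rate) / sample_rate).astype(np.float32)

# Encode the float samples as in-memory WAV bytes, then decode them back.
wav_io = io.BytesIO()
scipy.io.wavfile.write(wav_io, sample_rate, samples)
roundtrip = audio_io.wav_data_to_samples(wav_io.getvalue(), sample_rate=sample_rate)

np.testing.assert_allclose(samples, roundtrip, atol=1e-4)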
Example #3
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    window_length = 2048  # Model specific constant.

    input_details, output_details = tflite_model.get_model_detail(
        FLAGS.model_path)
    input_wav_length = input_details[0]['shape'][0]
    output_roll_length = output_details[0]['shape'][1]
    assert (input_wav_length - window_length) % (output_roll_length - 1) == 0
    hop_size = (input_wav_length - window_length) // (output_roll_length - 1)

    overlap_timesteps = 4
    overlap_wav = hop_size * overlap_timesteps + window_length

    results = multiprocessing.Queue()

    results_thread = threading.Thread(target=result_collector,
                                      args=(results, ))
    results_thread.start()

    if FLAGS.wav_file:
        model = tflite_model.Model(model_path=FLAGS.model_path)

        wav_data = tf.gfile.Open(FLAGS.wav_file, 'rb').read()
        samples = audio_io.wav_data_to_samples(wav_data, MODEL_SAMPLE_RATE)
        samples = samples[:MODEL_SAMPLE_RATE * 10]  # Only the first 10 seconds
        samples = samples.reshape((-1, 1))
        samples_length = samples.shape[0]
        # Extend samples with zeros
        samples = np.pad(samples, (0, input_wav_length), mode='constant')
        for i, pos in enumerate(
                range(0, samples_length - input_wav_length + overlap_wav,
                      input_wav_length - overlap_wav)):
            chunk = samples[pos:pos + input_wav_length]
            task = OnsetsTask(AudioChunk(i, chunk))
            task(model)
            results.put(task)
    else:
        tasks = multiprocessing.JoinableQueue()

        ## Make and start the workers
        num_workers = 4
        workers = [
            TfLiteWorker(FLAGS.model_path, tasks, results)
            for i in range(num_workers)
        ]
        for w in workers:
            w.start()

        audio_feeder = AudioQueue(
            callback=lambda audio_chunk: tasks.put(OnsetsTask(audio_chunk)),
            audio_device_index=FLAGS.mic
            if FLAGS.mic is None else int(FLAGS.mic),
            sample_rate_hz=int(FLAGS.sample_rate_hz),
            frame_length=input_wav_length,
            overlap=overlap_wav)

        audio_feeder.start()
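The hop-size arithmetic in main can be sanity-checked with made-up shapes; the numbers below are illustrative and are not the real TFLite model's:

# Hypothetical model shapes, chosen so the arithmetic works out evenly.
window_length = 2048       # Model-specific constant from the snippet above.
input_wav_length = 17920   # Samples consumed per invocation (made up).
output_roll_length = 32    # Piano-roll timesteps emitted per invocation (made up).

assert (input_wav_length - window_length) % (output_roll_length - 1) == 0
hop_size = (input_wav_length - window_length) // (output_roll_length - 1)  # 512

overlap_timesteps = 4
overlap_wav = hop_size * overlap_timesteps + window_length  # 4096 samples of overlap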
Example #4
def process_record(wav_data,
                   ns,
                   example_id,
                   min_length=5,
                   max_length=20,
                   sample_rate=16000,
                   allow_empty_notesequence=False):
  """Split a record into chunks and create an example proto.

  To use the full length audio and notesequence, set min_length=0 and
  max_length=-1.

  Args:
    wav_data: audio data in WAV format.
    ns: corresponding NoteSequence.
    example_id: id for the example proto
    min_length: minimum length in seconds for audio chunks.
    max_length: maximum length in seconds for audio chunks.
    sample_rate: desired audio sample rate.
    allow_empty_notesequence: whether an empty NoteSequence is allowed.

  Yields:
    Example protos.
  """
  samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
  samples = librosa.util.normalize(samples, norm=np.inf)
  if max_length == min_length:
    splits = np.arange(0, ns.total_time, max_length)
  elif max_length > 0:
    splits = find_split_points(ns, samples, sample_rate, min_length, max_length)
  else:
    splits = [0, ns.total_time]
  velocities = [note.velocity for note in ns.notes]
  velocity_max = np.max(velocities) if velocities else 0
  velocity_min = np.min(velocities) if velocities else 0
  velocity_range = music_pb2.VelocityRange(min=velocity_min, max=velocity_max)

  for start, end in zip(splits[:-1], splits[1:]):
    if end - start < min_length:
      continue

    if start == 0 and end == ns.total_time:
      new_ns = ns
    else:
      new_ns = sequences_lib.extract_subsequence(ns, start, end)

    if not new_ns.notes and not allow_empty_notesequence:
      tf.logging.warning('skipping empty sequence')
      continue

    if start == 0 and end == ns.total_time:
      new_samples = samples
    else:
      # The resampling that happens in crop_wav_data is really slow, and we've
      # already done it once; avoid doing it twice.
      new_samples = audio_io.crop_samples(samples, sample_rate, start,
                                          end - start)
    new_wav_data = audio_io.samples_to_wav_data(new_samples, sample_rate)
    yield create_example(
        example_id, new_ns, new_wav_data, velocity_range=velocity_range)
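A minimal sketch of driving process_record end to end and writing the chunked Example protos to a TFRecord. The import paths and file names are assumptions (Example #15 calls the same function as audio_label_data_utils.process_record), and the TF1-style APIs mirror the other snippets on this page:

import tensorflow as tf
from note_seq import midi_io  # Assumed import path.
from magenta.models.onsets_frames_transcription import audio_label_data_utils  # Assumed path.

wav_data = tf.gfile.Open('piece.wav', 'rb').read()     # Hypothetical input files.
ns = midi_io.midi_file_to_note_sequence('piece.mid')

with tf.python_io.TFRecordWriter('piece.tfrecord') as writer:
  for example in audio_label_data_utils.process_record(
      wav_data, ns, 'piece', min_length=5, max_length=20, sample_rate=16000):
    writer.write(example.SerializeToString())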
Example #5
  def testWavDataToSamples(self):
    w = wave.open(self.wav_filename, 'rb')
    w_mono = wave.open(self.wav_filename_mono, 'rb')

    # Check content size.
    y = audio_io.wav_data_to_samples(self.wav_data, sample_rate=16000)
    y_mono = audio_io.wav_data_to_samples(self.wav_data_mono, sample_rate=22050)
    self.assertEquals(
        round(16000.0 * w.getnframes() / w.getframerate()), y.shape[0])
    self.assertEquals(
        round(22050.0 * w_mono.getnframes() / w_mono.getframerate()),
        y_mono.shape[0])

    # Check a few obvious failure modes.
    self.assertLess(0.01, y.std())
    self.assertLess(0.01, y_mono.std())
    self.assertGreater(-0.1, y.min())
    self.assertGreater(-0.1, y_mono.min())
    self.assertLess(0.1, y.max())
    self.assertLess(0.1, y_mono.max())
Example #6
  def testWavDataToSamples(self):
    w = wave.open(self.wav_filename, 'rb')
    w_mono = wave.open(self.wav_filename_mono, 'rb')

    # Check content size.
    y = audio_io.wav_data_to_samples(self.wav_data, sample_rate=16000)
    y_mono = audio_io.wav_data_to_samples(self.wav_data_mono, sample_rate=22050)
    self.assertEquals(
        round(16000.0 * w.getnframes() / w.getframerate()), y.shape[0])
    self.assertEquals(
        round(22050.0 * w_mono.getnframes() / w_mono.getframerate()),
        y_mono.shape[0])

    # Check a few obvious failure modes.
    self.assertLess(0.01, y.std())
    self.assertLess(0.01, y_mono.std())
    self.assertGreater(-0.1, y.min())
    self.assertGreater(-0.1, y_mono.min())
    self.assertLess(0.1, y.max())
    self.assertLess(0.1, y_mono.max())
Example #7
def generate_train_set():
  """Generate the train TFRecord."""
  train_file_pairs = []
  for directory in train_dirs:
    path = os.path.join(FLAGS.input_dir, directory)
    path = os.path.join(path, '*.wav')
    wav_files = glob.glob(path)
    # find matching mid files
    for wav_file in wav_files:
      base_name_root, _ = os.path.splitext(wav_file)
      mid_file = base_name_root + '.mid'
      train_file_pairs.append((wav_file, mid_file))

  train_output_name = os.path.join(FLAGS.output_dir,
                                   'maps_config2_train.tfrecord')

  with tf.python_io.TFRecordWriter(train_output_name) as writer:
    for pair in train_file_pairs:
      print(pair)
      # load the wav data
      wav_data = tf.gfile.Open(pair[0]).read()
      samples = audio_io.wav_data_to_samples(wav_data, FLAGS.sample_rate)

      # load the midi data and convert to a notesequence
      midi_data = tf.gfile.Open(pair[1]).read()
      ns = midi_io.midi_to_sequence_proto(midi_data)

      splits = find_split_points(ns, samples, FLAGS.sample_rate,
                                 FLAGS.min_length, FLAGS.max_length)

      for start, end in zip(splits[:-1], splits[1:]):
        if end - start < FLAGS.min_length:
          continue

        new_ns = sequences_lib.extract_subsequence(ns, start, end)
        new_wav_data = audio_io.crop_wav_data(wav_data, FLAGS.sample_rate,
                                              start, end - start)
        example = tf.train.Example(features=tf.train.Features(feature={
            'id':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[pair[0]]
                )),
            'sequence':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[new_ns.SerializeToString()]
                )),
            'audio':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[new_wav_data]
                ))

            }))
        writer.write(example.SerializeToString())
Example #8
def generate_train_set():
    """Generate the train TFRecord."""
    train_file_pairs = []
    for directory in train_dirs:
        path = os.path.join(FLAGS.input_dir, directory)
        path = os.path.join(path, '*.wav')
        wav_files = glob.glob(path)
        # find matching mid files
        for wav_file in wav_files:
            base_name_root, _ = os.path.splitext(wav_file)
            mid_file = base_name_root + '.mid'
            train_file_pairs.append((wav_file, mid_file))

    train_output_name = os.path.join(FLAGS.output_dir,
                                     'maps_config2_train.tfrecord')

    with tf.python_io.TFRecordWriter(train_output_name) as writer:
        for pair in train_file_pairs:
            print(pair)
            # load the wav data
            wav_data = tf.gfile.Open(pair[0]).read()
            samples = audio_io.wav_data_to_samples(wav_data, FLAGS.sample_rate)

            # load the midi data and convert to a notesequence
            midi_data = tf.gfile.Open(pair[1]).read()
            ns = midi_io.midi_to_sequence_proto(midi_data)

            splits = find_split_points(ns, samples, FLAGS.sample_rate,
                                       FLAGS.min_length, FLAGS.max_length)

            for start, end in zip(splits[:-1], splits[1:]):
                if end - start < FLAGS.min_length:
                    continue

                new_ns = sequences_lib.extract_subsequence(ns, start, end)
                new_wav_data = audio_io.crop_wav_data(wav_data,
                                                      FLAGS.sample_rate, start,
                                                      end - start)
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'id':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[pair[0]])),
                        'sequence':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[new_ns.SerializeToString()])),
                        'audio':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[new_wav_data]))
                    }))
                writer.write(example.SerializeToString())
Example #9
def _wav_to_mel(wav_audio, hparams):
    """Transforms the contents of a wav file into a series of mel spec frames."""
    y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

    mel = librosa.feature.melspectrogram(y,
                                         hparams.sample_rate,
                                         hop_length=hparams.spec_hop_length,
                                         fmin=hparams.spec_fmin,
                                         n_mels=hparams.spec_n_bins).astype(
                                             np.float32)

    # Transpose so that the data is in [frame, bins] format.
    mel = mel.T
    return mel
Example #10
def _wav_to_mel(wav_audio, hparams):
  """Transforms the contents of a wav file into a series of mel spec frames."""
  y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

  mel = librosa.feature.melspectrogram(
      y,
      hparams.sample_rate,
      hop_length=hparams.spec_hop_length,
      fmin=hparams.spec_fmin,
      n_mels=hparams.spec_n_bins).astype(np.float32)

  # Transpose so that the data is in [frame, bins] format.
  mel = mel.T
  return mel
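In both the mel and CQT helpers, spec_hop_length fixes the time resolution: each output frame advances by hop_length samples, so the frame rate is sample_rate / spec_hop_length. A quick numeric sketch with hypothetical hparam values:

# Hypothetical values: 16 kHz audio with a 512-sample hop.
sample_rate = 16000
spec_hop_length = 512
frames_per_second = sample_rate / spec_hop_length  # 31.25

# With librosa's default centering, a 10-second clip yields about this many frames.
n_samples = 10 * sample_rate
n_frames = n_samples // spec_hop_length + 1  # 313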
Example #11
def _wav_to_cqt(wav_audio, hparams):
    """Transforms the contents of a wav file into a series of CQT frames."""
    y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

    cqt = np.abs(librosa.core.cqt(y,
                                  hparams.sample_rate,
                                  hop_length=hparams.spec_hop_length,
                                  fmin=hparams.spec_fmin,
                                  n_bins=hparams.spec_n_bins,
                                  bins_per_octave=hparams.cqt_bins_per_octave),
                 dtype=np.float32)

    # Transpose so that the data is in [frame, bins] format.
    cqt = cqt.T
    return cqt
Example #12
def _wav_to_framed_samples(wav_audio, hparams):
  """Transforms the contents of a wav file into a series of framed samples."""
  y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

  hl = hparams.spec_hop_length
  n_frames = int(np.ceil(y.shape[0] / hl))
  frames = np.zeros((n_frames, hl), dtype=np.float32)

  # Fill in everything but the last frame which may not be the full length
  cutoff = (n_frames - 1) * hl
  frames[:n_frames - 1, :] = np.reshape(y[:cutoff], (n_frames - 1, hl))
  # Fill the last frame
  remain_len = len(y[cutoff:])
  frames[n_frames - 1, :remain_len] = y[cutoff:]

  return frames
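To see what the framing logic produces, here is a tiny sketch with a signal that does not divide evenly into hops (the hop length is made up):

import numpy as np

hl = 4                                   # Hypothetical hop length.
y = np.arange(10, dtype=np.float32)      # 10 samples -> 3 frames of length 4.

n_frames = int(np.ceil(y.shape[0] / hl))               # ceil(10 / 4) = 3
frames = np.zeros((n_frames, hl), dtype=np.float32)

cutoff = (n_frames - 1) * hl                            # First 8 samples fill two full frames.
frames[:n_frames - 1, :] = y[:cutoff].reshape(n_frames - 1, hl)
frames[n_frames - 1, :len(y[cutoff:])] = y[cutoff:]     # Last frame is zero-padded.

# frames == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 0, 0]]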
Example #13
def _wav_to_framed_samples(wav_audio, hparams):
  """Transforms the contents of a wav file into a series of framed samples."""
  y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

  hl = hparams.spec_hop_length
  n_frames = int(np.ceil(y.shape[0] / hl))
  frames = np.zeros((n_frames, hl), dtype=np.float32)

  # Fill in everything but the last frame which may not be the full length
  cutoff = (n_frames - 1) * hl
  frames[:n_frames - 1, :] = np.reshape(y[:cutoff], (n_frames - 1, hl))
  # Fill the last frame
  remain_len = len(y[cutoff:])
  frames[n_frames - 1, :remain_len] = y[cutoff:]

  return frames
Example #14
def _wav_to_cqt(wav_audio, hparams):
  """Transforms the contents of a wav file into a series of CQT frames."""
  y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

  cqt = np.abs(
      librosa.core.cqt(
          y,
          hparams.sample_rate,
          hop_length=hparams.spec_hop_length,
          fmin=hparams.spec_fmin,
          n_bins=hparams.spec_n_bins,
          bins_per_octave=hparams.cqt_bins_per_octave),
      dtype=np.float32)

  # Transpose so that the data is in [frame, bins] format.
  cqt = cqt.T
  return cqt
Example #15
def mix_examples(mixid_exs, sample_rate, load_audio_with_librosa):
    """Mix several Examples together to create a new example."""
    mixid, exs = mixid_exs
    del mixid

    example_samples = []
    example_sequences = []

    for ex_str in exs:
        ex = tf.train.Example.FromString(ex_str)
        wav_data = ex.features.feature['audio'].bytes_list.value[0]
        if load_audio_with_librosa:
            samples = audio_io.wav_data_to_samples_librosa(
                wav_data, sample_rate)
        else:
            samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
        example_samples.append(samples)
        ns = music_pb2.NoteSequence.FromString(
            ex.features.feature['sequence'].bytes_list.value[0])
        example_sequences.append(ns)

    mixed_samples, mixed_sequence = audio_label_data_utils.mix_sequences(
        individual_samples=example_samples,
        sample_rate=sample_rate,
        individual_sequences=example_sequences)

    mixed_wav_data = audio_io.samples_to_wav_data(mixed_samples, sample_rate)

    mixed_id = '::'.join(['mixed'] + [ns.id for ns in example_sequences])
    mixed_sequence.id = mixed_id
    mixed_filename = '::'.join(['mixed'] +
                               [ns.filename for ns in example_sequences])
    mixed_sequence.filename = mixed_filename

    examples = list(
        audio_label_data_utils.process_record(mixed_wav_data,
                                              mixed_sequence,
                                              mixed_id,
                                              min_length=0,
                                              max_length=-1,
                                              sample_rate=sample_rate))
    assert len(examples) == 1
    return examples[0]
Example #16
def create_timbre_spectrogram(audio, hparams):
    """Create either a CQT or mel spectrogram"""
    if tf.is_tensor(audio):
        audio = audio.numpy()
    if isinstance(audio, bytes):
        # Get samples from wav data.
        samples = audio_io.wav_data_to_samples(audio, hparams.sample_rate)
    else:
        samples = audio

    if hparams.timbre_spec_type == 'mel':
        spec = np.abs(
            librosa.feature.melspectrogram(
                samples,
                hparams.sample_rate,
                hop_length=hparams.timbre_hop_length,
                fmin=librosa.midi_to_hz(constants.MIN_TIMBRE_PITCH),
                fmax=librosa.midi_to_hz(constants.MAX_TIMBRE_PITCH),
                n_mels=constants.TIMBRE_SPEC_BANDS,
                pad_mode='symmetric',
                htk=hparams.spec_mel_htk,
                power=2)).T

    else:
        spec = np.abs(
            librosa.core.cqt(samples,
                             hparams.sample_rate,
                             hop_length=hparams.timbre_hop_length,
                             fmin=librosa.midi_to_hz(
                                 constants.MIN_TIMBRE_PITCH),
                             n_bins=constants.TIMBRE_SPEC_BANDS,
                             bins_per_octave=constants.BINS_PER_OCTAVE,
                             pad_mode='symmetric')).T

    # Convert power to decibels and normalize.
    if hparams.timbre_spec_log_amplitude:
        spec = librosa.power_to_db(spec) - librosa.power_to_db(
            np.array([1e-9]))[0]
        spec = spec / np.max(spec)
    return spec
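The log-amplitude branch above converts power to decibels, shifts the scale so that a power of 1e-9 sits near 0 dB, and divides by the maximum, which pushes typical spectrogram values toward [0, 1]. A quick numeric check (values chosen to stay inside librosa's default top_db window):

import numpy as np
import librosa

power = np.array([1e-6, 1e-3, 1.0])
db = librosa.power_to_db(power) - librosa.power_to_db(np.array([1e-9]))[0]
print(db)             # [30. 60. 90.]
print(db / db.max())  # [0.333... 0.666... 1.]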
Example #17
    def testSplitAudioLabelData(self):
        wav_data, sequence = self._CreateSyntheticExample()
        records = audio_label_data_utils.process_record(
            wav_data, sequence, 'test', sample_rate=SAMPLE_RATE)

        for record in records:
            audio = record.features.feature['audio'].bytes_list.value[0]
            velocity_range = music_pb2.VelocityRange.FromString(
                record.features.feature['velocity_range'].bytes_list.value[0])
            note_sequence = music_pb2.NoteSequence.FromString(
                record.features.feature['sequence'].bytes_list.value[0])

            expected_samples = np.zeros(10 * SAMPLE_RATE)
            np.testing.assert_array_equal(
                expected_samples,
                audio_io.wav_data_to_samples(audio, sample_rate=SAMPLE_RATE))
            self.assertEqual(velocity_range.min, 20)
            self.assertEqual(velocity_range.max, 80)
            self.assertEqual(note_sequence.notes[0].velocity, 20)
            self.assertEqual(note_sequence.notes[0].end_time, 5.)
            self.assertEqual(note_sequence.notes[1].velocity, 80)
            self.assertEqual(note_sequence.notes[1].end_time, 10.)
Example #18
  def testSplitAudioLabelData(self):
    wav_data, sequence = self._CreateSyntheticExample()
    records = split_audio_and_label_data.process_record(
        wav_data, sequence, 'test', sample_rate=SAMPLE_RATE)

    for record in records:
      audio = record.features.feature['audio'].bytes_list.value[0]
      velocity_range = music_pb2.VelocityRange.FromString(
          record.features.feature['velocity_range'].bytes_list.value[0])
      note_sequence = music_pb2.NoteSequence.FromString(
          record.features.feature['sequence'].bytes_list.value[0])

      self.assertEqual(
          np.all(
              audio_io.wav_data_to_samples(audio, sample_rate=SAMPLE_RATE) ==
              np.zeros(2 * SAMPLE_RATE)), True)
      self.assertEqual(velocity_range.min, 20)
      self.assertEqual(velocity_range.max, 80)
      self.assertEqual(note_sequence.notes[0].velocity, 20)
      self.assertEqual(note_sequence.notes[0].end_time, 5.)
      self.assertEqual(note_sequence.notes[1].velocity, 80)
      self.assertEqual(note_sequence.notes[1].end_time, 10.)
Example #19
def process_record(wav_data,
                   ns,
                   example_id,
                   min_length=5,
                   max_length=20,
                   sample_rate=16000,
                   allow_empty_notesequence=False,
                   load_audio_with_librosa=False):
    """Split a record into chunks and create an example proto.

  To use the full length audio and notesequence, set min_length=0 and
  max_length=-1.

  Args:
    wav_data: audio data in WAV format.
    ns: corresponding NoteSequence.
    example_id: id for the example proto
    min_length: minimum length in seconds for audio chunks.
    max_length: maximum length in seconds for audio chunks.
    sample_rate: desired audio sample rate.
    allow_empty_notesequence: whether an empty NoteSequence is allowed.
    load_audio_with_librosa: Use librosa for sampling. Works with 24-bit wavs.

  Yields:
    Example protos.
  """
    try:
        if load_audio_with_librosa:
            samples = audio_io.wav_data_to_samples_librosa(
                wav_data, sample_rate)
        else:
            samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
    except audio_io.AudioIOReadError as e:
        print('Exception %s', e)
        return
    samples = librosa.util.normalize(samples, norm=np.inf)

    # Add padding to samples if notesequence is longer.
    pad_to_samples = int(math.ceil(ns.total_time * sample_rate))
    padding_needed = pad_to_samples - samples.shape[0]
    if padding_needed > 5 * sample_rate:
        raise ValueError(
            'Would have padded {} more than 5 seconds to match note sequence total '
            'time. ({} original samples, {} sample rate, {} sample seconds, '
            '{} sequence seconds) This likely indicates a problem with the source '
            'data.'.format(example_id, samples.shape[0], sample_rate,
                           samples.shape[0] / sample_rate, ns.total_time))
    samples = np.pad(samples, (0, max(0, padding_needed)), 'constant')

    if max_length == min_length:
        splits = np.arange(0, ns.total_time, max_length)
    elif max_length > 0:
        splits = find_split_points(ns, samples, sample_rate, min_length,
                                   max_length)
    else:
        splits = [0, ns.total_time]
    velocities = [note.velocity for note in ns.notes]
    velocity_max = np.max(velocities) if velocities else 0
    velocity_min = np.min(velocities) if velocities else 0
    velocity_range = music_pb2.VelocityRange(min=velocity_min,
                                             max=velocity_max)

    for start, end in zip(splits[:-1], splits[1:]):
        if end - start < min_length:
            continue

        if start == 0 and end == ns.total_time:
            new_ns = ns
        else:
            new_ns = sequences_lib.extract_subsequence(ns, start, end)

        if not new_ns.notes and not allow_empty_notesequence:
            tf.logging.warning('skipping empty sequence')
            continue

        if start == 0 and end == ns.total_time:
            new_samples = samples
        else:
            # The resampling that happens in crop_wav_data is really slow, and
            # we've already done it once; avoid doing it twice.
            new_samples = audio_io.crop_samples(samples, sample_rate, start,
                                                end - start)
        new_wav_data = audio_io.samples_to_wav_data(new_samples, sample_rate)
        yield create_example(example_id,
                             new_ns,
                             new_wav_data,
                             velocity_range=velocity_range)
Example #20
def _wav_to_cqt(wav_audio, hparams):
    """Transforms the contents of a wav file into a series of CQT frames."""
    y = audio_io.wav_data_to_samples(wav_audio, hparams.sample_rate)

    return samples_to_cqt(y, hparams)
Example #21
def generate_train_set(exclude_ids):
  """Generate the train TFRecord."""
  train_file_pairs = []
  for directory in train_dirs:
    path = os.path.join(FLAGS.input_dir, directory)
    path = os.path.join(path, '*.wav')
    wav_files = glob.glob(path)
    # find matching mid files
    for wav_file in wav_files:
      base_name_root, _ = os.path.splitext(wav_file)
      mid_file = base_name_root + '.mid'
      if filename_to_id(wav_file) not in exclude_ids:
        train_file_pairs.append((wav_file, mid_file))

  train_output_name = os.path.join(FLAGS.output_dir,
                                   'maps_config2_train.tfrecord')

  with tf.python_io.TFRecordWriter(train_output_name) as writer:
    for pair in train_file_pairs:
      print(pair)
      # load the wav data
      wav_data = tf.gfile.Open(pair[0], 'rb').read()
      samples = audio_io.wav_data_to_samples(wav_data, FLAGS.sample_rate)
      samples = librosa.util.normalize(samples, norm=np.inf)

      # load the midi data and convert to a notesequence
      ns = midi_io.midi_file_to_note_sequence(pair[1])

      splits = find_split_points(ns, samples, FLAGS.sample_rate,
                                 FLAGS.min_length, FLAGS.max_length)

      velocities = [note.velocity for note in ns.notes]
      velocity_max = np.max(velocities)
      velocity_min = np.min(velocities)
      new_velocity_tuple = music_pb2.VelocityRange(
          min=velocity_min, max=velocity_max)

      for start, end in zip(splits[:-1], splits[1:]):
        if end - start < FLAGS.min_length:
          continue

        new_ns = sequences_lib.extract_subsequence(ns, start, end)
        new_wav_data = audio_io.crop_wav_data(wav_data, FLAGS.sample_rate,
                                              start, end - start)
        example = tf.train.Example(features=tf.train.Features(feature={
            'id':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[pair[0]]
                )),
            'sequence':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[new_ns.SerializeToString()]
                )),
            'audio':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[new_wav_data]
                )),
            'velocity_range':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[new_velocity_tuple.SerializeToString()]
                )),
            }))
        writer.write(example.SerializeToString())
Example #22
def generate_train_set(exclude_ids):
    """Generate the train TFRecord."""
    train_file_pairs = []
    for directory in train_dirs:
        path = os.path.join(FLAGS.input_dir, directory)
        path = os.path.join(path, '*.wav')
        wav_files = glob.glob(path)
        # find matching mid files
        for wav_file in wav_files:
            base_name_root, _ = os.path.splitext(wav_file)
            mid_file = base_name_root + '.mid'
            if filename_to_id(wav_file) not in exclude_ids:
                train_file_pairs.append((wav_file, mid_file))

    train_output_name = os.path.join(FLAGS.output_dir,
                                     'maps_config2_train.tfrecord')

    with tf.python_io.TFRecordWriter(train_output_name) as writer:
        for pair in train_file_pairs:
            print(pair)
            # load the wav data
            wav_data = tf.gfile.Open(pair[0], 'rb').read()
            samples = audio_io.wav_data_to_samples(wav_data, FLAGS.sample_rate)
            samples = librosa.util.normalize(samples, norm=np.inf)

            # load the midi data and convert to a notesequence
            ns = midi_io.midi_file_to_note_sequence(pair[1])

            splits = find_split_points(ns, samples, FLAGS.sample_rate,
                                       FLAGS.min_length, FLAGS.max_length)

            velocities = [note.velocity for note in ns.notes]
            velocity_max = np.max(velocities)
            velocity_min = np.min(velocities)
            new_velocity_tuple = music_pb2.VelocityRange(min=velocity_min,
                                                         max=velocity_max)

            for start, end in zip(splits[:-1], splits[1:]):
                if end - start < FLAGS.min_length:
                    continue

                new_ns = sequences_lib.extract_subsequence(ns, start, end)
                new_wav_data = audio_io.crop_wav_data(wav_data,
                                                      FLAGS.sample_rate, start,
                                                      end - start)
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'id':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[pair[0]])),
                        'sequence':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[new_ns.SerializeToString()])),
                        'audio':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[new_wav_data])),
                        'velocity_range':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[new_velocity_tuple.SerializeToString()])),
                    }))
                writer.write(example.SerializeToString())
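The TFRecords written by these generate_train_set variants can be read back and unpacked the same way Example #15 parses its inputs; a minimal sketch (the file name matches the snippet above, the sample rate is hypothetical, and the note_seq import paths are assumptions):

import tensorflow as tf
from note_seq import audio_io
from note_seq.protobuf import music_pb2

for record in tf.python_io.tf_record_iterator('maps_config2_train.tfrecord'):
  ex = tf.train.Example.FromString(record)
  wav_data = ex.features.feature['audio'].bytes_list.value[0]
  ns = music_pb2.NoteSequence.FromString(
      ex.features.feature['sequence'].bytes_list.value[0])
  velocity_range = music_pb2.VelocityRange.FromString(
      ex.features.feature['velocity_range'].bytes_list.value[0])
  samples = audio_io.wav_data_to_samples(wav_data, 16000)  # Hypothetical sample rate.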