def process_record(wav_data,
                   ns,
                   example_id,
                   min_length=5,
                   max_length=20,
                   sample_rate=16000,
                   allow_empty_notesequence=False):
  """Split a record into chunks and create an example proto.

  To use the full length audio and notesequence, set min_length=0 and
  max_length=-1.

  Args:
    wav_data: audio data in WAV format.
    ns: corresponding NoteSequence.
    example_id: id for the example proto.
    min_length: minimum length in seconds for audio chunks.
    max_length: maximum length in seconds for audio chunks.
    sample_rate: desired audio sample rate.
    allow_empty_notesequence: whether an empty NoteSequence is allowed.

  Yields:
    Example protos.
  """
  samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
  samples = librosa.util.normalize(samples, norm=np.inf)
  if max_length == min_length:
    splits = np.arange(0, ns.total_time, max_length)
  elif max_length > 0:
    splits = find_split_points(ns, samples, sample_rate, min_length, max_length)
  else:
    splits = [0, ns.total_time]
  velocities = [note.velocity for note in ns.notes]
  velocity_max = np.max(velocities) if velocities else 0
  velocity_min = np.min(velocities) if velocities else 0
  velocity_range = music_pb2.VelocityRange(min=velocity_min, max=velocity_max)

  for start, end in zip(splits[:-1], splits[1:]):
    if end - start < min_length:
      continue

    if start == 0 and end == ns.total_time:
      new_ns = ns
    else:
      new_ns = sequences_lib.extract_subsequence(ns, start, end)

    if not new_ns.notes and not allow_empty_notesequence:
      tf.logging.warning('skipping empty sequence')
      continue

    if start == 0 and end == ns.total_time:
      new_samples = samples
    else:
      # The resampling that happens in crop_wav_data is really slow, and
      # we've already done it once; avoid doing it twice.
      new_samples = audio_io.crop_samples(samples, sample_rate, start,
                                          end - start)
    new_wav_data = audio_io.samples_to_wav_data(new_samples, sample_rate)
    yield create_example(
        example_id, new_ns, new_wav_data, velocity_range=velocity_range)
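
A quick usage sketch for process_record (hedged: the file names and the in-scope midi_io import are assumptions, not part of the snippet above):

# Hypothetical driver; 'example.wav' and 'example.mid' are stand-in paths.
wav_bytes = open('example.wav', 'rb').read()
ns = midi_io.midi_file_to_note_sequence('example.mid')

# Default arguments chunk the record into 5-20 second Examples.
chunks = list(process_record(wav_bytes, ns, 'example-id'))

# min_length=0, max_length=-1 keeps the full length: exactly one Example,
# assuming ns has notes.
full = list(process_record(wav_bytes, ns, 'example-id',
                           min_length=0, max_length=-1))
assert len(full) == 1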
def generate_test_set():
  """Generate the test TFRecord."""
  test_file_pairs = []
  for directory in test_dirs:
    path = os.path.join(FLAGS.input_dir, directory)
    path = os.path.join(path, '*.wav')
    wav_files = glob.glob(path)
    # find matching mid files
    for wav_file in wav_files:
      base_name_root, _ = os.path.splitext(wav_file)
      mid_file = base_name_root + '.mid'
      test_file_pairs.append((wav_file, mid_file))

  test_output_name = os.path.join(FLAGS.output_dir,
                                  'maps_config2_test.tfrecord')

  with tf.python_io.TFRecordWriter(test_output_name) as writer:
    for idx, pair in enumerate(test_file_pairs):
      print('{} of {}: {}'.format(idx, len(test_file_pairs), pair[0]))
      # load the wav data and resample it.
      samples = audio_io.load_audio(pair[0], FLAGS.sample_rate)
      wav_data = audio_io.samples_to_wav_data(samples, FLAGS.sample_rate)

      # load the midi data and convert to a notesequence
      ns = midi_io.midi_file_to_note_sequence(pair[1])

      example = split_audio_and_label_data.create_example(pair[0], ns, wav_data)
      writer.write(example.SerializeToString())

  return [filename_to_id(wav) for wav, _ in test_file_pairs]
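
filename_to_id is defined elsewhere in this script; a minimal sketch consistent with how it is used here (an assumption, not the original implementation) would derive a stable id shared by matching .wav/.mid pairs:

def filename_to_id(filename):
  # Assumed behavior: strip the directory and the extension so that
  # 'foo/bar.wav' and 'foo/bar.mid' both map to 'bar'.
  return os.path.splitext(os.path.basename(filename))[0]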
Example n. 3
  def _ValidateProvideBatchMemory(self,
                                  truncated_length,
                                  batch_size,
                                  lengths,
                                  expected_num_inputs):
    hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
    examples = []
    expected_inputs = []

    for i, length in enumerate(lengths):
      wav_samples = np.zeros(
          (int((length / data.hparams_frames_per_second(hparams)) *
               hparams.sample_rate), 1), np.float32)
      wav_data = audio_io.samples_to_wav_data(wav_samples, hparams.sample_rate)

      num_frames = data.wav_to_num_frames(
          wav_data, frames_per_second=data.hparams_frames_per_second(hparams))

      seq = self._SyntheticSequence(
          num_frames / data.hparams_frames_per_second(hparams),
          i + constants.MIN_MIDI_PITCH)

      examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
      expected_inputs += self._ExampleToInputs(
          examples[-1],
          truncated_length)
    self.assertEqual(expected_num_inputs, len(expected_inputs))

    self._ValidateProvideBatch(
        [e.SerializeToString() for e in examples],
        truncated_length,
        batch_size,
        expected_inputs)
Example n. 4
  def _CreateExamplesAndExpectedInputs(self,
                                       truncated_length,
                                       lengths,
                                       expected_num_inputs):
    hparams = copy.deepcopy(configs.DEFAULT_HPARAMS)
    examples = []
    expected_inputs = []

    for i, length in enumerate(lengths):
      wav_samples = np.zeros(
          (int((length / data.hparams_frames_per_second(hparams)) *
               hparams.sample_rate), 1), np.float32)
      wav_data = audio_io.samples_to_wav_data(wav_samples, hparams.sample_rate)

      num_frames = data.wav_to_num_frames(
          wav_data, frames_per_second=data.hparams_frames_per_second(hparams))

      seq = self._SyntheticSequence(
          num_frames / data.hparams_frames_per_second(hparams),
          i + constants.MIN_MIDI_PITCH)

      examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
      expected_inputs += self._ExampleToInputs(
          examples[-1],
          truncated_length)
    self.assertEqual(expected_num_inputs, len(expected_inputs))
    return examples, expected_inputs
def create_example(filename, hparams):
  """Processes an audio file into an Example proto."""
  wav_data = librosa.core.load(filename, sr=hparams.sample_rate)[0]
  if hparams.normalize_audio:
    # normalize_wav_data returns a new array; the result must be reassigned.
    wav_data = audio_io.normalize_wav_data(wav_data, hparams.sample_rate)
  wav_data = audio_io.samples_to_wav_data(wav_data, hparams.sample_rate)

  example = tf.train.Example(features=tf.train.Features(feature={
      'id':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[filename.encode('utf-8')]
          )),
      'sequence':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[music_pb2.NoteSequence().SerializeToString()]
          )),
      'audio':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[wav_data]
          )),
      'velocity_range':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[music_pb2.VelocityRange().SerializeToString()]
          )),
  }))

  return example.SerializeToString()
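
Because create_example here returns the serialized bytes rather than the Example object, a quick sanity check is to parse them back with the standard protobuf API (hparams is assumed to be in scope):

# Round-trip the serialized proto and inspect its features.
serialized = create_example('test.wav', hparams)
parsed = tf.train.Example.FromString(serialized)
print(parsed.features.feature['id'].bytes_list.value[0])  # b'test.wav'
audio_bytes = parsed.features.feature['audio'].bytes_list.value[0]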
Example n. 6
    def create_example(self, filename, hparams):
        """Processes an audio file into an Example proto."""
        wav_data = librosa.core.load(filename, sr=hparams.sample_rate)[0]

        if hparams.normalize_audio:
            # normalize_wav_data returns a new array; the result must be
            # reassigned.
            wav_data = audio_io.normalize_wav_data(
                wav_data, hparams.sample_rate)
        wav_data = audio_io.samples_to_wav_data(wav_data, hparams.sample_rate)

        example = tf.train.Example(features=tf.train.Features(
            feature={
                'id':
                tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[filename.encode('utf-8')])),
                'sequence':
                tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[music_pb2.NoteSequence().SerializeToString()])),
                'audio':
                tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[wav_data])),
                'velocity_range':
                tf.train.Feature(bytes_list=tf.train.BytesList(
                    value=[music_pb2.VelocityRange().SerializeToString()])),
            }))

        return example.SerializeToString()
Example n. 7
    def _ValidateProvideBatchMemory(self, truncated_length, batch_size,
                                    lengths, expected_num_inputs):
        hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
        examples = []
        expected_inputs = []

        for i, length in enumerate(lengths):
            wav_samples = np.zeros(
                (int((length / data.hparams_frames_per_second(hparams)) *
                     hparams.sample_rate), 1), np.float32)
            wav_data = audio_io.samples_to_wav_data(wav_samples,
                                                    hparams.sample_rate)

            num_frames = data.wav_to_num_frames(
                wav_data,
                frames_per_second=data.hparams_frames_per_second(hparams))

            seq = self._SyntheticSequence(
                num_frames / data.hparams_frames_per_second(hparams),
                i + constants.MIN_MIDI_PITCH)

            examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
            expected_inputs += self._ExampleToInputs(examples[-1],
                                                     truncated_length)
        self.assertEqual(expected_num_inputs, len(expected_inputs))

        self._ValidateProvideBatch([e.SerializeToString() for e in examples],
                                   truncated_length, batch_size,
                                   expected_inputs)
Example n. 8
    def validateProvideBatch_TFRecord(self, truncated_length, batch_size,
                                      lengths, expected_num_inputs):
        hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
        examples = []
        expected_inputs = []

        for i, length in enumerate(lengths):
            wav_samples = np.zeros(
                (int((length / data.hparams_frames_per_second(hparams)) *
                     constants.DEFAULT_SAMPLE_RATE), 1), np.float32)
            wav_data = audio_io.samples_to_wav_data(
                wav_samples, constants.DEFAULT_SAMPLE_RATE)

            num_frames = data.wav_to_num_frames(
                wav_data,
                frames_per_second=data.hparams_frames_per_second(hparams))

            seq = self._SyntheticSequence(
                num_frames / data.hparams_frames_per_second(hparams),
                i + constants.MIN_MIDI_PITCH)

            examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
            expected_inputs += self._ExampleToInputs(examples[-1],
                                                     truncated_length)
        self.assertEqual(expected_num_inputs, len(expected_inputs))

        with tempfile.NamedTemporaryFile() as temp_rio:
            with tf.python_io.TFRecordWriter(temp_rio.name) as writer:
                for ex in examples:
                    writer.write(ex.SerializeToString())

            self.validateProvideBatch(temp_rio.name, truncated_length,
                                      batch_size, expected_inputs)
def generate_test_set():
    """Generate the test TFRecord."""
    test_file_pairs = []
    for directory in test_dirs:
        path = os.path.join(FLAGS.input_dir, directory)
        path = os.path.join(path, '*.wav')
        wav_files = glob.glob(path)
        # find matching mid files
        for wav_file in wav_files:
            base_name_root, _ = os.path.splitext(wav_file)
            mid_file = base_name_root + '.mid'
            test_file_pairs.append((wav_file, mid_file))

    test_output_name = os.path.join(FLAGS.output_dir,
                                    'maps_config2_test.tfrecord')

    with tf.python_io.TFRecordWriter(test_output_name) as writer:
        for idx, pair in enumerate(test_file_pairs):
            print('{} of {}: {}'.format(idx, len(test_file_pairs), pair[0]))
            # load the wav data and resample it.
            samples = audio_io.load_audio(pair[0], FLAGS.sample_rate)
            wav_data = audio_io.samples_to_wav_data(samples, FLAGS.sample_rate)

            # load the midi data and convert to a notesequence
            ns = midi_io.midi_file_to_note_sequence(pair[1])

            example = audio_label_data_utils.create_example(
                pair[0], ns, wav_data)
            writer.write(example.SerializeToString())

    return [filename_to_id(wav) for wav, _ in test_file_pairs]
Example n. 11
def generate_test_set():
  """Generate the test TFRecord."""
  test_file_pairs = []
  for directory in test_dirs:
    path = os.path.join(FLAGS.input_dir, directory)
    path = os.path.join(path, '*.wav')
    wav_files = glob.glob(path)
    # find matching mid files
    for wav_file in wav_files:
      base_name_root, _ = os.path.splitext(wav_file)
      mid_file = base_name_root + '.mid'
      test_file_pairs.append((wav_file, mid_file))

  test_output_name = os.path.join(FLAGS.output_dir,
                                  'maps_config2_test.tfrecord')

  with tf.python_io.TFRecordWriter(test_output_name) as writer:
    for pair in test_file_pairs:
      print(pair)
      # load the wav data and resample it.
      samples = audio_io.load_audio(pair[0], FLAGS.sample_rate)
      wav_data = audio_io.samples_to_wav_data(samples, FLAGS.sample_rate)

      # load the midi data and convert to a notesequence
      midi_data = tf.gfile.Open(pair[1], 'rb').read()
      ns = midi_io.midi_to_sequence_proto(midi_data)

      velocities = [note.velocity for note in ns.notes]
      velocity_max = np.max(velocities)
      velocity_min = np.min(velocities)
      new_velocity_tuple = music_pb2.VelocityRange(
          min=velocity_min, max=velocity_max)

      example = tf.train.Example(features=tf.train.Features(feature={
          'id':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[pair[0].encode('utf-8')]
              )),
          'sequence':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[ns.SerializeToString()]
              )),
          'audio':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[wav_data]
              )),
          'velocity_range':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[new_velocity_tuple.SerializeToString()]
              )),
          }))
      writer.write(example.SerializeToString())

  return [filename_to_id(wav) for wav, _ in test_file_pairs]
Example n. 13
    def process(self, paths):
        wav_path, midi_path = paths

        if midi_path:
            if FLAGS.use_midi_stems:
                base_ns = note_sequence_from_directory(
                    os.path.dirname(midi_path))
            else:
                base_ns = midi_io.midi_file_to_note_sequence(midi_path)
            base_ns.filename = midi_path
        else:
            base_ns = music_pb2.NoteSequence()

        logging.info('Creating Example %s:%s', midi_path, wav_path)
        if FLAGS.convert_flac:
            samples, sr = librosa.load(wav_path, sr=FLAGS.sample_rate)
            wav_data = audio_io.samples_to_wav_data(samples, sr)
        else:
            wav_data = tf.io.gfile.GFile(wav_path, 'rb').read()

        ns = copy.deepcopy(base_ns)

        # Combine the source paths into a single id.
        ns.id = '%s:%s' % (wav_path, midi_path)

        Metrics.counter('create_example', 'read_midi_wav').inc()

        if FLAGS.max_length > 0:
            split_examples = audio_label_data_utils.process_record(
                wav_data,
                ns,
                ns.id,
                min_length=FLAGS.min_length,
                max_length=FLAGS.max_length,
                sample_rate=FLAGS.sample_rate,
                load_audio_with_librosa=False)

            for example in split_examples:
                Metrics.counter('split_wav', 'split_example').inc()
                yield example
        else:

            example = audio_label_data_utils.create_example(
                ns.id, ns, wav_data)

            Metrics.counter('create_example', 'created_example').inc()
            yield example
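
The Metrics counters and the yield-per-element style suggest this process method belongs to an Apache Beam DoFn; a minimal pipeline sketch under that assumption (CreateExampleDoFn and the path list are hypothetical names):

import apache_beam as beam

path_pairs = [('/data/a.wav', '/data/a.mid')]
with beam.Pipeline() as p:
    _ = (p
         | beam.Create(path_pairs)
         | beam.ParDo(CreateExampleDoFn())  # the DoFn defining process() above
         | beam.Map(lambda ex: ex.SerializeToString()))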
Example n. 14
    def _ValidateProvideBatchTFRecord(self,
                                      truncated_length,
                                      batch_size,
                                      lengths,
                                      expected_num_inputs,
                                      crop_sequence_secs=0):
        hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
        examples = []
        expected_inputs = []

        for i, length in enumerate(lengths):
            wav_samples = np.zeros(
                (int((length / data.hparams_frames_per_second(hparams)) *
                     hparams.sample_rate), 1), np.float32)
            wav_data = audio_io.samples_to_wav_data(wav_samples,
                                                    hparams.sample_rate)

            num_frames = data.wav_to_num_frames(
                wav_data,
                frames_per_second=data.hparams_frames_per_second(hparams))

            seq = self._SyntheticSequence(
                num_frames / data.hparams_frames_per_second(hparams) -
                crop_sequence_secs * 2,  # crop from both ends.
                i + constants.MIN_MIDI_PITCH,
                start_time=crop_sequence_secs)

            examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
            expected_inputs += self._ExampleToInputs(
                examples[-1],
                truncated_length,
                crop_training_sequence_to_notes=crop_sequence_secs > 0)
        self.assertEqual(expected_num_inputs, len(expected_inputs))

        with tempfile.NamedTemporaryFile() as temp_tfr:
            with tf.python_io.TFRecordWriter(temp_tfr.name) as writer:
                for ex in examples:
                    writer.write(ex.SerializeToString())

            self._ValidateProvideBatch(
                temp_tfr.name,
                truncated_length,
                batch_size,
                expected_inputs,
                crop_training_sequence_to_notes=crop_sequence_secs > 0)
Example n. 15
def mix_examples(mixid_exs, sample_rate, load_audio_with_librosa):
    """Mix several Examples together to create a new example."""
    mixid, exs = mixid_exs
    del mixid

    example_samples = []
    example_sequences = []

    for ex_str in exs:
        ex = tf.train.Example.FromString(ex_str)
        wav_data = ex.features.feature['audio'].bytes_list.value[0]
        if load_audio_with_librosa:
            samples = audio_io.wav_data_to_samples_librosa(
                wav_data, sample_rate)
        else:
            samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
        example_samples.append(samples)
        ns = music_pb2.NoteSequence.FromString(
            ex.features.feature['sequence'].bytes_list.value[0])
        example_sequences.append(ns)

    mixed_samples, mixed_sequence = audio_label_data_utils.mix_sequences(
        individual_samples=example_samples,
        sample_rate=sample_rate,
        individual_sequences=example_sequences)

    mixed_wav_data = audio_io.samples_to_wav_data(mixed_samples, sample_rate)

    mixed_id = '::'.join(['mixed'] + [ns.id for ns in example_sequences])
    mixed_sequence.id = mixed_id
    mixed_filename = '::'.join(['mixed'] +
                               [ns.filename for ns in example_sequences])
    mixed_sequence.filename = mixed_filename

    examples = list(
        audio_label_data_utils.process_record(mixed_wav_data,
                                              mixed_sequence,
                                              mixed_id,
                                              min_length=0,
                                              max_length=-1,
                                              sample_rate=sample_rate))
    assert len(examples) == 1
    return examples[0]
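
mix_examples takes a (mix_id, serialized_examples) pair, the shape produced by a group-by-key stage; a hedged sketch of calling it directly (ex1 and ex2 are assumed to be tf.train.Example protos carrying 'audio' and 'sequence' features):

mixed = mix_examples(
    ('mix0', [ex1.SerializeToString(), ex2.SerializeToString()]),
    sample_rate=16000,
    load_audio_with_librosa=False)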
Example n. 16
  def _ValidateProvideBatchTFRecord(self,
                                    truncated_length,
                                    batch_size,
                                    lengths,
                                    expected_num_inputs,
                                    crop_sequence_secs=0):
    hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
    examples = []
    expected_inputs = []

    for i, length in enumerate(lengths):
      wav_samples = np.zeros(
          (int((length / data.hparams_frames_per_second(hparams)) *
               hparams.sample_rate), 1), np.float32)
      wav_data = audio_io.samples_to_wav_data(wav_samples, hparams.sample_rate)

      num_frames = data.wav_to_num_frames(
          wav_data, frames_per_second=data.hparams_frames_per_second(hparams))

      seq = self._SyntheticSequence(
          num_frames / data.hparams_frames_per_second(hparams) -
          crop_sequence_secs * 2,  # crop from both ends.
          i + constants.MIN_MIDI_PITCH,
          start_time=crop_sequence_secs)

      examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
      expected_inputs += self._ExampleToInputs(
          examples[-1],
          truncated_length,
          crop_training_sequence_to_notes=crop_sequence_secs > 0)
    self.assertEqual(expected_num_inputs, len(expected_inputs))

    with tempfile.NamedTemporaryFile() as temp_tfr:
      with tf.python_io.TFRecordWriter(temp_tfr.name) as writer:
        for ex in examples:
          writer.write(ex.SerializeToString())

      self._ValidateProvideBatch(
          temp_tfr.name,
          truncated_length,
          batch_size,
          expected_inputs,
          crop_training_sequence_to_notes=crop_sequence_secs > 0)
Example n. 17
  def validateProvideBatch_TFRecord(self,
                                    truncated_length,
                                    batch_size,
                                    lengths,
                                    expected_num_inputs):
    hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
    examples = []
    expected_inputs = []

    for i, length in enumerate(lengths):
      wav_samples = np.zeros(
          (int((length / data.hparams_frames_per_second(hparams)) *
               constants.DEFAULT_SAMPLE_RATE), 1), np.float32)
      wav_data = audio_io.samples_to_wav_data(wav_samples,
                                              constants.DEFAULT_SAMPLE_RATE)

      num_frames = data.wav_to_num_frames(
          wav_data, frames_per_second=data.hparams_frames_per_second(hparams))

      seq = self._SyntheticSequence(
          num_frames / data.hparams_frames_per_second(hparams),
          i + constants.MIN_MIDI_PITCH)

      examples.append(self._FillExample(seq, wav_data, 'ex%d' % i))
      expected_inputs += self._ExampleToInputs(
          examples[-1],
          truncated_length)
    self.assertEqual(expected_num_inputs, len(expected_inputs))

    with tempfile.NamedTemporaryFile() as temp_rio:
      with tf.python_io.TFRecordWriter(temp_rio.name) as writer:
        for ex in examples:
          writer.write(ex.SerializeToString())

      self.validateProvideBatch(
          temp_rio.name,
          truncated_length,
          batch_size,
          expected_inputs)
Example n. 18
def reduce_audio_in_batch(tensor, hparams=None, is_training=True):
    instrument_count = hparams.timbre_training_max_instruments
    note_cropping_list = []
    instrument_family_list = []
    samples_list = []
    max_length = 0
    for i in range(instrument_count):
        pitch = tensor['pitch'][i]
        # Move the audio so there are different attack times.
        start_idx = tf.random.uniform((),
                                      minval=0,
                                      maxval=hparams.timbre_max_start_offset,
                                      dtype='int64')
        samples = K.concatenate(
            [tf.zeros(start_idx),
             tf.sparse.to_dense(tensor['audio'])[i]])

        end_idx = (
            start_idx +
            tf.py_function(_get_approx_note_length,
                           [tf.sparse.to_dense(tensor['audio'])[i]], tf.int64))
        if hparams.timbre_max_len and end_idx > hparams.timbre_max_len:
            samples = tf.slice(samples,
                               begin=[0],
                               size=[hparams.timbre_max_len])
            end_idx = hparams.timbre_max_len
        if len(samples) > max_length:
            max_length = len(samples)

        samples_list.append(samples)

        instrument_family = tensor['instrument_family'][i]
        note_cropping_list.append(
            timbre_dataset_util.NoteCropping(pitch=pitch,
                                             start_idx=start_idx,
                                             end_idx=end_idx))
        instrument_family_list.append(
            tf.one_hot(tf.cast(instrument_family, tf.int32),
                       hparams.timbre_num_classes))

    # Pad the end of the shorter audio clips.
    samples_list = list(
        map(lambda x: tf.pad(x, [[0, max_length - len(x)]]), samples_list))

    combined_samples = (
        tf.reduce_sum(tf.convert_to_tensor(samples_list), axis=0) /
        instrument_count)

    # Ensure all audios in batches are the same length.
    if hparams.timbre_max_len:
        pad_length = hparams.timbre_max_len
    else:
        pad_length = hparams.timbre_max_start_offset + 5 * hparams.sample_rate
    combined_samples = tf.pad(
        combined_samples, [[0, pad_length - tf.shape(combined_samples)[0]]])
    note_croppings = tf.convert_to_tensor(note_cropping_list, dtype=tf.int32)
    instrument_families = tf.convert_to_tensor(instrument_family_list,
                                               dtype=tf.int32)

    wav_data = tf.py_function(
        lambda x: audio_io.samples_to_wav_data(
            x.numpy(), sample_rate=hparams.sample_rate), [combined_samples],
        tf.string)

    return dict(
        audio=wav_data,
        note_croppings=note_croppings,
        instrument_families=instrument_families,
    )
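
reduce_audio_in_batch maps one grouped element to a single mixed training example; assuming tensor comes from a tf.data pipeline (the dataset variable below is hypothetical), the wiring might look like:

dataset = dataset.map(
    lambda tensor: reduce_audio_in_batch(tensor, hparams=hparams),
    num_parallel_calls=tf.data.experimental.AUTOTUNE)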
  def _CreateSyntheticExample(self):
    sequence = self._CreateSyntheticSequence()
    wav_samples = np.zeros(9 * SAMPLE_RATE, np.float32)
    wav_data = audio_io.samples_to_wav_data(wav_samples, SAMPLE_RATE)
    return wav_data, sequence
Example n. 20

filename = 'richard.wav'
file_dir = './data/wav_format/' + filename
with open(file_dir, 'rb') as content:
    uploaded = {file_dir: content.read()}  # TODO: stand-in for an upload widget

to_process = []
for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))
    open(fn, 'wb').write(uploaded[fn])
    wav_data = audio_io.samples_to_wav_data(
        librosa.util.normalize(
            librosa.core.load(fn, sr=hparams.sample_rate)[0]),
        hparams.sample_rate)

    example = tf.train.Example(features=tf.train.Features(
        feature={
            'id':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[fn.encode('utf-8')])),
            'sequence':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[music_pb2.NoteSequence().SerializeToString()])),
            'audio':
            tf.train.Feature(bytes_list=tf.train.BytesList(value=[wav_data])),
        }))
    to_process.append(example.SerializeToString())
    print('Processing complete for', fn)
  def _CreateSyntheticExample(self):
    sequence = self._CreateSyntheticSequence()
    wav_samples = np.zeros(2 * SAMPLE_RATE, np.float32)
    wav_data = audio_io.samples_to_wav_data(wav_samples, SAMPLE_RATE)
    return wav_data, sequence
Example n. 22
def process_record(wav_data,
                   ns,
                   example_id,
                   min_length=5,
                   max_length=20,
                   sample_rate=16000,
                   allow_empty_notesequence=False,
                   load_audio_with_librosa=False):
    """Split a record into chunks and create an example proto.

  To use the full length audio and notesequence, set min_length=0 and
  max_length=-1.

  Args:
    wav_data: audio data in WAV format.
    ns: corresponding NoteSequence.
    example_id: id for the example proto
    min_length: minimum length in seconds for audio chunks.
    max_length: maximum length in seconds for audio chunks.
    sample_rate: desired audio sample rate.
    allow_empty_notesequence: whether an empty NoteSequence is allowed.
    load_audio_with_librosa: Use librosa for sampling. Works with 24-bit wavs.

  Yields:
    Example protos.
  """
    try:
        if load_audio_with_librosa:
            samples = audio_io.wav_data_to_samples_librosa(
                wav_data, sample_rate)
        else:
            samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
    except audio_io.AudioIOReadError as e:
        print('Exception %s' % e)
        return
    samples = librosa.util.normalize(samples, norm=np.inf)

    # Add padding to samples if notesequence is longer.
    pad_to_samples = int(math.ceil(ns.total_time * sample_rate))
    padding_needed = pad_to_samples - samples.shape[0]
    if padding_needed > 5 * sample_rate:
        raise ValueError(
            'Would have padded {} more than 5 seconds to match note sequence total '
            'time. ({} original samples, {} sample rate, {} sample seconds, '
            '{} sequence seconds) This likely indicates a problem with the source '
            'data.'.format(example_id, samples.shape[0], sample_rate,
                           samples.shape[0] / sample_rate, ns.total_time))
    samples = np.pad(samples, (0, max(0, padding_needed)), 'constant')

    if max_length == min_length:
        splits = np.arange(0, ns.total_time, max_length)
    elif max_length > 0:
        splits = find_split_points(ns, samples, sample_rate, min_length,
                                   max_length)
    else:
        splits = [0, ns.total_time]
    velocities = [note.velocity for note in ns.notes]
    velocity_max = np.max(velocities) if velocities else 0
    velocity_min = np.min(velocities) if velocities else 0
    velocity_range = music_pb2.VelocityRange(min=velocity_min,
                                             max=velocity_max)

    for start, end in zip(splits[:-1], splits[1:]):
        if end - start < min_length:
            continue

        if start == 0 and end == ns.total_time:
            new_ns = ns
        else:
            new_ns = sequences_lib.extract_subsequence(ns, start, end)

        if not new_ns.notes and not allow_empty_notesequence:
            tf.logging.warning('skipping empty sequence')
            continue

        if start == 0 and end == ns.total_time:
            new_samples = samples
        else:
            # The resampling that happens in crop_wav_data is really slow,
            # and we've already done it once; avoid doing it twice.
            new_samples = audio_io.crop_samples(samples, sample_rate, start,
                                                end - start)
        new_wav_data = audio_io.samples_to_wav_data(new_samples, sample_rate)
        yield create_example(example_id,
                             new_ns,
                             new_wav_data,
                             velocity_range=velocity_range)
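
The padding guard above is easiest to check with concrete numbers (illustrative values only):

import math
sample_rate = 16000
total_time = 2.0      # ns.total_time in seconds
num_samples = 31500   # samples.shape[0]
pad_to_samples = int(math.ceil(total_time * sample_rate))  # 32000
padding_needed = pad_to_samples - num_samples              # 500 (~31 ms)
assert padding_needed <= 5 * sample_rate  # well under the 5-second limit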
def generate_train_set(exclude_ids):
    """Generate the train TFRecord."""
    train_file_pairs = []
    for directory in TRAIN_DIRS:
        path = os.path.join(FLAGS.input_dir, directory)
        path = os.path.join(path, '*.wav')
        wav_files = glob.glob(path)
        # find matching mid files
        for wav_file in wav_files:
            base_name_root, _ = os.path.splitext(wav_file)
            mid_file = base_name_root + '.mid'
            if filename_to_id(wav_file) not in exclude_ids:
                train_file_pairs.append((wav_file, mid_file))

    train_output_name = os.path.join(FLAGS.output_dir,
                                     'maps_config2_train.tfrecord')

    with tf.python_io.TFRecordWriter(train_output_name) as writer:
        for idx, pair in enumerate(train_file_pairs):
            print("{} of {}: {}".format(idx, len(train_file_pairs), pair[0]))
            # load the wav data
            wav_data = tf.gfile.Open(pair[0], 'rb').read()
            samples = audio_io.wav_data_to_samples(wav_data, FLAGS.sample_rate)
            norm_samples = librosa.util.normalize(samples, norm=np.inf)

            # load the midi data and convert to a notesequence
            ns = midi_io.midi_file_to_note_sequence(pair[1])

            splits = create_dataset_util.find_split_points(
                ns, norm_samples, FLAGS.sample_rate, FLAGS.min_length,
                FLAGS.max_length)

            velocities = [note.velocity for note in ns.notes]
            velocity_max = np.max(velocities)
            velocity_min = np.min(velocities)
            new_velocity_tuple = music_pb2.VelocityRange(min=velocity_min,
                                                         max=velocity_max)

            for start, end in zip(splits[:-1], splits[1:]):
                if end - start < FLAGS.min_length:
                    continue

                new_ns = sequences_lib.extract_subsequence(ns, start, end)
                samples_start = int(start * FLAGS.sample_rate)
                samples_end = samples_start + int(
                    (end - start) * FLAGS.sample_rate)
                new_samples = samples[samples_start:samples_end]
                new_wav_data = audio_io.samples_to_wav_data(
                    new_samples, FLAGS.sample_rate)

                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'id':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[pair[0].encode()])),
                        'sequence':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[new_ns.SerializeToString()])),
                        'audio':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[new_wav_data])),
                        'velocity_range':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[new_velocity_tuple.SerializeToString()])),
                    }))
                writer.write(example.SerializeToString())
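
A plausible driver for the two generators (an assumption about how the script is wired together): build the test set first, then exclude its ids from training.

def main(unused_argv):
    test_ids = generate_test_set()
    generate_train_set(exclude_ids=test_ids)

if __name__ == '__main__':
    tf.app.run(main)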