Example no. 1
def build_wavenet(batch_size=1, sample_length=64000):
    config = Config()

    x = tf.placeholder(tf.float32, shape=[batch_size, sample_length])
    graph = config.build({"wav": x}, is_training=False)
    graph.update({"X": x})
    return graph
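A minimal usage sketch for the graph returned above, assuming a local model.ckpt-200000 checkpoint (the name used in the later examples) and the "encoding" key that Example no. 6 reads from the same Config().build graph:

import numpy as np
import tensorflow as tf

net = build_wavenet(batch_size=1, sample_length=64000)
with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, "model.ckpt-200000")
    # Stand-in for real audio; replace with a [1, 64000] float32 waveform.
    wav = np.zeros([1, 64000], dtype=np.float32)
    encoding = sess.run(net["encoding"], feed_dict={net["X"]: wav})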
Example no. 2
def load_style_nsynth(initial):
    """Load the NSynth autoencoder network for stylizing."""

    config = Config()
    with tf.device("/gpu:0"):
        initial = initial.reshape([1, -1, 1])  # [Batch_size, length, channel]
        x = tf.Variable(initial)
        graph = config.build({"wav": x}, is_training=False)
        graph.update({"X": x})

    return graph
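Since "X" here is a tf.Variable rather than a placeholder, the waveform itself can be optimized, which is the point of the stylizing variant. A rough sketch of that idea, not taken from this page; initial_audio and target_encoding are hypothetical arrays, and the restore step would need a Saver restricted to the pretrained model variables (excluding the new variable and the optimizer slots):

graph = load_style_nsynth(initial_audio)
# Pull the waveform toward a target embedding by gradient descent on the audio.
loss = tf.reduce_mean(tf.square(graph["encoding"] - target_encoding))
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, var_list=[graph["X"]])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Restore the pretrained WaveNet weights here before optimizing.
    for _ in range(100):
        sess.run(train_op)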
Example no. 3
def load_nsynth(batch_size=1, sample_length=40000):
    """Load the NSynth autoencoder network."""
    config = Config()
    print("Inside load_nsynth function")
    with tf.device("/device:GPU:0"):
        print("Loading nsynth")
        x = tf.placeholder(tf.float32, shape=[batch_size, sample_length])
        graph = config.build({"wav": x}, is_training=False)
        graph.update({"X": x})

    return graph
Example no. 4
def load_nsynth(batch_size=1, sample_length=64000):
    """Load the NSynth autoencoder network.

  Args:
    batch_size: Batch size number of observations to process. [1]
    sample_length: Number of samples in the input audio. [64000]
  Returns:
    graph: The network as a dict with input placeholder in {"X"}
  """
    config = Config()
    with tf.device("/gpu:0"):
        x = tf.placeholder(tf.float32, shape=[batch_size, sample_length])
        graph = config.build({"wav": x}, is_training=False)
        graph.update({"X": x})
    return graph
Example no. 5
def load_nsynth(batch_size=1, sample_length=64000):
  """Load the NSynth autoencoder network.

  Args:
    batch_size: Batch size number of observations to process. [1]
    sample_length: Number of samples in the input audio. [64000]
  Returns:
    graph: The network as a dict with input placeholder in {"X"}
  """
  config = Config()
  with tf.device("/gpu:0"):
    x = tf.placeholder(tf.float32, shape=[batch_size, sample_length])
    graph = config.build({"wav": x}, is_training=False)
    graph.update({"X": x})
  return graph
Example no. 6
def encode(wav_data, checkpoint_path, sample_length=64000):
    """Padded loading of a wave file.

  Args:
    wav_data: Numpy array [batch_size, sample_length]
    checkpoint_path: Location of the pretrained model.
    sample_length: The total length of the final wave file, padded with 0s.
  Returns:
    encoding: a [mb, 125, 16] encoding (for 64000 sample audio file).
    hop_length: Pooling size of the autoencoder.
  """
    if wav_data.ndim == 1:
        wav_data = np.expand_dims(wav_data, 0)
        batch_size = 1
    elif wav_data.ndim == 2:
        batch_size = wav_data.shape[0]

    # Load up the model for encoding and find the encoding of "wav_data"
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        hop_length = Config().ae_hop_length
        wav_data, sample_length = utils.trim_for_encoding(
            wav_data, sample_length, hop_length)
        net = load_nsynth(batch_size=batch_size, sample_length=sample_length)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        encoding = sess.run(net["encoding"], feed_dict={net["X"]: wav_data})
    return encoding, hop_length
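A usage sketch for encode, borrowing utils.load_audio and the checkpoint name from Example no. 11; input.wav is a placeholder file name:

wav_data = utils.load_audio("input.wav", sample_length=64000, sr=16000)
encoding, hop_length = encode(wav_data, "model.ckpt-200000",
                              sample_length=64000)
print(encoding.shape)  # (1, 125, 16) for a 64000-sample file, per the docstring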
Example no. 7
def encode(wav_data, checkpoint_path, sample_length=64000):
    """Generate an array of embeddings from an array of audio."""

    if wav_data.ndim == 1:
        wav_data = np.expand_dims(wav_data, 0)
        batch_size = 1
    elif wav_data.ndim == 2:
        batch_size = wav_data.shape[0]

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    # The setting above is not ideal, since it can still grab all GPU memory
    # on demand; a better option is:
    #    session_config.gpu_options.per_process_gpu_memory_fraction = 0.4

    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        hop_length = Config().ae_hop_length
        # hop_length is the pooling size of the autoencoder.

        wav_data, sample_length = trim_for_encoding(wav_data, sample_length,
                                                    hop_length)
        net = load_nsynth(batch_size=batch_size, sample_length=sample_length)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        encodings = sess.run(net["encoding"], feed_dict={net["X"]: wav_data})
    return encodings
Example no. 8
def synthesize(encodings,
               save_paths,
               checkpoint_path="model.ckpt-200000",
               samples_per_save=10000,
               batch_i=None,
               total_batches=None):
    """Synthesize audio from an array of encodings.

  Args:
    encodings: Numpy array with shape [batch_size, time, dim].
    save_paths: Iterable of output file names.
    checkpoint_path: Location of the pretrained model. [model.ckpt-200000]
    samples_per_save: Save files after every amount of generated samples.
  """
    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        net = load_fastgen_nsynth(batch_size=encodings.shape[0])
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)

        # Get lengths
        batch_size, encoding_length, _ = encodings.shape
        hop_length = Config().ae_hop_length
        total_length = encoding_length * hop_length

        # initialize queues w/ 0s
        sess.run(net["init_ops"])

        # Regenerate the audio file sample by sample
        audio_batch = np.zeros((batch_size, total_length), dtype=np.float32)
        audio = np.zeros([batch_size, 1])

        for sample_i in range(total_length):
            encoding_i = sample_i // hop_length
            audio = generate_audio_sample(sess, net, audio,
                                          encodings[:, encoding_i, :])
            audio_batch[:, sample_i] = audio[:, 0]

            if sample_i % 100 == 0:
                the_batch_i = batch_i if batch_i is not None else 0
                the_total_batches = (total_batches
                                     if total_batches is not None else 1)
                progress = (the_batch_i + float(sample_i + 1) /
                            total_length) / the_total_batches

                log_str = "{:.1f}% - ".format(progress * 100)
                if batch_i is not None or total_batches is not None:
                    log_str += "Batch: {}/{} - ".format(
                        (batch_i + 1) if batch_i is not None else "?",
                        total_batches if total_batches is not None else "?")
                log_str += "Sample: {}/{}".format(sample_i + 1, total_length)
                tf.logging.info(log_str)

            if sample_i % samples_per_save == 0 and save_paths:
                save_batch(audio_batch, save_paths)

        save_batch(audio_batch, save_paths)
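generate_audio_sample is called above but not defined on this page; a plausible reconstruction, matching the inline predictions/push_ops step of Examples no. 9 and 10:

def generate_audio_sample(sess, net, audio, encoding):
    # One autoregressive step: run the predictions, push the current sample
    # into the fast-generation queues, then sample a bin and invert mu-law.
    pmf = sess.run([net["predictions"], net["push_ops"]],
                   feed_dict={net["X"]: audio, net["encoding"]: encoding})[0]
    sample_bin = sample_categorical(pmf)
    return utils.inv_mu_law_numpy(sample_bin - 128)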
Example no. 9
def synthesize(encodings,
               save_paths,
               checkpoint_path="model.ckpt-200000",
               samples_per_save=1000):
    """Synthesize audio from an array of embeddings.

    Args:
      encodings: Numpy array with shape [batch_size, time, dim].
      save_paths: Iterable of output file names.
      checkpoint_path: Location of the pretrained model. [model.ckpt-200000]
      samples_per_save: Save files after every samples_per_save generated
        samples.
    """
    hop_length = Config().ae_hop_length
    # Get lengths
    batch_size = encodings.shape[0]
    encoding_length = encodings.shape[1]
    total_length = encoding_length * hop_length

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        net = load_fastgen_nsynth(batch_size=batch_size)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)

        # initialize queues w/ 0s
        sess.run(net["init_ops"])

        # Regenerate the audio file sample by sample
        audio_batch = np.zeros((batch_size, total_length), dtype=np.float32)
        audio = np.zeros([batch_size, 1])

        for sample_i in range(total_length):
            enc_i = sample_i // hop_length
            pmf = sess.run(
                [net["predictions"], net["push_ops"]],
                feed_dict={
                    net["X"]: audio,
                    net["encoding"]: encodings[:, enc_i, :]
                })[0]
            sample_bin = sample_categorical(pmf)
            audio = utils.inv_mu_law_numpy(sample_bin - 128)
            audio_batch[:, sample_i] = audio[:, 0]
            if sample_i % 100 == 0:
                tf.logging.info("Sample: %d" % sample_i)
            if sample_i % samples_per_save == 0:
                save_batch(audio_batch, save_paths)
    save_batch(audio_batch, save_paths)
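sample_categorical is used throughout but never shown on this page. A plausible minimal implementation, assuming pmf is a [batch_size, 256] probability mass function over the mu-law bins (so that sample_bin - 128 recenters the result for inv_mu_law_numpy):

def sample_categorical(pmf):
    """Sample one bin index per row via inverse-CDF sampling."""
    if pmf.ndim == 1:
        pmf = np.expand_dims(pmf, 0)
    batch_size = pmf.shape[0]
    cdf = np.cumsum(pmf, axis=1)
    rand_vals = np.random.rand(batch_size)
    idxs = np.zeros([batch_size, 1], dtype=np.int32)
    for i in range(batch_size):
        idxs[i] = cdf[i].searchsorted(rand_vals[i])
    return idxs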
Example no. 10
def synthesize(encodings, save_paths, checkpoint_path, samples_per_save=1000):
    """Synthesize audio from an array of embeddings."""
    hop_length = Config().ae_hop_length
    # Get lengths
    batch_size = encodings.shape[0]
    encoding_length = encodings.shape[1]
    total_length = encoding_length * hop_length

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        net = load_fastgen_nsynth(batch_size=batch_size)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)

        sess.run(net["init_ops"])

        audio_batch = np.zeros((batch_size, total_length), dtype=np.float32)
        audio = np.zeros([batch_size, 1])

        for sample_i in range(total_length):
            enc_i = sample_i // hop_length

            pmf = sess.run([net["predictions"], net["push_ops"]],
                           feed_dict={
                               net["X"]: audio,
                               net["encoding"]: encodings[:, enc_i, :]
                           })[0]
            sample_bin = sample_categorical(pmf)
            audio = inv_mu_law_numpy(sample_bin - 128)
            audio_batch[:, sample_i] = audio[:, 0]
            if sample_i % 100 == 0:
                print("Sample: {}".format(sample_i))
            if sample_i % samples_per_save == 0:
                save_batch(audio_batch, save_paths)
    save_batch(audio_batch, save_paths)
Example no. 11
def synthesize(source_file,
               checkpoint_path="model.ckpt-200000",
               out_file="synthesis.wav",
               sample_length=64000,
               samples_per_save=1000):
    """Resynthesize an input audio file.

    Args:
      source_file: Location of a wave or .npy file to load.
      checkpoint_path: Location of the pretrained model. [model.ckpt-200000]
      out_file: Location to save the synthesized wave file. [synthesis.wav]
      sample_length: Length of file to synthesize. [source_file.length]
      samples_per_save: Save a .wav after every samples_per_save samples.

    Raises:
      RuntimeError: If source_file is not a .wav or .npy file.
    """
    if source_file.endswith(".npy"):
        encoding = np.load(source_file)
        hop_length = Config().ae_hop_length
    elif source_file.endswith(".wav"):
        # Audio to resynthesize
        wav_data = utils.load_audio(source_file, sample_length, sr=16000)
        # Load up the model for encoding and find the encoding
        encoding, hop_length = encode(wav_data,
                                      checkpoint_path,
                                      sample_length=sample_length)
        if encoding.ndim == 3:
            encoding = encoding[0]
    else:
        raise RuntimeError("File must be .wav or .npy")
    # Get lengths
    encoding_length = encoding.shape[0]
    total_length = encoding_length * hop_length

    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        net = load_fastgen_nsynth()
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)

        # initialize queues w/ 0s
        sess.run(net["init_ops"])

        # Regenerate the audio file sample by sample
        wav_synth = np.zeros((total_length, ), dtype=np.float32)
        audio = np.float32(0)

        for sample_i in range(total_length):
            enc_i = sample_i // hop_length
            pmf = sess.run([net["predictions"], net["push_ops"]],
                           feed_dict={
                               net["X"]: np.atleast_2d(audio),
                               net["encoding"]: encoding[enc_i]
                           })[0]
            sample_bin = sample_categorical(pmf)
            audio = utils.inv_mu_law_numpy(sample_bin - 128)
            wav_synth[sample_i] = audio
            if sample_i % 100 == 0:
                tf.logging.info("Sample: %d" % sample_i)
            if sample_i % samples_per_save == 0:
                wavfile.write(out_file, 16000, wav_synth)

    wavfile.write(out_file, 16000, wav_synth)
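A usage sketch for this file-based variant; the file names are placeholders. It accepts either a raw wave file or a previously saved encoding:

# Resynthesize a wave file end to end.
synthesize("input.wav", checkpoint_path="model.ckpt-200000",
           out_file="synthesis.wav", sample_length=64000)

# Or skip encoding and synthesize from a saved [time, dim] encoding array.
synthesize("encoding.npy", checkpoint_path="model.ckpt-200000",
           out_file="from_encoding.wav")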