def test_folder_wav(self, folder_name):
        """
        Run the loaded model on every ``.wav`` file in a folder and save the results.

        For each non-hidden ``*.wav`` file in ``folder_name`` the file is read with
        ``self.read_wav_file``, passed through ``self.process_file`` and the three
        returned arrays are concatenated along the last axis.  The concatenated
        features are synthesized with ``utils.feats_to_audio`` and written to
        ``config.output_dir`` as ``<name>_output.wav``; the raw features are saved
        with ``np.save`` under ``config.output_dir_np``.

        Processing is best-effort: a file that raises is recorded and skipped so
        that one bad file does not abort the whole folder.

        Args:
            folder_name: Directory that is scanned (non-recursively) for wav files.

        Returns:
            List of file names that could not be processed (empty if all succeeded).
        """
        sess = tf.Session()
        self.load_model(sess, log_dir=config.log_dir)

        file_list = [x for x in os.listdir(folder_name) if x.endswith('.wav') and not x.startswith('.')]

        unprocessable = []

        for count, file_name in enumerate(file_list, start=1):
            # Every entry ends in '.wav' (see the filter above), so this is the bare stem.
            stem = file_name[:-4]
            try:
                mel = self.read_wav_file(os.path.join(folder_name, file_name))
                out_mel, out_f0, out_vuv = self.process_file(mel, sess)
                out_featss = np.concatenate((out_mel, out_f0, out_vuv), axis=-1)

                audio_out = utils.feats_to_audio(out_featss)

                sf.write(os.path.join(config.output_dir, '{}_output.wav'.format(stem)), audio_out, config.fs)

                np.save(os.path.join(config.output_dir_np, stem), out_featss)

            except Exception:
                # Deliberately not a bare ``except``: Ctrl-C / SystemExit must still
                # be able to stop a long batch run.
                unprocessable.append(file_name)

            utils.progress(count, len(file_list), "Files processed")

        return unprocessable
Beispiel #2
0
    def test_file_hdf5(self, file_name, file_name_singer):
        """
        Process one HDF5 file together with a second "singer" HDF5 file and synthesize the result.

        Both files are read with ``self.read_hdf5_file``.  The STFTs of the two
        files are passed to ``self.process_file``; the output features are plotted
        against the features of ``file_name`` and then joined with the last two
        feature columns of ``file_name`` before being handed to
        ``utils.feats_to_audio``.

        Args:
            file_name: Path of the HDF5 file to process; its name with the last
                four characters removed plus ``'gan_op.wav'`` is passed to
                ``utils.feats_to_audio`` as the output name.
            file_name_singer: Path of the HDF5 file supplying the second STFT.
        """
        sess = tf.Session()
        self.load_model(sess, log_dir=config.log_dir)
        voc_stft, feats = self.read_hdf5_file(file_name)

        # Only the STFT of the singer file is needed; its features are discarded.
        voc_stft_singer, _ = self.read_hdf5_file(file_name_singer)
        out_feats = self.process_file(voc_stft, voc_stft_singer, sess)

        self.plot_features(feats, out_feats)

        # Truncate both arrays to their common length before joining the model
        # output with the last two columns of the input features.
        out_featss = np.concatenate((out_feats[:feats.shape[0]], feats[:out_feats.shape[0], -2:]), axis=-1)

        utils.feats_to_audio(out_featss, file_name[:-4] + 'gan_op.wav')
    def test_file_hdf5(self, file_name, speaker_index, speaker_index_2):
        """
        Process an HDF5 file with two speaker indices, plot the result and optionally synthesize it.

        The features read from ``file_name`` (all but the last two columns) are
        passed to ``self.process_file`` together with both speaker indices.  The
        user is then asked interactively whether to synthesize the output
        (optionally shifting the second-to-last feature column by -12 or +12) and
        whether to synthesize the unmodified input features.

        Args:
            file_name: Path of the HDF5 file; the last five characters are
                stripped to build the output wav names.
            speaker_index: Index into ``config.singers`` used to name the
                ground-truth output and passed to ``self.process_file``.
            speaker_index_2: Index into ``config.singers`` used to name the
                converted output and passed to ``self.process_file``.
        """
        session = tf.Session()
        self.load_model(session, log_dir=config.log_dir)
        mel = self.read_hdf5_file(file_name)

        out_mel = self.process_file(mel[:, :-2], speaker_index, speaker_index_2, session)

        self.plot_features(mel, out_mel)

        if utils.query_yes_no("Synthesize output? "):
            if not utils.query_yes_no("Change in gender? "):
                # Keep the last two input columns untouched.
                parts = (out_mel[:mel.shape[0]], mel[:out_mel.shape[0], -2:])
            elif utils.query_yes_no("Female to male?"):
                parts = (out_mel[:mel.shape[0]],
                         mel[:out_mel.shape[0], -2:-1] - 12,
                         mel[:out_mel.shape[0], -1:])
            else:
                parts = (out_mel[:mel.shape[0]],
                         mel[:out_mel.shape[0], -2:-1] + 12,
                         mel[:out_mel.shape[0], -1:])
            out_featss = np.concatenate(parts, axis=-1)

            converted_audio = utils.feats_to_audio(out_featss)
            converted_path = './{}_{}.wav'.format(file_name[:-5], config.singers[speaker_index_2])
            sf.write(converted_path, converted_audio, config.fs)

        if utils.query_yes_no("Synthesize ground truth with vocoder? "):
            original_audio = utils.feats_to_audio(mel)
            original_path = './{}_{}_ori.wav'.format(file_name[:-5], config.singers[speaker_index])
            sf.write(original_path, original_audio, config.fs)
Beispiel #4
0
    def test_model_yam(self):
        """
        Run the model on a fixed vocal/backing pair and synthesize the result.

        The STFTs of ``./Bria_000_VoU67.wav`` and ``./Bria_000_Back.wav`` are
        averaged (over their common length) and clipped to ``[0, 1]`` to form the
        mixture fed to ``self.process_file``.  The output is plotted against the
        features of the vocal file, combined with the last two columns of those
        features, and the first 3000 rows are passed to ``utils.feats_to_audio``
        under the name ``'Bree_output'``.
        """
        session = tf.Session()
        self.load_model(session, log_dir=config.log_dir)

        vocal_spec = utils.file_to_stft('./Bria_000_VoU67.wav')
        backing_spec = utils.file_to_stft('./Bria_000_Back.wav')

        # Trim both spectrograms to the shorter one, then average and clip.
        averaged = (vocal_spec[:len(backing_spec)] + backing_spec[:len(vocal_spec)]) / 2
        mixture = np.clip(averaged, 0.0, 1.0)

        reference = utils.input_to_feats('./Bria_000_VoU67.wav')
        predicted = self.process_file(mixture, session)
        self.plot_features(reference, predicted)

        model_part = predicted[:reference.shape[0], :-2]
        reference_tail = reference[:predicted.shape[0], -2:]
        merged = np.concatenate((model_part, reference_tail), axis=-1)

        utils.feats_to_audio(merged[:3000], 'Bree_output')
Beispiel #5
0
    def test_file_hdf5(self, file_name, singer_index):
        """
        Process an HDF5 file for a given singer index, plot it and optionally synthesize audio.

        ``self.read_hdf5_file`` supplies the features, a normalised f0 array and
        the phoneme targets.  The latter two plus ``singer_index`` are passed to
        ``self.process_file``.  The user is asked interactively whether to
        synthesize the model output (joined with the last two input feature
        columns) and whether to synthesize the input features themselves.

        Args:
            file_name: Path of the HDF5 file; the last four characters are
                stripped to build the names passed to ``utils.feats_to_audio``.
            singer_index: Singer index forwarded to ``self.process_file``.
        """
        session = tf.Session()
        self.load_model(session, log_dir=config.log_dir)
        feats, f0_nor, pho_target = self.read_hdf5_file(file_name)

        out_feats = self.process_file(f0_nor, pho_target, singer_index, session)

        self.plot_features(feats, out_feats)

        if utils.query_yes_no("Synthesize output? "):
            # Truncate both arrays to their common length before joining them.
            model_part = out_feats[:feats.shape[0]]
            input_tail = feats[:out_feats.shape[0], -2:]
            combined = np.concatenate((model_part, input_tail), axis=-1)
            utils.feats_to_audio(combined, file_name[:-4] + 'output')

        if utils.query_yes_no("Synthesize gorund truth with vocoder? "):
            utils.feats_to_audio(feats, file_name[:-4] + 'ground_truth')
Beispiel #6
0
def synth_file(file_name="nus_MCUR_sing_10.hdf5",
               singer_index=0,
               file_path=config.wav_dir,
               show_plots=True,
               save_file="GBO"):
    """Synthesize vocoder features for one HDF5 file with three model heads.

    Builds a TensorFlow graph (phone network, the final network applied twice
    with shared variables, a GAN generator and a GAN discriminator), restores
    the checkpoint reported by ``tf.train.get_checkpoint_state(config.log_dir)``
    if there is one, and runs the ``feats`` / ``phonemes`` datasets of
    ``config.voice_dir + file_name`` through the graph in overlap-add batches.
    The outputs are de-normalised with the min/max statistics stored in
    ``config.stat_dir + 'stats.hdf5'``, optionally plotted, and finally the
    user is asked on stdin which of the outputs should be synthesized.

    Args:
        file_name: Name of the HDF5 file inside ``config.voice_dir``. The name
            with its last four characters removed is used as output prefix.
        singer_index: Value the singer placeholder is filled with.
        file_path: Not referenced in the body.
        show_plots: If true, two matplotlib figures are shown before the prompt
            and an empty answer to the prompt means "N"; otherwise an empty
            answer means "GBO".
        save_file: NOTE(review): the passed value is never read; both branches
            below overwrite it with the answer typed at the ``input`` prompt.
    """

    # NOTE(review): neither this file nor ``voc_file`` below is closed in this function.
    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')
    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    with tf.Graph().as_default():

        # ---- Graph inputs -------------------------------------------------
        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size,
                                                  config.max_phr_len, 66),
                                           name='input_placeholder')
        tf.summary.histogram('inputs', input_placeholder)

        # Declared but not fed or fetched anywhere in this function.
        output_placeholder = tf.placeholder(tf.float32,
                                            shape=(config.batch_size,
                                                   config.max_phr_len, 64),
                                            name='output_placeholder')

        f0_input_placeholder = tf.placeholder(tf.float32,
                                              shape=(config.batch_size,
                                                     config.max_phr_len, 1),
                                              name='f0_input_placeholder')

        rand_input_placeholder = tf.placeholder(tf.float32,
                                                shape=(config.batch_size,
                                                       config.max_phr_len, 4),
                                                name='rand_input_placeholder')

        # Not passed to any network in the active code (only in the
        # commented-out singer model further down).
        prob = tf.placeholder_with_default(1.0, shape=())

        # Integer phoneme ids; only used (one-hot encoded) by the discriminator.
        phoneme_labels = tf.placeholder(tf.int32,
                                        shape=(config.batch_size,
                                               config.max_phr_len),
                                        name='phoneme_placeholder')
        phone_onehot_labels = tf.one_hot(indices=tf.cast(
            phoneme_labels, tf.int32),
                                         depth=42)

        # Phoneme input of the final network and of the generator.
        # NOTE(review): declared with a trailing dimension of 42, but the loop
        # below feeds an array reshaped to (batch_size, max_phr_len) — confirm.
        phoneme_labels_2 = tf.placeholder(tf.float32,
                                          shape=(config.batch_size,
                                                 config.max_phr_len, 42),
                                          name='phoneme_placeholder_1')
        # phone_onehot_labels = tf.one_hot(indices=tf.cast(phoneme_labels, tf.int32), depth=42)

        singer_labels = tf.placeholder(tf.float32,
                                       shape=(config.batch_size),
                                       name='singer_placeholder')
        singer_onehot_labels = tf.one_hot(indices=tf.cast(
            singer_labels, tf.int32),
                                          depth=12)

        # ---- Networks -----------------------------------------------------
        with tf.variable_scope('phone_Model') as scope:
            # regularizer = tf.contrib.layers.l2_regularizer(scale=0.1)
            pho_logits = modules.phone_network(input_placeholder)
            pho_classes = tf.argmax(pho_logits, axis=-1)
            pho_probs = tf.nn.softmax(pho_logits)

        with tf.variable_scope('Final_Model') as scope:
            # First pass: driven by the fed phoneme input.
            voc_output = modules.final_net(singer_onehot_labels,
                                           f0_input_placeholder,
                                           phoneme_labels_2)
            voc_output_decoded = tf.nn.sigmoid(voc_output)
            # Second pass shares the same variables but is driven by the
            # phoneme probabilities predicted by the phone network.
            scope.reuse_variables()
            voc_output_3 = modules.final_net(singer_onehot_labels,
                                             f0_input_placeholder, pho_probs)
            voc_output_3_decoded = tf.nn.sigmoid(voc_output_3)

            # scope.reuse_variables()

            # voc_output_gen = modules.final_net(singer_onehot_labels, f0_input_placeholder, pho_probs)
            # voc_output_decoded_gen = tf.nn.sigmoid(voc_output_gen)

        # with tf.variable_scope('singer_Model') as scope:
        #     singer_embedding, singer_logits = modules.singer_network(input_placeholder, prob)
        #     singer_classes = tf.argmax(singer_logits, axis=-1)
        #     singer_probs = tf.nn.softmax(singer_logits)

        with tf.variable_scope('Generator') as scope:
            voc_output_2 = modules.GAN_generator(singer_onehot_labels,
                                                 phoneme_labels_2,
                                                 f0_input_placeholder,
                                                 rand_input_placeholder)

        # The discriminator output is never fetched below; presumably it is
        # built so that its variables exist for the Saver — confirm.
        with tf.variable_scope('Discriminator') as scope:
            D_fake = modules.GAN_discriminator(voc_output_2,
                                               singer_onehot_labels,
                                               phone_onehot_labels,
                                               f0_input_placeholder)

        # ---- Session and checkpoint --------------------------------------
        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()

        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir)

        # Without a checkpoint the freshly initialised variables are used as is.
        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        # saver.restore(sess, './log/model.ckpt-3999')

        # import pdb;pdb.set_trace()

        # ---- Input preparation -------------------------------------------
        voc_file = h5py.File(config.voice_dir + file_name, "r")

        # speaker_file = h5py.File(config.voice_dir+speaker_file, "r")

        feats = np.array(voc_file['feats'])
        # feats = utils.input_to_feats('./54228_chorus.wav_ori_vocals.wav', mode = 1)

        # Basic slicing returns a view, so the in-place fill two statements
        # below also changes column -2 of ``feats``.
        f0 = feats[:, -2]

        # import pdb;pdb.set_trace()

        # Replace zero entries by the median of the non-zero ones.
        med = np.median(f0[f0 > 0])

        f0[f0 == 0] = med

        # Lower the f0 column by 12, both in the separate copy and in ``feats``.
        f0 = f0 - 12

        feats[:, -2] = feats[:, -2] - 12

        # Min-max normalisation with the stored statistics.
        f0_nor = (f0 - min_feat[-2]) / (max_feat[-2] - min_feat[-2])

        feats = (feats - min_feat) / (max_feat - min_feat)

        pho_target = np.array(voc_file["phonemes"])

        in_batches_f0, nchunks_in = utils.generate_overlapadd(
            f0_nor.reshape(-1, 1))

        in_batches_pho, nchunks_in_pho = utils.generate_overlapadd(
            pho_target.reshape(-1, 1))

        in_batches_feat, kaka = utils.generate_overlapadd(feats)

        # import pdb;pdb.set_trace()

        # ---- Inference ----------------------------------------------------
        out_batches_feats = []

        out_batches_feats_1 = []

        out_batches_feats_gan = []

        for in_batch_f0, in_batch_pho_target, in_batch_feat in zip(
                in_batches_f0, in_batches_pho, in_batches_feat):

            in_batch_f0 = in_batch_f0.reshape(
                [config.batch_size, config.max_phr_len, 1])

            in_batch_pho_target = in_batch_pho_target.reshape(
                [config.batch_size, config.max_phr_len])

            # in_batch_pho_target = sess.run(pho_probs, feed_dict = {input_placeholder: in_batch_feat})

            # NOTE(review): the literal 30 below is used where the placeholders
            # use config.batch_size — presumably the two are equal; confirm.
            # The noise is drawn with loc=-1.0 and scale=1.0.
            output_feats, output_feats_1, output_feats_gan = sess.run(
                [voc_output_decoded, voc_output_3_decoded, voc_output_2],
                feed_dict={
                    input_placeholder:
                    in_batch_feat,
                    f0_input_placeholder:
                    in_batch_f0,
                    phoneme_labels_2:
                    in_batch_pho_target,
                    singer_labels:
                    np.ones(30) * singer_index,
                    rand_input_placeholder:
                    np.random.normal(-1.0,
                                     1.0,
                                     size=[30, config.max_phr_len, 4])
                })

            out_batches_feats.append(output_feats)

            out_batches_feats_1.append(output_feats_1)

            # Halve and shift the generator output; presumably maps a
            # [-1, 1] output range onto [0, 1] — confirm against the generator.
            out_batches_feats_gan.append(output_feats_gan / 2 + 0.5)

            # out_batches_voc_stft_phase.append(output_voc_stft_phase)

        # import pdb;pdb.set_trace()

        # ---- Reassemble the batches --------------------------------------
        out_batches_feats = np.array(out_batches_feats)
        # import pdb;pdb.set_trace()
        out_batches_feats = utils.overlapadd(out_batches_feats, nchunks_in)

        out_batches_feats_1 = np.array(out_batches_feats_1)
        # import pdb;pdb.set_trace()
        out_batches_feats_1 = utils.overlapadd(out_batches_feats_1, nchunks_in)

        out_batches_feats_gan = np.array(out_batches_feats_gan)
        # import pdb;pdb.set_trace()
        out_batches_feats_gan = utils.overlapadd(out_batches_feats_gan,
                                                 nchunks_in)

        # ---- Undo the min-max normalisation ------------------------------
        feats = feats * (max_feat - min_feat) + min_feat

        # The network outputs are scaled with all statistics except the last two.
        out_batches_feats = out_batches_feats * (max_feat[:-2] -
                                                 min_feat[:-2]) + min_feat[:-2]

        out_batches_feats_1 = out_batches_feats_1 * (
            max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]

        out_batches_feats_gan = out_batches_feats_gan * (
            max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]

        # Trim the outputs to the number of input frames.
        out_batches_feats = out_batches_feats[:len(feats)]

        out_batches_feats_1 = out_batches_feats_1[:len(feats)]

        out_batches_feats_gan = out_batches_feats_gan[:len(feats)]

        # Append the last two columns of the input features to every output.
        first_op = np.concatenate([out_batches_feats, feats[:, -2:]], axis=-1)

        pho_op = np.concatenate([out_batches_feats_1, feats[:, -2:]], axis=-1)

        gan_op = np.concatenate([out_batches_feats_gan, feats[:, -2:]],
                                axis=-1)

        # import pdb;pdb.set_trace()
        gan_op = np.ascontiguousarray(gan_op)

        # ``pho_op`` is computed but not written out by the active code below.
        pho_op = np.ascontiguousarray(pho_op)

        first_op = np.ascontiguousarray(first_op)

        # ---- Plots and interactive selection -----------------------------
        if show_plots:

            # Figure 1: first 60 feature columns of input, final-net output
            # and generator output.
            plt.figure(1)

            ax1 = plt.subplot(311)

            plt.imshow(feats[:, :60].T, aspect='auto', origin='lower')

            ax1.set_title("Ground Truth Vocoder Features", fontsize=10)

            ax2 = plt.subplot(312, sharex=ax1, sharey=ax1)

            plt.imshow(out_batches_feats[:, :60].T,
                       aspect='auto',
                       origin='lower')

            ax2.set_title("Cross Entropy Output Vocoder Features", fontsize=10)

            ax3 = plt.subplot(313, sharex=ax1, sharey=ax1)

            ax3.set_title("GAN Vocoder Output Features", fontsize=10)

            # plt.imshow(out_batches_feats_1[:,:60].T,aspect='auto',origin='lower')
            #
            # plt.subplot(414, sharex = ax1, sharey = ax1)

            plt.imshow(out_batches_feats_gan[:, :60].T,
                       aspect='auto',
                       origin='lower')

            # Figure 2: input columns 60:-2 against the last four output columns.
            plt.figure(2)

            plt.subplot(211)

            plt.imshow(feats[:, 60:-2].T, aspect='auto', origin='lower')

            plt.subplot(212)

            plt.imshow(out_batches_feats[:, -4:].T,
                       aspect='auto',
                       origin='lower')

            plt.show()

            # An empty answer falls back to "N", which matches none of the
            # letters tested below.
            save_file = input(
                "Which files to synthesise G for GAN, B for Binary Entropy, "
                "O for original, or any combination. Default is None").upper(
                ) or "N"

        else:
            # An empty answer falls back to all three outputs.
            save_file = input(
                "Which files to synthesise G for GAN, B for Binary Entropy, "
                "O for original, or any combination. Default is all (GBO)"
            ).upper() or "GBO"

        # NOTE(review): the messages below print a path under config.val_dir,
        # while utils.feats_to_audio only receives the bare file name — where
        # the file is actually written cannot be seen from here.
        if "G" in save_file:

            utils.feats_to_audio(gan_op[:, :], file_name[:-4] + 'gan_op.wav')

            print("GAN file saved to {}".format(
                os.path.join(config.val_dir, file_name[:-4] + 'gan_op.wav')))

        if "O" in save_file:

            utils.feats_to_audio(feats[:, :], file_name[:-4] + 'ori_op.wav')

            print("Originl file, resynthesized via WORLD vocoder saved to {}".
                  format(
                      os.path.join(config.val_dir,
                                   file_name[:-4] + 'ori_op.wav')))
            #
        if "B" in save_file:
            # # utils.feats_to_audio(pho_op[:5000,:],file_name[:-4]+'phoop.wav')
            #
            utils.feats_to_audio(first_op[:, :], file_name[:-4] + 'bce_op.wav')
            print("Binar cross entropy file saved to {}".format(
                os.path.join(config.val_dir, file_name[:-4] + 'bce_op.wav')))
Beispiel #7
0
def synth_file(file_name,
               file_path=config.wav_dir,
               show_plots=True,
               save_file=True):
    """Run the trained network on one audio file, then plot and/or synthesize.

    The dataset is selected by the prefix of ``file_name``:

    * ``'ikala'`` (first 6 characters stripped): read from ``config.wav_dir``,
      mode 0.
    * ``'mir'`` (first 4 characters stripped): read from ``config.wav_dir_mir``,
      mode 0.
    * ``'med'`` (first 4 characters stripped): read from ``config.wav_dir_med``,
      mode 2.
    * anything else: read from the current directory, mode 1.

    For the three prefixed cases the original is first written out with
    ``utils.write_ori_ikala`` / ``utils.write_ori_med``.  The mixture STFT is
    scaled by the sum of the stored vocal and backing maxima, run through
    ``modules.nr_wavenet`` in overlap-add batches, and the predicted features
    are compared with the features extracted from the same file.

    Args:
        file_name: Prefixed or plain file name, see above.
        file_path: Kept for interface compatibility; every branch above assigns
            its own directory, so the passed value is not used.
        show_plots: If true, show the comparison figures (harmonic part,
            aperiodic part, f0 contour with accuracy, voiced/unvoiced flags).
        save_file: If true, synthesize the prediction twice: once with the
            predicted last two feature columns, once with the ground-truth ones.
    """
    if file_name.startswith('ikala'):
        file_name = file_name[6:]
        file_path = config.wav_dir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
        mode = 0
    elif file_name.startswith('mir'):
        file_name = file_name[4:]
        file_path = config.wav_dir_mir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
        mode = 0
    elif file_name.startswith('med'):
        file_name = file_name[4:]
        file_path = config.wav_dir_med
        utils.write_ori_med(os.path.join(file_path, file_name), file_name)
        mode = 2
    else:
        mode = 1
        file_path = './'

    # Copy the statistics into memory and release the file handle right away.
    with h5py.File(config.stat_dir + 'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])
        max_voc = np.array(stat_file["voc_stft_maximus"])
        max_back = np.array(stat_file["back_stft_maximus"])
    max_mix = max_voc + max_back

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size,
                                                  config.max_phr_len,
                                                  config.input_features),
                                           name='input_placeholder')

        with tf.variable_scope('First_Model'):
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

        if config.use_gan:
            with tf.variable_scope('Generator'):
                gen_op = modules.GAN_generator(harm)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()

        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir_m1)

        # Without a checkpoint the freshly initialised variables are used as is.
        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        mix_stft = utils.file_to_stft(os.path.join(file_path, file_name),
                                      mode=mode)

        targs = utils.input_to_feats(os.path.join(file_path, file_name),
                                     mode=mode)

        in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)
        in_batches = in_batches / max_mix

        val_outer = []
        gan_op = []

        for in_batch in in_batches:
            feed = {input_placeholder: in_batch}
            val_harm, val_ap, val_f0, val_vuv = sess.run([harm, ap, f0, vuv],
                                                         feed_dict=feed)
            if config.use_gan:
                gan_op.append(sess.run(gen_op, feed_dict=feed))

            val_outer.append(
                np.concatenate((val_harm, val_ap, val_f0, val_vuv), axis=-1))

        val_outer = np.array(val_outer)
        val_outer = utils.overlapadd(val_outer, nchunks_in)
        # Round the last column (the vuv output) to the nearest integer.
        val_outer[:, -1] = np.round(val_outer[:, -1])
        val_outer = val_outer[:targs.shape[0], :]
        val_outer = np.clip(val_outer, 0.0, 1.0)

        if config.use_gan:
            gan_op = np.array(gan_op)
            gan_op = utils.overlapadd(gan_op, nchunks_in)

        # Bring the reference features into the same normalised range as the
        # network output so both can be plotted side by side.
        targs = (targs - min_feat) / (max_feat - min_feat)

        if show_plots:

            ins = val_outer[:, :60]
            outs = targs[:, :60]
            plt.figure(1)
            ax1 = plt.subplot(211)
            plt.imshow(ins.T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Harm ", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(outs.T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Harm ", fontsize=10)

            if config.use_gan:
                plt.figure(5)
                ax1 = plt.subplot(411)
                plt.imshow(ins.T, origin='lower', aspect='auto')
                ax1.set_title("Predicted Harm ", fontsize=10)
                ax2 = plt.subplot(414)
                plt.imshow(outs.T, origin='lower', aspect='auto')
                ax2.set_title("Ground Truth Harm ", fontsize=10)
                ax1 = plt.subplot(412)
                plt.imshow(gan_op.T, origin='lower', aspect='auto')
                ax1.set_title("GAN output ", fontsize=10)
                ax1 = plt.subplot(413)
                plt.imshow((gan_op[:ins.shape[0], :] + ins).T,
                           origin='lower',
                           aspect='auto')
                ax1.set_title("GAN output ", fontsize=10)

            plt.figure(2)
            ax1 = plt.subplot(211)
            plt.imshow(val_outer[:, 60:-2].T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Aperiodic Part", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(targs[:, 60:-2].T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Aperiodic Part", fontsize=10)

            plt.figure(3)

            # De-normalise f0 as x * (max - min) + min, the inverse of the
            # normalisation applied to ``targs`` above.  (The previous
            # parenthesisation multiplied by ((max - min) + min) instead.)
            f0_range = max_feat[-2] - min_feat[-2]
            f0_output = val_outer[:, -2] * f0_range + min_feat[-2]
            # Mask with the ground-truth last column and hide masked frames.
            f0_output = f0_output * (1 - targs[:, -1])
            f0_output[f0_output == 0] = np.nan
            plt.plot(f0_output, label="Predicted Value")
            f0_gt = targs[:, -2] * f0_range + min_feat[-2]
            f0_gt = f0_gt * (1 - targs[:, -1])
            f0_gt[f0_gt == 0] = np.nan
            plt.plot(f0_gt, label="Ground Truth")
            # Frames whose absolute f0 error exceeds the threshold count as wrong.
            f0_difference = np.nan_to_num(abs(f0_gt - f0_output))
            f0_greater = np.where(f0_difference > config.f0_threshold)
            diff_per = f0_greater[0].shape[0] / len(f0_output)
            plt.suptitle("Percentage correct = " +
                         '{:.3%}'.format(1 - diff_per))
            plt.legend()
            plt.figure(4)
            ax1 = plt.subplot(211)
            plt.plot(val_outer[:, -1])
            ax1.set_title("Predicted Voiced/Unvoiced", fontsize=10)
            ax2 = plt.subplot(212)
            plt.plot(targs[:, -1])
            ax2.set_title("Ground Truth Voiced/Unvoiced", fontsize=10)
            plt.show()
        if save_file:

            val_outer = np.ascontiguousarray(val_outer *
                                             (max_feat - min_feat) + min_feat)
            targs = np.ascontiguousarray(targs * (max_feat - min_feat) +
                                         min_feat)

            # Synthesis is best-effort, but only ordinary errors are swallowed
            # so that Ctrl-C / SystemExit still propagate.
            try:
                utils.feats_to_audio(val_outer,
                                     file_name[:-4] + '_synth_pred_f0')
                print("File saved to %s" % (config.val_dir + file_name[:-4] +
                                            '_synth_pred_f0.wav'))
            except Exception:
                print("Couldn't synthesize with predicted f0")
            try:
                # Second pass: same prediction with the ground-truth last two columns.
                val_outer[:, -2:] = targs[:, -2:]
                utils.feats_to_audio(val_outer,
                                     file_name[:-4] + '_synth_ori_f0')
                print("File saved to %s" % (config.val_dir + file_name[:-4] +
                                            '_synth_ori_f0.wav'))
            except Exception:
                print("Couldn't synthesize with original f0")
Beispiel #8
0
def synth_file(file_name="015.hdf5",
               singer_index=0,
               file_path=config.wav_dir,
               show_plots=True):
    """Synthesise audio for one feature file with the GAN feature/F0 generators.

    The function builds the inference graph, restores the most recent
    checkpoint listed in ``config.log_dir`` (when there is one), reads the
    ``world_feats``, ``phonemes`` and ``notes`` datasets from
    ``config.feats_dir + file_name`` and runs both generators over the
    overlapping chunks produced by ``utils.generate_overlapadd``.  It prints
    the mean and standard deviation of the predicted and original F0 relative
    to the input notes and finally hands two feature matrices to
    ``utils.feats_to_audio``:

    * ``<name>_gan_op``: generated features + generated F0 + the last column
      of the original features (presumably the voiced/unvoiced flag),
    * ``<name>_F0_op``: original features + generated F0 + that same column.

    ``<name>`` is ``file_name[:-4]``.
    NOTE(review): for a ``.hdf5`` name this keeps the trailing dot
    (``"015.hdf5"`` -> ``"015."``); left unchanged so output names stay stable.

    Args:
        file_name: Name of the HDF5 feature file inside ``config.feats_dir``.
        singer_index: Not used by this function; kept for call compatibility.
        file_path: Not used by this function; kept for call compatibility.
        show_plots: When true, show the F0 contour figure (blocks until the
            window is closed).
    """
    # Copy the normalisation statistics into memory so the file can be closed
    # straight away.
    with h5py.File('./stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])

    with tf.Graph().as_default():

        output_placeholder = tf.placeholder(tf.float32,
                                            shape=(config.batch_size,
                                                   config.max_phr_len, 64),
                                            name='output_placeholder')

        f0_input_placeholder = tf.placeholder(tf.float32,
                                              shape=(config.batch_size,
                                                     config.max_phr_len),
                                              name='f0_input_placeholder')
        f0_onehot_labels = tf.one_hot(indices=tf.cast(f0_input_placeholder,
                                                      tf.int32),
                                      depth=len(config.notes))

        f0_context_placeholder = tf.placeholder(tf.float32,
                                                shape=(config.batch_size,
                                                       config.max_phr_len, 1),
                                                name='f0_context_placeholder')

        phone_context_placeholder = tf.placeholder(
            tf.float32,
            shape=(config.batch_size, config.max_phr_len, 1),
            name='phone_context_placeholder')

        phoneme_labels = tf.placeholder(tf.int32,
                                        shape=(config.batch_size,
                                               config.max_phr_len),
                                        name='phoneme_placeholder')
        phone_onehot_labels = tf.one_hot(indices=tf.cast(
            phoneme_labels, tf.int32),
                                         depth=len(config.phonemas))

        # Conditioning shared by both generators.
        conditioning = [
            phone_onehot_labels, f0_onehot_labels, phone_context_placeholder,
            f0_context_placeholder
        ]

        with tf.variable_scope('Generator_feats'):
            voc_output = modules.GAN_generator(
                tf.concat(conditioning, axis=-1))

        with tf.variable_scope('Generator_f0') as scope:
            # The first call creates the F0 generator variables; its output is
            # not fetched.  The second call reuses those variables and is fed
            # with the rescaled output of the feature generator.
            modules.GAN_generator_f0(
                tf.concat(conditioning + [output_placeholder], axis=-1))

            scope.reuse_variables()

            f0_output_2 = modules.GAN_generator_f0(
                tf.concat(conditioning + [(voc_output / 2) + 0.5], axis=-1))

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # The context manager releases the session's resources on exit.
        with tf.Session() as sess:
            sess.run(init_op)

            ckpt = tf.train.get_checkpoint_state(config.log_dir)
            if ckpt and ckpt.model_checkpoint_path:
                print("Using the model in %s" % ckpt.model_checkpoint_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print("No checkpoint found in %s; running with freshly "
                      "initialised weights" % config.log_dir)

            with h5py.File(config.feats_dir + file_name, "r") as feat_file:
                feats = feat_file["world_feats"][()]
                phones = feat_file["phonemes"][()]
                notes = feat_file["notes"][()]

            # Min-max normalise with the global statistics.
            feats = (feats - min_feat) / (max_feat - min_feat)

            # Phoneme columns first, note columns last; the loop below slices
            # this array by column index.
            conditions = np.concatenate([phones, notes], axis=-1)

            in_batches_pho, nchunks_in = utils.generate_overlapadd(conditions)
            in_batches_feat, _ = utils.generate_overlapadd(feats)

            # Map every note index to its entry in config.notes, as a column.
            noters = np.expand_dims(
                np.array([config.notes[int(x)] for x in notes[:, 0]]), 1)

            out_batches_feats = []
            out_batches_f0 = []

            for conds, feat in zip(in_batches_pho, in_batches_feat):
                feed_dict = {
                    f0_input_placeholder: conds[:, :, 2],
                    phoneme_labels: conds[:, :, 0],
                    phone_context_placeholder: conds[:, :, 1:2],
                    f0_context_placeholder: conds[:, :, -1:],
                    output_placeholder: feat[:, :, :-2]
                }

                output_feats_gan, output_f0 = sess.run(
                    [voc_output, f0_output_2], feed_dict=feed_dict)

                # Same rescaling the graph applies to voc_output (x / 2 + 0.5).
                out_batches_feats.append(output_feats_gan / 2 + 0.5)
                out_batches_f0.append(output_f0 / 2 + 0.5)

        out_batches_feats = utils.overlapadd(np.array(out_batches_feats),
                                             nchunks_in)
        out_batches_f0 = utils.overlapadd(np.array(out_batches_f0),
                                          nchunks_in)

        # Undo the min-max normalisation; the last two statistic entries
        # belong to the F0 column and the final column respectively.
        feats = feats * (max_feat - min_feat) + min_feat

        out_batches_feats = out_batches_feats * (max_feat[:-2] -
                                                 min_feat[:-2]) + min_feat[:-2]
        out_batches_feats = out_batches_feats[:len(feats)]

        out_batches_f0 = out_batches_f0 * (max_feat[-2] -
                                           min_feat[-2]) + min_feat[-2]
        out_batches_f0 = out_batches_f0[:len(feats)]

        # Deviation from the input note, weighted by (1 - last column).
        diff_1 = (out_batches_f0 - noters) * (1 - feats[:, -1:])
        diff_2 = (feats[:, -2:-1] - noters) * (1 - feats[:, -1:])

        print("Mean predicted note deviation {}".format(diff_1.mean()))
        print("Mean original note deviation {}".format(diff_2.mean()))

        print("STD predicted note deviation {}".format(diff_1.std()))
        print("STD original note deviation {}".format(diff_2.std()))

        if show_plots:
            plt.figure(1)
            plt.suptitle("F0 contour")
            plt.plot(out_batches_f0, label='Predicted F0')
            plt.plot(feats[:, -2], label="Ground Truth F0")
            plt.plot(noters, label="Input Midi Note")
            plt.legend()
            plt.show()

        # Full output: generated features with the generated F0.
        first_op = np.ascontiguousarray(
            np.concatenate(
                [out_batches_feats, out_batches_f0, feats[:, -1:]], axis=-1))
        # F0-only output: original features with the generated F0.
        second_op = np.ascontiguousarray(
            np.concatenate(
                [feats[:, :-2], out_batches_f0, feats[:, -1:]], axis=-1))

        utils.feats_to_audio(first_op, file_name[:-4] + '_gan_op')
        print("Full output saved to {}".format(
            os.path.join(config.val_dir, file_name[:-4] + '_gan_op.wav')))
        utils.feats_to_audio(second_op, file_name[:-4] + '_F0_op')
        print("Only F0 saved to {}".format(
            os.path.join(config.val_dir, file_name[:-4] + '_F0_op.wav')))
def synth_file(file_name,
               file_path=config.wav_dir,
               show_plots=True,
               save_file=True):
    """Run the wavenet model frame by frame on one file and synthesise audio.

    The normalised STFT returned by ``utils.file_to_stft`` is used as
    conditioning while the model output is fed back as its own input, one
    frame per step.  The prediction is compared with the features returned by
    ``utils.input_to_feats`` for the same file.

    Args:
        file_name: Name of the input file.  A name starting with ``ikala`` or
            ``mir`` has that prefix (6 / 4 characters) stripped, overrides
            ``file_path`` with ``config.wav_dir`` / ``config.wav_dir_mir`` and
            triggers ``utils.write_ori_ikala`` for the file.
        file_path: Directory containing ``file_name``.
        show_plots: When true, show four figures comparing prediction and
            ground truth (blocks until the windows are closed).
        save_file: When true, denormalise the features and call
            ``utils.feats_to_audio`` twice: once with the predicted last two
            columns and once with those columns taken from the ground truth.
            A failure in either step is reported and does not abort the
            function.
    """
    # Flip manually to inspect every step in a figure and drop into pdb.
    debug = False
    if file_name.startswith('ikala'):
        file_name = file_name[6:]
        file_path = config.wav_dir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
    elif file_name.startswith('mir'):
        file_name = file_name[4:]
        file_path = config.wav_dir_mir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)

    # Copy the statistics into memory so the file can be closed immediately.
    with h5py.File(config.stat_dir + 'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])
        max_voc = np.array(stat_file["voc_stft_maximus"])
        max_back = np.array(stat_file["back_stft_maximus"])
    max_mix = max_voc + max_back

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(1, config.max_phr_len,
                                                  config.input_features),
                                           name='input_placeholder')
        tf.summary.histogram('conditioning', input_placeholder)
        input_placeholder_2 = tf.placeholder(tf.float32,
                                             shape=(1, config.max_phr_len,
                                                    config.output_features),
                                             name='input_placeholder_2')
        tf.summary.histogram('inputs', input_placeholder_2)

        output, vuv = modules.wavenet(input_placeholder_2, input_placeholder)

        saver = tf.train.Saver()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # The context manager releases the session's resources on exit.
        with tf.Session() as sess:
            sess.run(init_op)

            ckpt = tf.train.get_checkpoint_state(config.log_dir)

            if ckpt and ckpt.model_checkpoint_path:
                print("Using the model in %s" % ckpt.model_checkpoint_path)
                saver.restore(sess, ckpt.model_checkpoint_path)

            mix_stft = utils.file_to_stft(os.path.join(file_path, file_name))
            mix_stft = mix_stft / max_mix

            lent = len(mix_stft)

            # Pad the end so that every start index yields a full window.
            mix_stft_in = np.pad(mix_stft, [(0, config.max_phr_len), (0, 0)],
                                 'constant')

            targs = utils.input_to_feats(os.path.join(file_path, file_name))

            outputs = np.zeros(
                (1, config.max_phr_len, config.output_features))

            for index in range(lent):
                inputs = mix_stft_in[index:index + config.max_phr_len, :]
                inpy = inputs.reshape(1, config.max_phr_len, -1)
                # ``outputs`` never holds fewer than max_phr_len frames, so
                # the trailing window is the whole array until it has grown.
                outpy = outputs[:, -config.max_phr_len:, :]

                op, vu = sess.run([output, vuv],
                                  feed_dict={
                                      input_placeholder: inpy,
                                      input_placeholder_2: outpy
                                  })
                op = np.concatenate((op, vu), axis=-1)
                if debug:
                    plt.figure(1)
                    plt.subplot(311)
                    plt.imshow(np.log(inputs.T),
                               aspect='auto',
                               origin='lower')
                    plt.subplot(312)
                    plt.imshow(targs[index:index + config.max_phr_len, :].T,
                               aspect='auto',
                               origin='lower')
                    plt.subplot(313)
                    plt.imshow(outpy.reshape(config.max_phr_len, -1).T,
                               aspect='auto',
                               origin='lower')
                    plt.show()
                    import pdb
                    pdb.set_trace()

                if index > config.max_phr_len:
                    # Past the first window: keep only the newest frame.
                    outputs = np.append(outputs, op[:, -1:, :], axis=1)
                else:
                    # Still inside the first window: overwrite its prefix.
                    outputs[:, :index, :] = op[:, :index, :]

        val_outer = outputs[0]
        # Binarise the last column before clipping everything to [0, 1].
        val_outer[:, -1] = np.round(val_outer[:, -1])
        val_outer = val_outer[:targs.shape[0], :]
        val_outer = np.clip(val_outer, 0.0, 1.0)

        # Bring the ground truth onto the same min-max scale.
        targs = (targs - min_feat) / (max_feat - min_feat)

        if show_plots:
            ins = val_outer[:, :60]
            outs = targs[:, :60]
            plt.figure(1)
            ax1 = plt.subplot(211)
            plt.imshow(ins.T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Harm ", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(outs.T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Harm ", fontsize=10)
            plt.figure(2)
            ax1 = plt.subplot(211)
            plt.imshow(val_outer[:, 60:-2].T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Aperiodic Part", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(targs[:, 60:-2].T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Aperiodic Part", fontsize=10)
            plt.figure(3)
            # Both curves are masked with the ground-truth last column;
            # zeros become NaN so they are not drawn.
            uu = val_outer[:, -2] * (1 - targs[:, -1])
            uu[uu == 0] = np.nan
            plt.plot(uu, label="Predicted Value")
            uu = targs[:, -2] * (1 - targs[:, -1])
            uu[uu == 0] = np.nan
            plt.plot(uu, label="Ground Truth")
            plt.legend()
            plt.figure(4)
            ax1 = plt.subplot(211)
            plt.plot(val_outer[:, -1])
            ax1.set_title("Predicted Voiced/Unvoiced", fontsize=10)
            ax2 = plt.subplot(212)
            plt.plot(targs[:, -1])
            ax2.set_title("Ground Truth Voiced/Unvoiced", fontsize=10)
            plt.show()

        if save_file:
            # Undo the min-max normalisation before synthesis.
            val_outer = np.ascontiguousarray(val_outer *
                                             (max_feat - min_feat) + min_feat)
            targs = np.ascontiguousarray(targs * (max_feat - min_feat) +
                                         min_feat)

            # Synthesis stays best-effort, but the reason for a failure is
            # reported and KeyboardInterrupt / SystemExit are not swallowed.
            try:
                utils.feats_to_audio(val_outer,
                                     file_name[:-4] + '_synth_pred_f0')
                print("File saved to %s" %
                      (config.val_dir + file_name[:-4] +
                       '_synth_pred_f0.wav'))
            except Exception as err:
                print("Couldn't synthesize with predicted f0: {}".format(err))
            try:
                # Replace the last two columns with the ground truth.
                val_outer[:, -2:] = targs[:, -2:]
                utils.feats_to_audio(val_outer,
                                     file_name[:-4] + '_synth_ori_f0')
                print("File saved to %s" %
                      (config.val_dir + file_name[:-4] + '_synth_ori_f0.wav'))
            except Exception as err:
                print("Couldn't synthesize with original f0: {}".format(err))
    def test_file_wav(self, file_name, acap_file=None):
        """
        Run the model on a wav file, plot the result and write the synthesized audio.

        The features returned by ``self.process_file`` are plotted (next to the
        reference features when ``acap_file`` is given), converted to audio with
        ``utils.feats_to_audio`` and written to the current directory.  The
        output names are built from the part of ``file_name`` after the last
        ``/`` with its last four characters removed:

        * ``<name>_ss_pred.wav`` when ``config.f0_mode`` is ``"cont"``,
        * ``<name>_ss_pred_dis.wav`` when it is ``"discrete"``,
        * ``<name>_ori.wav`` (reference features) when ``acap_file`` is given,
        * ``<name>.npy`` with ``out_mel`` (always).

        Args:
            file_name: Path of the wav file to process.
            acap_file: Optional path passed to ``self.read_acap_file`` to obtain
                reference features for plotting and comparison.
        """
        base_name = file_name.split('/')[-1][:-4]

        # The context manager releases the session's resources on exit.
        with tf.Session() as sess:
            self.load_model(sess, log_dir=config.log_dir)
            mel = self.read_wav_file(file_name)

            if acap_file:
                feats = self.read_acap_file(acap_file)
            else:
                feats = None

            out_mel, out_atb, out_vuv = self.process_file(mel, sess)

        est_freq = None
        if config.f0_mode == "discrete":
            est_freq = utils.to_viterbi_cents(out_atb)

            # Equivalent to 69 + 12*log2(f / 440) with f = 10 * 2**(cents / 1200),
            # i.e. a MIDI note number if the cents are relative to 10 Hz
            # (presumably what to_viterbi_cents returns; confirm).
            est_freq = est_freq/100 + 12*np.log2(10) - 12*np.log2(440) + 69

        plt.figure(1)

        if acap_file:

            ax1 = plt.subplot(311)
            plt.imshow(np.log(mel.T), aspect='auto', origin='lower')
            ax1.set_title("Input STFT", fontsize=10)

            ax2 = plt.subplot(312, sharex=ax1)
            plt.imshow(feats[:, :64].T, aspect='auto', origin='lower')
            ax2.set_title("Ground Truth Vocoder Features", fontsize=10)

            ax3 = plt.subplot(313, sharex=ax1, sharey=ax2)
            plt.imshow(out_mel[:feats.shape[0]].T, aspect='auto', origin='lower')
            ax3.set_title("Output Vocoder Features", fontsize=10)

            plt.figure(4)

            ax1 = plt.subplot(211)
            plt.plot(feats[:, -1])
            ax1.set_title("Ground Truth VUV", fontsize=10)

            ax2 = plt.subplot(212, sharex=ax1, sharey=ax1)
            plt.plot(out_vuv)
            # Title the lower axes; the upper one keeps its ground-truth title.
            ax2.set_title("Output VUV", fontsize=10)

            plt.figure(3)

            if config.f0_mode == "cont":
                f0_output = out_atb[:feats.shape[0], -1]
            else:
                f0_output = est_freq[:feats.shape[0]]

            # Mask with (1 - vuv); zeros become NaN so they are not drawn.
            f0_output = f0_output*(1 - out_vuv[:feats.shape[0], 0])
            f0_output[f0_output == 0] = np.nan
            plt.plot(f0_output, label="Predicted Value")

            f0_gt = feats[:, -2]
            f0_gt = f0_gt*(1 - feats[:, -1])
            f0_gt[f0_gt == 0] = np.nan
            plt.plot(f0_gt, label="Ground Truth")

            # Frames where either curve is NaN count as a zero difference.
            f0_difference = np.nan_to_num(abs(f0_gt - f0_output))
            f0_greater = np.where(f0_difference > config.f0_threshold)
            diff_per = f0_greater[0].shape[0]/len(f0_output)
            plt.suptitle("Percentage correct = " + '{:.3%}'.format(1 - diff_per))
            plt.legend()

            plt.show()

        else:

            ax1 = plt.subplot(211)
            plt.imshow(np.log(mel.T), aspect='auto', origin='lower')
            ax1.set_title("Input STFT", fontsize=10)

            ax2 = plt.subplot(212)
            plt.imshow(out_mel.T, aspect='auto', origin='lower')
            ax2.set_title("Output Vocoder Features", fontsize=10)

            plt.show()

        if config.f0_mode == "cont":

            audio_out = utils.feats_to_audio(np.concatenate((out_mel, out_atb, out_vuv), axis=-1))
            sf.write('./{}_ss_pred.wav'.format(base_name), audio_out, config.fs)

        elif config.f0_mode == "discrete":

            est_f0 = np.expand_dims(est_freq, -1)
            if feats is not None:
                synth_feats = np.concatenate(
                    (out_mel[:feats.shape[0]], est_f0[:feats.shape[0]], feats[:, -1:]), axis=-1)
            else:
                # No reference features: fall back to the predicted vuv,
                # trimmed to the common length, instead of failing on None.
                n_frames = min(len(out_mel), len(est_f0), len(out_vuv))
                synth_feats = np.concatenate(
                    (out_mel[:n_frames], est_f0[:n_frames], out_vuv[:n_frames]), axis=-1)
            audio_out = utils.feats_to_audio(synth_feats)
            sf.write('./{}_ss_pred_dis.wav'.format(base_name), audio_out, config.fs)

        if acap_file:

            audio = utils.feats_to_audio(feats)
            sf.write('./{}_ori.wav'.format(base_name), audio, config.fs)

        np.save(base_name, out_mel)