Example #1
    def create_tfrecord(self, dataset_list, tfrecord_path):
        print("Start converting...")
        options = tf.python_io.TFRecordOptions(
            compression_type=tf.python_io.TFRecordCompressionType.GZIP)
        writer = tf.python_io.TFRecordWriter(path=tfrecord_path, options=options)

        def round_up(length, reduction_factor):
            # Number of frames to pad so length becomes a multiple of
            # reduction_factor (0 when it already is, not a full extra block).
            remain = length % reduction_factor
            return (reduction_factor - remain) % reduction_factor

        for dataset in dataset_list:
            audio_file_path = dataset["audio_file_path"]

            _, mel, lin = ap.get_features(audio_file_path)
            padding_length = round_up(np.shape(mel)[-1], hp.reduction_factor)
            mel = np.pad(mel, ((0, 0), (0, padding_length)), "constant", constant_values=0.)
            lin = np.pad(lin, ((0, 0), (0, padding_length)), "constant", constant_values=0.)

            script = dataset["normalized_script"]
            script = sp.encode_script(script, char2idx=self._char2idx)
            script = np.asarray(script, dtype=np.int32)

            example = tf.train.Example(
                features=tf.train.Features(
                    feature={
                        "mel": _bytes_feature(mel.tostring()),
                        "lin": _bytes_feature(lin.tostring()),
                        "script": _bytes_feature(script.tostring())
                    }
                )
            )
            writer.write(example.SerializeToString())

        writer.close()
        print("Done...")

    def _generate_batch(self):
        def round_up(length, reduction_factor):
            # Number of frames to pad so length becomes a multiple of
            # reduction_factor (0 when it already is, not a full extra block).
            remain = length % reduction_factor
            return (reduction_factor - remain) % reduction_factor

        while True:
            random_dataset_list = self._dataset_list[:]
            random.shuffle(random_dataset_list)
            for dataset in random_dataset_list:
                audio_file_path = dataset["audio_file_path"]

                _, mel, lin = ap.get_features(audio_file_path)
                padding_length = round_up(
                    np.shape(mel)[-1], hp.reduction_factor)
                mel = np.pad(mel, ((0, 0), (0, padding_length)),
                             "constant",
                             constant_values=0.)
                lin = np.pad(lin, ((0, 0), (0, padding_length)),
                             "constant",
                             constant_values=0.)

                spk = np.asarray(self._spk2idx[dataset["spk"]], dtype=np.int32)

                script = dataset["script"]
                script = sp.encode_script(script, char2idx=self._char2idx)
                script = np.asarray(script, dtype=np.int32)

                yield mel, lin, spk, script
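
create_tfrecord above relies on a _bytes_feature helper that is not included in the snippet, and _generate_batch yields variable-length NumPy arrays that still have to be batched. Below is a minimal TF 1.x sketch of both pieces; the helper definition, the loader instance name, the batch size, and the padded_shapes are assumptions rather than code from this project:

import tensorflow as tf

def _bytes_feature(value):
    # Conventional TF 1.x helper: wrap a raw byte string in a tf.train.Feature.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

# Hypothetical consumer of _generate_batch: mel/lin are 2-D (bins, frames),
# spk is a scalar id, script is a 1-D int32 sequence, so everything except
# spk is padded before batching.
dataset = tf.data.Dataset.from_generator(
    loader._generate_batch,
    output_types=(tf.float32, tf.float32, tf.int32, tf.int32))
dataset = dataset.padded_batch(
    batch_size=8,
    padded_shapes=([None, None], [None, None], [], [None]))
mel_batch, lin_batch, spk_batch, script_batch = (
    dataset.make_one_shot_iterator().get_next())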
Example #3
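main() below calls a module-level round_up that is not shown in this snippet (the kss, ap, and hp modules are likewise project-specific). A minimal sketch of the helper, mirroring the method-local version in Example #1, would be:

def round_up(length, reduction_factor):
    # Number of frames to pad so the frame count becomes a multiple of reduction_factor.
    remain = length % reduction_factor
    return (reduction_factor - remain) % reduction_factor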
def main():
    dataset_list = kss.get_dataset_list(hp.dataset_path)
    idx2char, char2idx, idx2ord = kss.get_idx2char(dataset_list)

    print(len(dataset_list))
    print(len(idx2char))
    print(idx2char)
    #print([(char.encode("utf-8"), char) for char in idx2char])
    #print([ord(char) for char in idx2char])
    #print([elem for elem in zip(idx2char, idx2ord)])

    random_dataset_list = dataset_list[:]
    random.shuffle(random_dataset_list)
    print([dataset["script"] for dataset in random_dataset_list[:5]])
    print([dataset["normalized_script"] for dataset in random_dataset_list[:5]])

    mels = []
    lins = []

    audio, mel, lin = ap.get_features(dataset_list[0]["audio_file_path"])
    audio_inverse = ap.lin_to_audio(lin)

    padding_length = round_up(np.shape(mel)[-1], hp.reduction_factor)
    mel = np.pad(mel, ((0, 0), (0, padding_length)), "constant", constant_values=0.)
    lin = np.pad(lin, ((0, 0), (0, padding_length)), "constant", constant_values=0.)

    print(dataset_list[0]["audio_file_path"])

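    # Note: librosa.output.write_wav was removed in librosa 0.8; this snippet
    # assumes an older librosa release where it still exists.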
    librosa.output.write_wav(
        "%s/audio_sample.wav" % hp.audio_save_path, audio, hp.sampling_rate
    )
    librosa.output.write_wav(
        "%s/audio_inverse_sample.wav" % hp.audio_save_path, audio_inverse, hp.sampling_rate
    )

    fig = plt.figure()
    librosa.display.specshow(lin,
                             y_axis="linear",
                             x_axis="time",
                             sr=hp.sampling_rate,
                             hop_length=hp.hop_length)
    plt.colorbar()
    plt.tight_layout()
    fig.savefig("%s/lin_sample.png" % hp.logdir_root)
    plt.clf()
    plt.cla()
    plt.close()

    fig = plt.figure()
    librosa.display.specshow(mel,
                             y_axis="linear",
                             x_axis="time",
                             sr=hp.sampling_rate,
                             hop_length=hp.hop_length)
    plt.colorbar()
    plt.tight_layout()
    fig.savefig("%s/mel_sample.png" % hp.logdir_root)
    plt.clf()
    plt.cla()
    plt.close()

    random.shuffle(dataset_list)

    for dataset in dataset_list[:10]:
        _, mel, lin = ap.get_features(dataset["audio_file_path"])
        padding_length = round_up(np.shape(mel)[-1], hp.reduction_factor)
        #mel = np.pad(mel, ((0, 0), (0, padding_length)), "constant", constant_values=0.)
        #lin = np.pad(lin, ((0, 0), (0, padding_length)), "constant", constant_values=0.)
        mel = np.reshape(mel, [-1]).tolist()
        lin = np.reshape(lin, [-1]).tolist()
        mels = mels + mel
        lins = lins + lin

    fig = plt.figure()
    n, bins, patches = plt.hist(mels, 50, density=True, facecolor="blue", alpha=0.75)
    fig.savefig("%s/hist_mel.png" % hp.logdir_root)
    plt.close(fig)

    fig = plt.figure()
    n, bins, patches = plt.hist(lins, 50, density=True, facecolor="blue", alpha=0.75)
    fig.savefig("%s/hist_lin.png" % hp.logdir_root)
    plt.close(fig)