Esempio n. 1
0
    def synth_devset(self, max_size=-1):
        """Synthesize every development file and dump the results on disk.

        For each entry of ``self.devset.files`` the method reads
        ``<file>.lab`` and ``<file>.mgc.npy``, calls
        ``self.vocoder.compute_gold_style_probs`` and
        ``self.vocoder.generate`` and writes, under ``data/output/``:
        ``<name>.style`` (style probabilities), ``<name>.mgc`` (via
        ``self.array2file``), ``<name>att.png`` (attention rendered as a
        grey-level image) and ``<name>.png`` (generated output rendered as a
        grey-level image).

        Args:
            max_size: forwarded unchanged to ``self.vocoder.generate``.
        """
        import time

        sys.stdout.write('\tSynthesizing devset\n')
        file_index = 1
        for file in self.devset.files:
            sys.stdout.write("\t\t" + str(file_index) + "/" +
                             str(len(self.devset.files)) +
                             " processing file " + file)
            sys.stdout.flush()
            file_index += 1

            # All artefacts share the same prefix: the output folder plus the
            # file name stripped of its directory part.
            out_prefix = 'data/output/' + file[file.rfind('/') + 1:]

            dio = DatasetIO()
            phones = dio.read_lab(file + ".lab")
            mgc = np.load(file + ".mgc.npy")

            start = time.time()
            style_probs = self.vocoder.compute_gold_style_probs(mgc)
            # The context manager closes the file even if a write fails.
            with open(out_prefix + ".style", 'w') as f:
                for value in style_probs.value():
                    f.write(str(value) + ' ')
                f.write('\n')

            mgc, att = self.vocoder.generate(phones,
                                             max_size=max_size,
                                             style_probs=style_probs.npvalue())
            self.array2file(mgc, out_prefix + '.mgc')

            # Attention: one image row per generated step, one column per
            # input symbol (+2, as in the original sizing).
            att = [a.value() for a in att]
            new_att = np.zeros((len(att), len(phones) + 2, 3), dtype=np.uint8)
            for ii in range(len(phones) + 2):
                for jj in range(len(att)):
                    val = np.clip(int(att[jj][ii] * 255), 0, 255)
                    new_att[jj, ii, :] = val

            from PIL import Image
            img = Image.fromarray(new_att, 'RGB')
            img.save(out_prefix + 'att.png')

            # Generated output: first axis of ``mgc`` becomes the image
            # x-axis, second axis is flipped so index 0 is the bottom row.
            height = mgc.shape[1]
            bitmap = np.zeros((height, mgc.shape[0], 3), dtype=np.uint8)
            for x in range(mgc.shape[0]):
                for y in range(height):
                    color = np.clip(mgc[x, y] * 255, 0, 255)
                    bitmap[height - y - 1, x] = [color, color, color]
            # scipy.misc.toimage no longer exists in current SciPy; for a
            # uint8 (H, W, 3) array PIL's fromarray gives the same RGB image.
            img = Image.fromarray(bitmap, 'RGB')
            img.save(out_prefix + '.png')

            stop = time.time()
            sys.stdout.write(" execution time=" + str(stop - start))
            sys.stdout.write('\n')
            sys.stdout.flush()
Esempio n. 2
0
    def synth_devset(self,
                     batch_size,
                     target_sample_rate,
                     sample=True,
                     temperature=1.0):
        """Vocode the first five development files into wave files.

        Each ``<file>.mgc.npy`` is loaded, passed to
        ``self.vocoder.synthesize`` together with ``batch_size`` and the
        result is written to ``data/output/<name>.wav`` as ``np.int16``.

        Args:
            batch_size: forwarded to ``self.vocoder.synthesize``.
            target_sample_rate: sample rate handed to ``write_wave``.
            sample: accepted but not used by this method.
            temperature: accepted but not used by this method.
        """
        import time

        sys.stdout.write('\tSynthesizing devset\n')
        for position, path in enumerate(self.devset.files[:5], 1):
            # The denominator is the size of the whole devset even though
            # only the first five entries are processed.
            banner = ("\t\t" + str(position) + "/" +
                      str(len(self.devset.files)) +
                      " processing file " + path + "\n")
            sys.stdout.write(banner)
            sys.stdout.flush()

            features = np.load(path + ".mgc.npy")

            tic = time.time()
            audio = self.vocoder.synthesize(features, batch_size)
            toc = time.time()
            sys.stdout.write(" execution time=" + str(toc - tic))
            sys.stdout.write('\n')
            sys.stdout.flush()

            writer = DatasetIO()
            wav_path = 'data/output/' + path[path.rfind('/') + 1:] + '.wav'
            writer.write_wave(wav_path,
                              audio,
                              target_sample_rate,
                              dtype=np.int16)
Esempio n. 3
0
def synthesize(speaker, input_file, output_file, params):
    """Run the encoder and the vocoder on ``input_file`` and save the audio.

    The encoder is restored from ``data/models/rnn_encoder`` (encodings from
    ``data/models/encoder.encodings``), the generated spectrogram is rendered
    to ``output_file + '.png'``, and the vocoder restored from
    ``data/models/rnn_vocoder`` produces the signal that is decoded with
    ``DatasetIO.b16_dec`` and written to ``output_file``.

    Args:
        speaker: passed to ``create_lab_input`` together with ``input_file``.
        input_file: source handed to ``create_lab_input``.
        output_file: destination wave file; also the prefix of the PNG.
        params: provides ``temperature`` and ``target_sample_rate`` and is
            passed to the ``Encoder`` and ``Vocoder`` constructors.
    """
    print("[Encoding]")
    from io_modules.dataset import Dataset, Encodings
    from models.encoder import Encoder
    from trainers.encoder import Trainer
    symbol_tables = Encodings()
    symbol_tables.load('data/models/encoder.encodings')
    text_encoder = Encoder(params, symbol_tables, runtime=True)
    text_encoder.load('data/models/rnn_encoder')

    lab_sequence = create_lab_input(input_file, speaker)
    mgc, att = text_encoder.generate(lab_sequence)
    _render_spectrogram(mgc, output_file + '.png')

    print("[Vocoding]")
    from models.vocoder import Vocoder
    from trainers.vocoder import Trainer
    wave_model = Vocoder(params, runtime=True)
    wave_model.load('data/models/rnn_vocoder')

    import time
    began = time.time()
    signal = wave_model.synthesize(mgc,
                                   batch_size=1000,
                                   temperature=params.temperature)
    ended = time.time()
    sys.stdout.write(" execution time=" + str(ended - began))
    sys.stdout.write('\n')
    sys.stdout.flush()

    from io_modules.dataset import DatasetIO
    writer = DatasetIO()
    decoded = writer.b16_dec(signal, discreete=True)
    writer.write_wave(output_file, decoded, params.target_sample_rate)
Esempio n. 4
0
    def phase_3_train_encoder(params):
        """Build the phoneme inventory and launch encoder training.

        Scans ``<file>.txt`` for every training file, assigns each distinct
        ``entry.phoneme`` the next free integer id, stores the mapping in
        ``data/models/encoder.chars`` (one ``phoneme<TAB>id`` pair per line,
        UTF-8) and starts ``Trainer.start_training(10, 1000)``.

        Args:
            params: ``params.resume`` selects whether the encoder is restored
                from ``data/models/rnn_encoder`` first; the object is also
                passed to the ``Encoder`` constructor.
        """
        import io

        from io_modules.dataset import Dataset
        from io_modules.dataset import DatasetIO
        from models.encoder import Encoder
        from trainers.encoder import Trainer
        trainset = Dataset("data/processed/train")
        devset = Dataset("data/processed/dev")
        sys.stdout.write('Found ' + str(len(trainset.files)) +
                         ' training files and ' + str(len(devset.files)) +
                         ' development files\n')

        character2int = {}
        for train_file in trainset.files:
            dio = DatasetIO()
            lab_list = dio.read_lab(train_file + ".txt")
            for entry in lab_list:
                if entry.phoneme not in character2int:
                    character2int[entry.phoneme] = len(character2int)
        sys.stdout.write('Found ' + str(len(character2int)) +
                         ' unique phonemes\n')

        # Writing through a UTF-8 text stream avoids concatenating the bytes
        # returned by ``encode`` with ``str`` (a TypeError on Python 3) and
        # the context manager guarantees the file is closed.
        with io.open('data/models/encoder.chars', 'w', encoding='utf-8') as f:
            for char, index in character2int.items():
                if isinstance(char, bytes):
                    # Byte-string phonemes are taken to be UTF-8 encoded.
                    char = char.decode('utf-8')
                f.write(u'%s\t%d\n' % (char, index))

        encoder = Encoder(params, len(character2int), character2int)
        if params.resume:
            sys.stdout.write('Resuming from previous checkpoint\n')
            encoder.load('data/models/rnn_encoder')
        trainer = Trainer(encoder, trainset, devset)
        trainer.start_training(10, 1000)
Esempio n. 5
0
def write_signal_to_file(signal, output_file, params):
    """Write ``signal`` to ``output_file`` through ``DatasetIO.write_wave``.

    The sample rate is read from ``params.target_sample_rate`` and the
    signal's own ``dtype`` is forwarded as the ``dtype`` keyword.
    """
    from io_modules.dataset import DatasetIO

    writer = DatasetIO()
    sample_rate = params.target_sample_rate
    writer.write_wave(output_file, signal, sample_rate, dtype=signal.dtype)
Esempio n. 6
0
    def start_training(self, itt_no_improve, batch_size, params):
        """Train ``self.vocoder`` on the training set, epoch after epoch.

        Before the first epoch, every 200 files and at the end of each epoch
        the devset is synthesized and the model is stored to
        ``data/models/rnn_encoder``. Files whose ``.mgc.npy`` has 2000 or
        more rows are skipped and counted with a loss of 0.

        Args:
            itt_no_improve: initial value of the loop guard. This method
                never changes it, so the loop does not end on its own when
                the value is positive.
            batch_size: accepted but not used by this method.
            params: ``no_bounds`` selects ``max_size`` (-1 or 1000) for devset
                synthesis; ``no_guided_attention`` is negated into the
                ``guided_att`` argument of ``self.vocoder.learn``.
        """
        import time
        from random import shuffle

        epoch = 1
        left_itt = itt_no_improve
        dio = DatasetIO()

        max_mgc = -1 if params.no_bounds else 1000

        self.synth_devset(max_size=max_mgc)
        self.vocoder.store('data/models/rnn_encoder')
        while left_itt > 0:
            sys.stdout.write("Starting epoch " + str(epoch) + "\n")
            sys.stdout.write("Shuffling training data\n")
            shuffle(self.trainset.files)
            total_loss = 0
            for position, path in enumerate(self.trainset.files, 1):
                sys.stdout.write("\t" + str(position) + "/" +
                                 str(len(self.trainset.files)) +
                                 " processing file " + path)
                sys.stdout.flush()

                mgc = np.load(path + ".mgc.npy")
                phones = dio.read_lab(path + ".lab")

                tic = time.time()
                if len(mgc) >= 2000:
                    sys.stdout.write(' too long, skipping')
                    loss = 0
                else:
                    loss = self.vocoder.learn(
                        phones, mgc, guided_att=not params.no_guided_attention)
                total_loss += loss
                toc = time.time()
                sys.stdout.write(' avg loss=' + str(loss) +
                                 " execution time=" + str(toc - tic))
                sys.stdout.write('\n')
                sys.stdout.flush()

                # The checkpoint test uses the counter value of the *next*
                # file, i.e. position + 1.
                if (position + 1) % 200 == 0:
                    self.synth_devset(max_size=max_mgc)
                    self.vocoder.store('data/models/rnn_encoder')

            self.synth_devset(max_size=max_mgc)
            self.vocoder.store('data/models/rnn_encoder')

            epoch += 1
Esempio n. 7
0
    def synth_devset(self, max_size=-1):
        """Generate and render output for the first five development files.

        Loads ``data/models/mean.npy`` / ``stdev.npy`` on first use (when
        ``self.mean`` is None), reads the phonemes from ``<file>.txt``, calls
        ``self.vocoder.generate`` and writes under ``data/output/``:
        ``<name>.mgc`` (via ``self.array2file``), ``<name>att.png``
        (attention) and ``<name>.png`` (generated output as grey levels).

        Args:
            max_size: forwarded unchanged to ``self.vocoder.generate``.
        """
        import time

        if self.mean is None:
            self.mean = np.load('data/models/mean.npy')
            self.stdev = np.load('data/models/stdev.npy')
        sys.stdout.write('\tSynthesizing devset\n')
        file_index = 1
        for file in self.devset.files[:5]:
            sys.stdout.write("\t\t" + str(file_index) + "/" +
                             str(len(self.devset.files)) +
                             " processing file " + file)
            sys.stdout.flush()
            file_index += 1

            # Shared prefix of every artefact written for this file.
            out_prefix = 'data/output/' + file[file.rfind('/') + 1:]

            dio = DatasetIO()
            lab = dio.read_lab(file + ".txt")
            phones = [entry.phoneme for entry in lab]

            start = time.time()
            mgc, att = self.vocoder.generate(phones, max_size=max_size)

            # NOTE(review): ``_denormalize`` is applied twice before the data
            # is written and ``_normalize`` once afterwards. Whether this is
            # intentional depends on whether those helpers work in place, so
            # the call sequence is kept exactly as it was - confirm.
            mgc = self._denormalize(mgc, mean=self.mean, stdev=self.stdev)
            self.array2file(
                self._denormalize(mgc, mean=self.mean, stdev=self.stdev),
                out_prefix + '.mgc')
            mgc = self._normalize(mgc, mean=self.mean, stdev=self.stdev)

            att = [a.value() for a in att]
            new_att = np.zeros((len(att), len(phones) + 2, 3), dtype=np.uint8)
            for ii in range(len(phones) + 2):
                for jj in range(len(att)):
                    val = np.clip(int(att[jj][ii] * 255), 0, 255)
                    new_att[jj, ii, :] = val

            from PIL import Image
            img = Image.fromarray(new_att, 'RGB')
            img.save(out_prefix + 'att.png')

            bitmap = np.zeros((mgc.shape[1], mgc.shape[0], 3), dtype=np.uint8)
            # ``range`` instead of ``xrange`` so the loop also runs on
            # Python 3.
            for x in range(mgc.shape[0]):
                for y in range(mgc.shape[1]):
                    color = np.clip(mgc[x, y] * 255, 0, 255)
                    bitmap[y, x] = [color, color, color]
            # scipy.misc.toimage no longer exists in current SciPy; for a
            # uint8 (H, W, 3) array PIL's fromarray gives the same RGB image.
            img = Image.fromarray(bitmap, 'RGB')
            img.save(out_prefix + '.png')

            stop = time.time()
            sys.stdout.write(" execution time=" + str(stop - start))
            sys.stdout.write('\n')
            sys.stdout.flush()
Esempio n. 8
0
    def phase_3_train_encoder(params):
        """Prepare the encodings and launch encoder training.

        On a fresh run every ``<file>.lab`` of the training set is fed entry
        by entry into ``Encodings.update`` and the result is stored in
        ``data/models/encoder.encodings``; when ``params.resume`` is set the
        encodings and the model (``data/models/rnn_encoder``) are loaded
        instead. Training is started with
        ``trainer.start_training(10, 1000, params)``.

        Args:
            params: reads ``resume``, ``no_guided_attention`` and
                ``no_bounds``; also passed to ``Encoder`` and the trainer.
        """
        from io_modules.dataset import Dataset
        from io_modules.dataset import Encodings
        from models.encoder import Encoder
        from trainers.encoder import Trainer

        trainset = Dataset("data/processed/train")
        devset = Dataset("data/processed/dev")
        sys.stdout.write('Found ' + str(len(trainset.files)) +
                         ' training files and ' + str(len(devset.files)) +
                         ' development files\n')

        encodings = Encodings()
        count = 0
        if params.resume:
            encodings.load('data/models/encoder.encodings')
        else:
            from io_modules.dataset import DatasetIO
            for count, train_file in enumerate(trainset.files, 1):
                # Refresh the progress line once every 100 files.
                if count % 100 == 0:
                    sys.stdout.write('\r' + str(count) + '/' +
                                     str(len(trainset.files)) +
                                     ' processed files')
                    sys.stdout.flush()
                reader = DatasetIO()
                for entry in reader.read_lab(train_file + ".lab"):
                    encodings.update(entry)
            sys.stdout.write('\r' + str(count) + '/' +
                             str(len(trainset.files)) + ' processed files\n')
            sys.stdout.write('Found ' + str(len(encodings.char2int)) +
                             ' unique symbols, ' +
                             str(len(encodings.context2int)) +
                             ' unique features and ' +
                             str(len(encodings.speaker2int)) +
                             ' unique speakers\n')
            encodings.store('data/models/encoder.encodings')

        # runtime=True avoids orthonormal initialization when resuming.
        runtime = True if params.resume else False
        encoder = Encoder(params, encodings, runtime=runtime)
        if params.resume:
            sys.stdout.write('Resuming from previous checkpoint\n')
            encoder.load('data/models/rnn_encoder')
        if params.no_guided_attention:
            sys.stdout.write('Disabling guided attention\n')
        if params.no_bounds:
            sys.stdout.write(
                'Using internal stopping condition for synthesis\n')

        Trainer(encoder, trainset, devset).start_training(10, 1000, params)
Esempio n. 9
0
    def start_training(self,
                       itt_no_improve,
                       batch_size,
                       target_sample_rate,
                       params=None):
        """Train ``self.vocoder`` on pairs of wave and ``.mgc.npy`` files.

        The devset is rendered and the model stored to
        ``self.target_output_path`` before the first epoch. The model is
        stored and the devset synthesized again whenever the running file
        counter is a multiple of 5000 and at the end of every epoch.

        Args:
            itt_no_improve: initial value of the loop guard. This method
                never changes it, so the loop does not end on its own when
                the value is positive.
            batch_size: forwarded to ``self.vocoder.learn`` and
                ``self.synth_devset``.
            target_sample_rate: forwarded to ``self.synth_devset``.
            params: accepted but not used by this method.
        """
        import time
        from random import shuffle

        epoch = 1
        left_itt = itt_no_improve
        dio = DatasetIO()
        self._render_devset()
        sys.stdout.write("\n")
        self.vocoder.store(self.target_output_path)

        num_files = 0
        while left_itt > 0:
            sys.stdout.write("Starting epoch " + str(epoch) + "\n")
            sys.stdout.write("Shuffling training data\n")
            shuffle(self.trainset.files)
            total_loss = 0
            for position, path in enumerate(self.trainset.files, 1):
                num_files += 1
                sys.stdout.write("\t" + str(position) + "/" +
                                 str(len(self.trainset.files)) +
                                 " processing file " + path + '\n')
                sys.stdout.flush()

                mgc = np.load(path + ".mgc.npy")
                data, sample_rate = dio.read_wave(path + ".orig.wav")
                wave_disc = np.array(data, dtype=np.float32)

                tic = time.time()
                loss = self.vocoder.learn(wave_disc, mgc, batch_size)
                total_loss += loss
                toc = time.time()
                sys.stdout.write(' avg loss=' + str(loss) +
                                 " execution time=" + str(toc - tic))
                sys.stdout.write('\n')
                sys.stdout.flush()

                # The checkpoint test uses the counter value of the *next*
                # file, i.e. position + 1.
                if (position + 1) % 5000 == 0:
                    self.vocoder.store(self.target_output_path)
                    self.synth_devset(batch_size, target_sample_rate)

            self.vocoder.store(self.target_output_path)
            self.synth_devset(batch_size, target_sample_rate)

            epoch += 1
Esempio n. 10
0
    def start_training(self, itt_no_improve, batch_size, target_sample_rate):
        """Train ``self.vocoder`` on encoded waveforms and ``.mgc.npy`` data.

        Each wave file is encoded with ``dio.ulaw_encode`` when
        ``self.use_ulaw`` is set and with ``dio.b16_enc`` otherwise. The
        devset is synthesized and the model stored to
        ``data/models/rnn_vocoder`` whenever the running file counter is a
        multiple of 50 and at the end of every epoch.

        Args:
            itt_no_improve: initial value of the loop guard. This method
                never changes it, so the loop does not end on its own when
                the value is positive.
            batch_size: forwarded to ``self.vocoder.learn`` and
                ``self.synth_devset``.
            target_sample_rate: forwarded to ``self.synth_devset``.
        """
        import time
        from random import shuffle

        epoch = 1
        left_itt = itt_no_improve
        dio = DatasetIO()
        self._render_devset()
        sys.stdout.write("\n")
        self.vocoder.store('data/models/rnn_vocoder')
        while left_itt > 0:
            sys.stdout.write("Starting epoch " + str(epoch) + "\n")
            sys.stdout.write("Shuffling training data\n")
            shuffle(self.trainset.files)
            total_loss = 0
            for position, path in enumerate(self.trainset.files, 1):
                sys.stdout.write("\t" + str(position) + "/" +
                                 str(len(self.trainset.files)) +
                                 " processing file " + path)
                sys.stdout.flush()

                mgc = np.load(path + ".mgc.npy")
                data, sample_rate = dio.read_wave(path + ".orig.wav")
                if not self.use_ulaw:
                    wave_disc = dio.b16_enc(data)
                else:
                    # Only the first of the two returned values is used.
                    wave_disc, ulaw_cont = dio.ulaw_encode(data)

                tic = time.time()
                loss = self.vocoder.learn(wave_disc, mgc, batch_size)
                total_loss += loss
                toc = time.time()
                sys.stdout.write(' avg loss=' + str(loss) +
                                 " execution time=" + str(toc - tic))
                sys.stdout.write('\n')
                sys.stdout.flush()

                # The checkpoint test uses the counter value of the *next*
                # file, i.e. position + 1.
                if (position + 1) % 50 == 0:
                    self.synth_devset(batch_size, target_sample_rate)
                    self.vocoder.store('data/models/rnn_vocoder')

            self.synth_devset(batch_size, target_sample_rate)
            self.vocoder.store('data/models/rnn_vocoder')

            epoch += 1
Esempio n. 11
0
 def __init__(self, params, wavenet):
     """Create the parallel vocoder network and its Adam optimiser.

     Args:
         params: provides ``target_sample_rate``, ``mgc_order`` and
             ``learning_rate``.
         wavenet: stored unchanged as ``self.wavenet``.
     """
     receptive_field = 512  # original note: this means 16ms
     upsample = int(12.5 * params.target_sample_rate / 1000)

     self.params = params
     self.UPSAMPLE_COUNT = upsample
     self.RECEPTIVE_SIZE = receptive_field
     self.MGC_SIZE = params.mgc_order
     self.dio = DatasetIO()
     self.vocoder = MelVocoder()
     self.wavenet = wavenet

     net = ParallelVocoderNetwork(receptive_field=self.RECEPTIVE_SIZE,
                                  filter_size=64)
     self.network = net.to(device)
     self.trainer = torch.optim.Adam(self.network.parameters(),
                                     lr=self.params.learning_rate)
Esempio n. 12
0
    def __init__(self, params, model=None, runtime=False):
        """Allocate the DyNet parameters of the three MLPs and output layers.

        Three identically shaped stacks (excitation, filter, voiced/unvoiced)
        are created in that order, each with one weight/bias pair per entry
        of ``self.HIDDEN_SIZE`` and ``params.mgc_order`` inputs, followed by
        the three output projections and an Adam trainer.

        Args:
            params: provides ``target_sample_rate``, ``mgc_order`` and
                ``learning_rate``.
            model: existing DyNet model to add parameters to; a new
                ``dy.Model()`` is created when None.
            runtime: accepted but not used by this method.
        """
        self.params = params
        self.HIDDEN_SIZE = [1000, 1000]

        self.FFT_SIZE = 513
        self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
        self.FILTER_SIZE = 128

        self.sparse = False
        self.model = dy.Model() if model is None else model

        def build_stack():
            # One (weights, biases) list pair; also reports the width of the
            # last layer so the output projections can be sized from it.
            weights, biases = [], []
            fan_in = self.params.mgc_order
            for width in self.HIDDEN_SIZE:
                weights.append(self.model.add_parameters((width, fan_in)))
                biases.append(self.model.add_parameters((width)))
                fan_in = width
            return [weights, biases], fan_in

        # Creation order (excitation, filter, vuv) is kept as in the
        # original parameter allocation sequence.
        self.mlp_excitation, input_size = build_stack()
        self.mlp_filter, input_size = build_stack()
        self.mlp_vuv, input_size = build_stack()

        excitation_size = self.UPSAMPLE_COUNT + self.FILTER_SIZE - 1
        self.excitation_w = self.model.add_parameters((excitation_size, input_size))
        self.excitation_b = self.model.add_parameters((excitation_size))
        self.filter_w = self.model.add_parameters((self.FILTER_SIZE, input_size))
        self.filter_b = self.model.add_parameters((self.FILTER_SIZE))
        self.vuv_w = self.model.add_parameters((1, input_size))
        self.vuv_b = self.model.add_parameters((1))

        self.trainer = dy.AdamTrainer(self.model, alpha=params.learning_rate)
        self.dio = DatasetIO()
        self.vocoder = MelVocoder()
Esempio n. 13
0
    def __init__(self, params, model=None, runtime=False):
        """Create the vocoder network and its Adam optimiser.

        Args:
            params: provides ``target_sample_rate`` and ``learning_rate``.
            model: accepted but not used by this method.
            runtime: accepted but not used by this method.
        """
        receptive_field = 512  # original note: this means 16ms

        self.params = params
        self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
        self.RECEPTIVE_SIZE = receptive_field

        self.dio = DatasetIO()
        self.vocoder = MelVocoder()

        net = VocoderNetwork(receptive_field=self.RECEPTIVE_SIZE,
                             filter_size=256)
        self.network = net.to(device)
        self.trainer = torch.optim.Adam(self.network.parameters(),
                                        lr=self.params.learning_rate)
Esempio n. 14
0
def synthesize(speaker, input_file, output_file, params):
    """Run the encoder and the parallel vocoder and save the audio.

    The encoder is restored from ``data/models/rnn_encoder``; its output is
    rendered to ``output_file + '.png'``. A ``Vocoder`` restored from
    ``data/models/nn_vocoder`` is handed to a ``ParallelVocoder`` restored
    from ``data/models/pnn_vocoder``, which synthesizes the signal under
    ``torch.no_grad()``. The signal divided by 32768.0 is written to
    ``output_file``.

    Args:
        speaker: passed to ``create_lab_input`` together with ``input_file``.
        input_file: source handed to ``create_lab_input``.
        output_file: destination wave file; also the prefix of the PNG.
        params: provides ``batch_size`` and ``target_sample_rate`` and is
            passed to the model constructors.
    """
    from models.vocoder import device
    print(device)

    print("[Encoding]")
    from io_modules.dataset import Dataset, Encodings
    from models.encoder import Encoder
    from trainers.encoder import Trainer
    symbol_tables = Encodings()
    symbol_tables.load('data/models/encoder.encodings')
    text_encoder = Encoder(params, symbol_tables, runtime=True)
    text_encoder.load('data/models/rnn_encoder')

    lab_sequence = create_lab_input(input_file, speaker)
    mgc, att = text_encoder.generate(lab_sequence)
    _render_spectrogram(mgc, output_file + '.png')

    print("[Vocoding]")
    from models.vocoder import ParallelVocoder, Vocoder
    base_vocoder = Vocoder(params)
    base_vocoder.load('data/models/nn_vocoder')
    parallel = ParallelVocoder(params, vocoder=base_vocoder)
    parallel.load('data/models/pnn_vocoder')

    import time
    began = time.time()
    import torch
    # Inference only: no gradients are needed for synthesis.
    with torch.no_grad():
        signal = parallel.synthesize(mgc, batch_size=params.batch_size)
    ended = time.time()
    sys.stdout.write(" execution time=" + str(ended - began))
    sys.stdout.write('\n')
    sys.stdout.flush()

    from io_modules.dataset import DatasetIO
    writer = DatasetIO()
    scaled = signal / 32768.0
    writer.write_wave(output_file, scaled, params.target_sample_rate,
                      dtype=signal.dtype)
Esempio n. 15
0
    def __init__(self, params, model=None, runtime=False):
        """Create the vocoder network and its Adam optimiser.

        Args:
            params: provides ``target_sample_rate``, ``mgc_order`` and
                ``learning_rate``.
            model: accepted but not used by this method.
            runtime: accepted but not used by this method.
        """
        self.params = params
        self.HIDDEN_SIZE = [1000, 1000]
        self.FFT_SIZE = 513
        self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
        self.FILTER_SIZE = 128
        self.sparse = False

        self.dio = DatasetIO()
        self.vocoder = MelVocoder()

        net = VocoderNetwork(self.params.mgc_order, self.UPSAMPLE_COUNT)
        self.network = net.to(device)

        learning_rate = self.params.learning_rate
        self.trainer = torch.optim.Adam(self.network.parameters(),
                                        lr=learning_rate)
Esempio n. 16
0
    def start_training(self,
                       itt_no_improve,
                       batch_size,
                       target_sample_rate,
                       params=None):
        """Train ``self.vocoder``, optionally raising its sparsity over time.

        When ``self.vocoder.sparse`` is set, sparsity starts at
        ``params.sparsity_step`` percent and is raised by the same step every
        ``params.sparsity_increase`` files while it does not exceed
        ``params.sparsity_target``. The model is stored to
        ``data/models/rnn_vocoder_sparse`` (sparse) or
        ``data/models/rnn_vocoder`` (dense) before the first epoch, whenever
        the running file counter is a multiple of 50 and after every epoch.

        Args:
            itt_no_improve: initial value of the loop guard. This method
                never changes it, so the loop does not end on its own when
                the value is positive.
            batch_size: forwarded to ``self.vocoder.learn`` and
                ``self.synth_devset``.
            target_sample_rate: forwarded to ``self.synth_devset``.
            params: required when the vocoder is sparse (``sparsity_step``,
                ``sparsity_increase``, ``sparsity_target``); not accessed for
                dense models.
        """
        import time
        from random import shuffle

        def store_checkpoint():
            # Sparse and dense models are kept in separate files.
            if self.vocoder.sparse:
                self.vocoder.store('data/models/rnn_vocoder_sparse')
            else:
                self.vocoder.store('data/models/rnn_vocoder')

        def apply_sparsity(percent):
            # Both recurrent layers always receive the same sparsity level.
            self.vocoder.rnnFine.set_sparsity(float(percent) / 100)
            self.vocoder.rnnCoarse.set_sparsity(float(percent) / 100)

        epoch = 1
        left_itt = itt_no_improve
        dio = DatasetIO()
        self._render_devset()
        sys.stdout.write("\n")

        sparsity = 0
        if self.vocoder.sparse:
            print("Setting sparsity at: " + str(params.sparsity_step) + "%")
            sparsity = params.sparsity_step
            apply_sparsity(sparsity)

        store_checkpoint()

        num_files = 0

        while left_itt > 0:
            sys.stdout.write("Starting epoch " + str(epoch) + "\n")
            sys.stdout.write("Shuffling training data\n")
            shuffle(self.trainset.files)
            file_index = 1
            total_loss = 0
            for file in self.trainset.files:
                num_files += 1

                # The schedule only applies to sparse models. Previously it
                # ran unconditionally and touched ``sparsity`` (unbound for
                # dense models) and ``params`` (possibly None).
                if self.vocoder.sparse and \
                        num_files == params.sparsity_increase:
                    sparsity += params.sparsity_step
                    num_files = 0
                    if sparsity <= params.sparsity_target:
                        print("Setting sparsity at " + str(sparsity) + "%")
                        apply_sparsity(sparsity)
                    else:
                        # Past the target: clamp the counter. No new
                        # sparsity level is applied in this case.
                        sparsity = params.sparsity_target

                sys.stdout.write("\t" + str(file_index) + "/" +
                                 str(len(self.trainset.files)) +
                                 " processing file " + file)
                sys.stdout.flush()
                wav_file = file + ".orig.wav"
                mgc_file = file + ".mgc.npy"
                mgc = np.load(mgc_file)
                file_index += 1
                data, sample_rate = dio.read_wave(wav_file)
                if self.use_ulaw:
                    [wave_disc, ulaw_cont] = dio.ulaw_encode(data)
                else:
                    wave_disc = dio.b16_enc(data)

                start = time.time()
                loss = self.vocoder.learn(wave_disc, mgc, batch_size)
                total_loss += loss
                stop = time.time()
                sys.stdout.write(' avg loss=' + str(loss) +
                                 " execution time=" + str(stop - start))
                sys.stdout.write('\n')
                sys.stdout.flush()
                if file_index % 50 == 0:
                    self.synth_devset(batch_size, target_sample_rate)
                    store_checkpoint()

            self.synth_devset(batch_size, target_sample_rate)
            store_checkpoint()

            epoch += 1
Esempio n. 17
0
# Put the parent of ``currentdir`` at the front of the module search path so
# that imports below are resolved against it first. ``currentdir`` is defined
# above this excerpt.
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)


def _normalize(mgc, mean, stdev):
    for x in xrange(mgc.shape[0]):
        mgc[x] = (mgc[x] - mean) / stdev
    return mgc


if __name__ == '__main__':
    # Smoke test for the ``cube_runtime`` extension: load a stored vocoder,
    # vocode one normalised development utterance and save it as test.wav.
    import cube_runtime
    import numpy as np

    cube_runtime.print_version()
    cube_runtime.load_vocoder('../data/models/rnn_vocoder')
    mean = np.load('../data/models/mean.npy')
    stdev = np.load('../data/models/stdev.npy')
    mgc = np.load('../data/processed/dev/anca_dcnews_0127.orig.mgc.npy')
    mgc = _normalize(mgc, mean, stdev)
    # Hand the extension a C-ordered copy of the array.
    mgc = mgc.copy(order='C')
    x = cube_runtime.vocode(mgc, stdev, mean, 0.8)
    from io_modules.dataset import DatasetIO
    dio = DatasetIO()
    enc = dio.b16_to_float(x, discreete=True)
    output_file = 'test.wav'
    dio.write_wave(output_file, enc, 16000)
    # Function-call form prints the same text on Python 2 and is valid
    # syntax on Python 3 (the statement form ``print x;`` is not).
    print(x)
Esempio n. 18
0
    def start_training(self, itt_no_improve, batch_size):
        """Compute spectral statistics, then train ``self.vocoder``.

        A first pass over the training ``.mgc.npy`` files accumulates the
        per-coefficient mean and the global minimum/maximum; a second pass
        accumulates the standard deviation. The statistics are kept on
        ``self`` and saved to ``data/models/mean_encoder``,
        ``data/models/stdev_encoder`` and ``data/models/min_max_encoder``.
        Training then normalises each file with ``self._normalize`` and skips
        files with 1000 or more frames. The devset is synthesized and the
        model stored to ``data/models/rnn_encoder`` before the first epoch,
        whenever the running file counter is a multiple of 200 and after
        every epoch.

        Args:
            itt_no_improve: initial value of the loop guard. This method
                never changes it, so the loop does not end on its own when
                the value is positive.
            batch_size: passed to ``self.synth_devset`` as its first
                positional argument.
        """
        import time
        from random import shuffle

        epoch = 1
        left_itt = itt_no_improve
        dio = DatasetIO()

        sys.stdout.write("Computing mean and standard deviation for spectral parameters\n")
        file_index = 1
        mean = None
        stdev = None
        count = 0
        min_db = None
        max_db = None
        # Pass 1: sum of frames (for the mean) and global min/max.
        for file in self.trainset.files:
            sys.stdout.write("\r\tFile " + str(file_index) + "/" + str(len(self.trainset.files)))
            sys.stdout.flush()
            mgc = np.load(file + ".mgc.npy")
            if mean is None:
                mean = np.zeros((mgc.shape[1]))
                stdev = np.zeros((mgc.shape[1]))
            for frame in mgc:
                mean += frame
                max_val = frame[np.argmax(frame)]
                min_val = frame[np.argmin(frame)]

                if min_db is None or min_val < min_db:
                    min_db = min_val
                if max_db is None or max_val > max_db:
                    max_db = max_val
            count += mgc.shape[0]
            file_index += 1
        mean /= count
        file_index = 1

        # Pass 2: sum of squared deviations from the mean.
        for file in self.trainset.files:
            sys.stdout.write("\r\tFile " + str(file_index) + "/" + str(len(self.trainset.files)))
            sys.stdout.flush()
            mgc = np.load(file + ".mgc.npy")
            for frame in mgc:
                stdev += np.power((frame - mean), 2)
            file_index += 1

        stdev /= count
        stdev = np.sqrt(stdev)
        self.mean = mean
        self.stdev = stdev
        self.min_db = min_db
        self.max_db = max_db
        self._render_devset()
        sys.stdout.write("\n")
        # Explicit writes produce the same text as the former Python 2 print
        # statements and are valid on Python 3 as well.
        sys.stdout.write('mean = ' + str(mean) + '\n')
        sys.stdout.write('stdev = ' + str(stdev) + '\n')
        sys.stdout.write('min_db = ' + str(min_db) + '\n')
        sys.stdout.write('max_db = ' + str(max_db) + '\n')
        self.synth_devset(batch_size)
        np.save('data/models/mean_encoder', self.mean)
        np.save('data/models/stdev_encoder', self.stdev)
        # The context manager closes the file; no explicit close is needed.
        with open('data/models/min_max_encoder', 'w') as f:
            f.write(str(min_db) + ' ' + str(max_db) + '\n')
        self.vocoder.store('data/models/rnn_encoder')

        while left_itt > 0:
            sys.stdout.write("Starting epoch " + str(epoch) + "\n")
            sys.stdout.write("Shuffling training data\n")
            shuffle(self.trainset.files)
            file_index = 1
            total_loss = 0
            for file in self.trainset.files:
                sys.stdout.write(
                    "\t" + str(file_index) + "/" + str(len(self.trainset.files)) + " processing file " + file)
                sys.stdout.flush()

                mgc = np.load(file + ".mgc.npy")

                lab = dio.read_lab(file + ".txt")
                phones = [entry.phoneme for entry in lab]
                # custom normalization - we are now using binary divergence
                mgc = self._normalize(mgc, mean, stdev)
                file_index += 1

                start = time.time()
                if len(mgc) < 1000:
                    loss = self.vocoder.learn(phones, mgc)
                else:
                    sys.stdout.write(' too long, skipping')
                    loss = 0
                total_loss += loss
                stop = time.time()
                sys.stdout.write(' avg loss=' + str(loss) + " execution time=" + str(stop - start))
                sys.stdout.write('\n')
                sys.stdout.flush()
                if file_index % 200 == 0:
                    self.synth_devset(batch_size)
                    self.vocoder.store('data/models/rnn_encoder')

            self.synth_devset(batch_size)
            self.vocoder.store('data/models/rnn_encoder')

            epoch += 1
Esempio n. 19
0
    def phase_1_prepare_corpus(params):
        """Build ``data/processed/train`` and ``data/processed/dev``.

        An utterance is kept when both ``<base>.txt`` and ``<base>.wav`` exist
        in the scanned folder. For every kept utterance the following files
        are written to the matching output folder:

        * ``<base>.lab`` - copied when one exists next to the audio, otherwise
          produced by ``create_lab_file`` from the ``.txt`` file;
        * ``<base>.txt`` - copied verbatim;
        * ``<base>.png`` - written by ``render_spectrogram``;
        * ``<base>.orig.wav`` - the samples returned by ``DatasetIO.read_wave``;
        * ``<base>.mgc`` - the mel spectrogram, stored through ``array2file``.

        ``create_lab_file``, ``render_spectrogram`` and ``array2file`` are
        expected to be defined elsewhere in this file.

        Args:
            params: options object; this function reads ``train_folder``,
                ``dev_folder``, ``target_sample_rate`` and ``mgc_order``.

        Returns:
            None. Progress is reported on ``sys.stdout``.
        """
        from os import listdir
        from os.path import isfile, join
        from os.path import exists
        from io_modules.dataset import DatasetIO
        from io_modules.vocoder import MelVocoder
        from shutil import copyfile

        def _list_files(folder):
            """Return the names of the regular files directly in ``folder``."""
            return [f for f in listdir(folder) if isfile(join(folder, f))]

        def _paired_base_names(folder, file_names):
            """Return unique base names having both a .txt and a .wav file."""
            seen = set()  # O(1) duplicate check; the list keeps scan order
            base_names = []
            for file_name in file_names:
                # Strip the last four characters (a dot plus a three-letter
                # extension such as .txt / .wav / .lab).
                base_name = file_name[:-4]
                if base_name in seen:
                    continue
                if exists(join(folder, base_name + '.txt')) and exists(
                        join(folder, base_name + '.wav')):
                    seen.add(base_name)
                    base_names.append(base_name)
            return base_names

        def _process_split(base_folder, base_names, output_folder):
            """Convert every utterance of one split into ``output_folder``."""
            total = str(len(base_names))
            for position, base_name in enumerate(base_names, 1):
                sys.stdout.write("\r\tprocessing file " + str(position) +
                                 "/" + total)
                sys.stdout.flush()
                txt_name = base_name + '.txt'
                wav_name = base_name + '.wav'
                spc_name = base_name + '.png'
                lab_name = base_name + '.lab'

                # LAB - copy or create
                if exists(join(base_folder, lab_name)):
                    copyfile(join(base_folder, lab_name),
                             join(output_folder, lab_name))
                else:
                    create_lab_file(join(base_folder, txt_name),
                                    join(output_folder, lab_name))
                # TXT
                copyfile(join(base_folder, txt_name),
                         join(output_folder, txt_name))
                # WAVE
                data, sample_rate = dio.read_wave(
                    join(base_folder, wav_name),
                    sample_rate=params.target_sample_rate)
                mgc = vocoder.melspectrogram(
                    data,
                    sample_rate=params.target_sample_rate,
                    num_mels=params.mgc_order)
                # SPECT
                render_spectrogram(mgc, join(output_folder, spc_name))
                dio.write_wave(join(output_folder, base_name + '.orig.wav'),
                               data, sample_rate)
                array2file(mgc, join(output_folder, base_name + '.mgc'))

        # Both folders are listed up front so that a missing folder fails
        # before any progress output is printed.
        train_files_tmp = _list_files(params.train_folder)
        dev_files_tmp = _list_files(params.dev_folder)

        sys.stdout.write("Scanning training files...")
        sys.stdout.flush()
        train_files = _paired_base_names(params.train_folder, train_files_tmp)
        sys.stdout.write(" found " + str(len(train_files)) +
                         " valid training files\n")

        sys.stdout.write("Scanning development files...")
        sys.stdout.flush()
        dev_files = _paired_base_names(params.dev_folder, dev_files_tmp)
        sys.stdout.write(" found " + str(len(dev_files)) +
                         " valid development files\n")

        # One reader/writer and one vocoder are shared by both splits.
        dio = DatasetIO()
        vocoder = MelVocoder()

        _process_split(params.train_folder, train_files,
                       'data/processed/train')
        sys.stdout.write('\n')
        _process_split(params.dev_folder, dev_files, 'data/processed/dev')
        sys.stdout.write('\n')
Esempio n. 20
0
# Put the directory that contains `currentdir` at the front of sys.path.
# `currentdir` is assigned outside this excerpt; presumably this lets the
# script import packages that live one level up - confirm against the
# project layout.
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)


def _normalize(mgc, mean, stdev):
    """Standardize ``mgc`` in place and return it.

    Every row becomes ``(row - mean) / stdev``. The whole array is handled by
    a single broadcast expression, so no ``xrange`` loop is needed (that name
    does not exist on Python 3) and arrays with zero rows pass through
    unchanged.

    Args:
        mgc: numpy array to normalize; its contents are overwritten.
        mean: scalar or array subtracted from each row of ``mgc``.
        stdev: scalar or array each centered row is divided by.

    Returns:
        The same ``mgc`` object that was passed in.
    """
    # Assigning through ``[...]`` writes into the caller's buffer, matching the
    # in-place contract (and dtype) of the row-by-row version.
    mgc[...] = (mgc - mean) / stdev
    return mgc


if __name__ == '__main__':
    # Manual smoke test: run the native vocoder on a stored spectrogram and
    # save what it returns as a WAV file.
    import cube_runtime
    from io_modules.dataset import DatasetIO
    import numpy as np

    cube_runtime.print_version()
    cube_runtime.load_vocoder('../data/models/rnn_vocoder_sparse')

    # The runtime is handed a C-ordered copy of the loaded array.
    spectrogram = np.load('../test.mgc.npy').copy(order='C')
    # NOTE(review): the meaning of 0.8 is not visible here - confirm against
    # the cube_runtime.vocode signature.
    samples = cube_runtime.vocode(spectrogram, 0.8)

    writer = DatasetIO()
    pcm = np.array(samples, dtype='int16')

    # 16000 is presumably the output sample rate - confirm against
    # DatasetIO.write_wave.
    writer.write_wave('test.wav', pcm, 16000, dtype=np.int16)
    print(samples)
Esempio n. 21
0
    def phase_1_prepare_corpus(params):
        """Build ``data/processed/train`` and ``data/processed/dev``.

        An utterance is kept when both ``<base>.txt`` and ``<base>.wav`` exist
        in the scanned folder. For every kept utterance the following files
        are written to the matching output folder, where ``<stem>`` is the
        original base name or, when ``params.prefix`` is set,
        ``<prefix>_<NNNNN>`` with a counter that keeps running from the
        training split into the development split:

        * ``<stem>.lab`` - copied when one exists next to the audio, otherwise
          produced by ``create_lab_file`` (given ``params.speaker`` and the
          optional G2P model);
        * ``<stem>.txt`` - copied verbatim;
        * ``<stem>.png`` - written by ``render_spectrogram``;
        * ``<stem>.orig.wav`` - the samples returned by
          ``DatasetIO.read_wave``;
        * ``<stem>.mgc`` - the mel spectrogram, stored through ``array2file``.

        ``create_lab_file``, ``render_spectrogram`` and ``array2file`` are
        expected to be defined elsewhere in this file.

        Args:
            params: options object; this function reads ``train_folder``,
                ``dev_folder`` (may be ``None``), ``g2p`` (may be ``None``),
                ``prefix`` (may be ``None``), ``speaker``,
                ``target_sample_rate`` and ``mgc_order``.

        Returns:
            None. Progress is reported on ``sys.stdout``.
        """
        from os import listdir
        from os.path import isfile, join
        from os.path import exists
        from io_modules.dataset import DatasetIO
        from io_modules.vocoder import MelVocoder
        from shutil import copyfile

        def _list_files(folder):
            """Return the names of the regular files directly in ``folder``."""
            return [f for f in listdir(folder) if isfile(join(folder, f))]

        def _paired_base_names(folder, file_names):
            """Return unique base names having both a .txt and a .wav file."""
            seen = set()  # O(1) duplicate check; the list keeps scan order
            base_names = []
            for file_name in file_names:
                # Strip the last four characters (a dot plus a three-letter
                # extension such as .txt / .wav / .lab).
                base_name = file_name[:-4]
                if base_name in seen:
                    continue
                if exists(join(folder, base_name + '.txt')) and exists(
                        join(folder, base_name + '.wav')):
                    seen.add(base_name)
                    base_names.append(base_name)
            return base_names

        def _process_split(base_folder, base_names, output_folder,
                           processed_so_far):
            """Convert one split and return the updated running file count."""
            total = str(len(base_names))
            for position, base_name in enumerate(base_names, 1):
                processed_so_far += 1
                sys.stdout.write("\r\tprocessing file " + str(position) +
                                 "/" + total)
                sys.stdout.flush()
                txt_name = base_name + '.txt'
                wav_name = base_name + '.wav'
                lab_name = base_name + '.lab'

                # Every output file of one utterance shares the same stem.
                if params.prefix is None:
                    target_stem = base_name
                else:
                    target_stem = params.prefix + "_{:05d}".format(
                        processed_so_far)

                # LAB - copy or create
                if exists(join(base_folder, lab_name)):
                    copyfile(join(base_folder, lab_name),
                             join(output_folder, target_stem + '.lab'))
                else:
                    create_lab_file(join(base_folder, txt_name),
                                    join(output_folder, target_stem + '.lab'),
                                    speaker_name=params.speaker,
                                    g2p=g2p)
                # TXT
                copyfile(join(base_folder, txt_name),
                         join(output_folder, target_stem + '.txt'))
                # WAVE
                data, sample_rate = dio.read_wave(
                    join(base_folder, wav_name),
                    sample_rate=params.target_sample_rate)
                mgc = vocoder.melspectrogram(
                    data,
                    sample_rate=params.target_sample_rate,
                    num_mels=params.mgc_order)
                # SPECT
                render_spectrogram(mgc,
                                   join(output_folder, target_stem + '.png'))
                dio.write_wave(join(output_folder, target_stem + '.orig.wav'),
                               data, sample_rate)
                array2file(mgc, join(output_folder, target_stem + '.mgc'))
            return processed_so_far

        # Folders are listed up front so that a missing folder fails before
        # any model loading or progress output happens.
        train_files_tmp = _list_files(params.train_folder)
        if params.dev_folder is not None:
            dev_files_tmp = _list_files(params.dev_folder)
        else:
            dev_files_tmp = []

        # Optional grapheme-to-phoneme model handed to create_lab_file.
        if params.g2p is not None:
            from models.g2p import G2P
            from io_modules.encodings import Encodings
            g2p_encodings = Encodings()
            g2p_encodings.load(params.g2p + '.encodings')
            g2p = G2P(g2p_encodings)
            g2p.load(params.g2p + '-bestAcc.network')
            if exists(params.g2p + '.lexicon'):
                g2p.load_lexicon(params.g2p + '.lexicon')
        else:
            g2p = None

        sys.stdout.write("Scanning training files...")
        sys.stdout.flush()
        train_files = _paired_base_names(params.train_folder, train_files_tmp)
        sys.stdout.write(" found " + str(len(train_files)) +
                         " valid training files\n")

        sys.stdout.write("Scanning development files...")
        sys.stdout.flush()
        # With no dev folder the candidate list is empty, so the folder value
        # (None) is never joined into a path.
        dev_files = _paired_base_names(params.dev_folder, dev_files_tmp)
        sys.stdout.write(" found " + str(len(dev_files)) +
                         " valid development files\n")

        # One reader/writer and one vocoder are shared by both splits.
        dio = DatasetIO()
        vocoder = MelVocoder()

        total_files = _process_split(params.train_folder, train_files,
                                     'data/processed/train', 0)
        sys.stdout.write('\n')
        # The counter carries over, so prefixed dev names continue the
        # numbering started by the training split.
        _process_split(params.dev_folder, dev_files, 'data/processed/dev',
                       total_files)
        sys.stdout.write('\n')