Ejemplo n.º 1
0
def synthesize(speaker, input_file, output_file, params):
    """Run the encoder and then the vocoder for one input file.

    Steps, in order:
      1. Load the encodings from ``data/models/encoder.encodings`` and the
         encoder from ``data/models/rnn_encoder``.
      2. Build the input sequence with ``create_lab_input(input_file,
         speaker)`` and let the encoder generate ``mgc`` (the second value
         it returns is not used here).
      3. Hand ``mgc`` to ``_render_spectrogram`` together with
         ``output_file + '.png'``.
      4. Load the vocoder from ``data/models/rnn_vocoder``, synthesize a
         signal from ``mgc`` and print how long that call took.
      5. Decode the signal with ``DatasetIO.b16_dec`` and write it to
         ``output_file`` via ``DatasetIO.write_wave``.

    Args:
        speaker: Forwarded unchanged to ``create_lab_input``.
        input_file: Forwarded unchanged to ``create_lab_input``.
        output_file: Target passed to ``write_wave``; also the prefix of
            the ``.png`` path given to ``_render_spectrogram``.
        params: Configuration object handed to ``Encoder`` and ``Vocoder``;
            ``params.temperature`` and ``params.target_sample_rate`` are
            read directly here.
    """
    import time

    print("[Encoding]")
    # Dataset and Trainer are not referenced below; the imports are kept in
    # case the modules rely on import-time side effects -- TODO confirm.
    from io_modules.dataset import Dataset, Encodings
    from models.encoder import Encoder
    from trainers.encoder import Trainer

    symbol_tables = Encodings()
    symbol_tables.load('data/models/encoder.encodings')
    encoder_model = Encoder(params, symbol_tables, runtime=True)
    encoder_model.load('data/models/rnn_encoder')

    lab_sequence = create_lab_input(input_file, speaker)
    mgc, _attention = encoder_model.generate(lab_sequence)
    spectrogram_path = output_file + '.png'
    _render_spectrogram(mgc, spectrogram_path)

    print("[Vocoding]")
    from models.vocoder import Vocoder
    from trainers.vocoder import Trainer

    vocoder_model = Vocoder(params, runtime=True)
    vocoder_model.load('data/models/rnn_vocoder')

    # Only the synthesis call itself is timed.
    started_at = time.time()
    raw_signal = vocoder_model.synthesize(
        mgc, batch_size=1000, temperature=params.temperature)
    elapsed = time.time() - started_at

    sys.stdout.write(" execution time=" + str(elapsed))
    sys.stdout.write('\n')
    sys.stdout.flush()

    from io_modules.dataset import DatasetIO

    wave_io = DatasetIO()
    decoded = wave_io.b16_dec(raw_signal, discreete=True)
    wave_io.write_wave(output_file, decoded, params.target_sample_rate)
Ejemplo n.º 2
0
    def phase_3_train_encoder(params):
        """Build the phoneme vocabulary from the training set and train the encoder.

        Reads ``<train_file>.txt`` for every entry of the training
        ``Dataset``'s ``files``, gives each distinct ``entry.phoneme`` the
        next free integer index (in order of first appearance), writes the
        mapping to ``data/models/encoder.chars`` as UTF-8
        ``phoneme<TAB>index`` lines, then creates an ``Encoder`` and passes
        it to ``Trainer.start_training``.

        Args:
            params: Configuration object passed through to ``Encoder``.
                ``params.resume`` is read here: when truthy, the encoder is
                loaded from ``data/models/rnn_encoder`` before training.
        """
        import io

        from io_modules.dataset import Dataset
        from io_modules.dataset import DatasetIO
        from models.encoder import Encoder
        from trainers.encoder import Trainer

        trainset = Dataset("data/processed/train")
        devset = Dataset("data/processed/dev")
        sys.stdout.write('Found ' + str(len(trainset.files)) +
                         ' training files and ' + str(len(devset.files)) +
                         ' development files\n')

        # Assign indices in order of first appearance across all label files.
        character2int = {}
        for train_file in trainset.files:
            dio = DatasetIO()
            lab_list = dio.read_lab(train_file + ".txt")
            for entry in lab_list:
                if entry.phoneme not in character2int:
                    character2int[entry.phoneme] = len(character2int)
        sys.stdout.write('Found ' + str(len(character2int)) +
                         ' unique phonemes\n')

        # Open in text mode with an explicit UTF-8 encoding and write text:
        # concatenating the result of ``char.encode('utf-8')`` with a str
        # raises TypeError on Python 3. The ``with`` block also guarantees
        # the handle is closed if a write fails part-way through.
        with io.open('data/models/encoder.chars', 'w',
                     encoding='utf-8') as f:
            for char, index in character2int.items():
                f.write(u'{}\t{}\n'.format(char, index))

        encoder = Encoder(params, len(character2int), character2int)
        if params.resume:
            sys.stdout.write('Resuming from previous checkpoint\n')
            encoder.load('data/models/rnn_encoder')
        trainer = Trainer(encoder, trainset, devset)
        # The meaning of the two numeric arguments is defined by Trainer,
        # which is not visible from this block.
        trainer.start_training(10, 1000)
Ejemplo n.º 3
0
def load_encoder(params, base_path='data/models'):
    """Create a runtime ``Encoder`` and load its state from ``base_path``.

    The encodings are read from ``<base_path>/encoder.encodings`` and the
    encoder itself from ``<base_path>/rnn_encoder``.

    Args:
        params: Configuration object passed to the ``Encoder`` constructor.
        base_path: Directory prefix of both files; defaults to
            ``'data/models'``.

    Returns:
        The ``Encoder`` instance after ``load`` has been called on it.
    """
    from io_modules.dataset import Encodings
    from models.encoder import Encoder

    symbol_tables = Encodings()
    symbol_tables.load('%s/encoder.encodings' % base_path)

    model = Encoder(params, symbol_tables, runtime=True)
    model.load('%s/rnn_encoder' % base_path)
    return model
Ejemplo n.º 4
0
    def phase_3_train_encoder(params):
        """Prepare the encodings and start encoder training.

        On a fresh run (``params.resume`` falsy) every ``<train_file>.lab``
        of the training set is read and each entry is fed to
        ``Encodings.update``; the result is stored in
        ``data/models/encoder.encodings``. When resuming, that file is
        loaded instead and the encoder is restored from
        ``data/models/rnn_encoder``. Progress and summary lines are written
        to stdout. Finally a ``Trainer`` is created and
        ``start_training(10, 1000, params)`` is called on it.

        Args:
            params: Configuration object passed to ``Encoder`` and to
                ``Trainer.start_training``. ``params.resume``,
                ``params.no_guided_attention`` and ``params.no_bounds`` are
                read here.
        """
        from io_modules.dataset import Dataset, Encodings
        from models.encoder import Encoder
        from trainers.encoder import Trainer

        trainset = Dataset("data/processed/train")
        devset = Dataset("data/processed/dev")
        found_msg = ('Found ' + str(len(trainset.files)) +
                     ' training files and ' + str(len(devset.files)) +
                     ' development files\n')
        sys.stdout.write(found_msg)

        encodings = Encodings()
        if params.resume:
            encodings.load('data/models/encoder.encodings')
        else:
            from io_modules.dataset import DatasetIO

            processed = 0  # stays 0 when there are no training files
            for processed, train_file in enumerate(trainset.files, 1):
                # Refresh the progress line once every 100 files.
                if processed % 100 == 0:
                    sys.stdout.write('\r' + str(processed) + '/' +
                                     str(len(trainset.files)) +
                                     ' processed files')
                    sys.stdout.flush()
                reader = DatasetIO()
                for entry in reader.read_lab(train_file + ".lab"):
                    encodings.update(entry)

            sys.stdout.write('\r' + str(processed) + '/' +
                             str(len(trainset.files)) + ' processed files\n')
            summary = ('Found ' + str(len(encodings.char2int)) +
                       ' unique symbols, ' +
                       str(len(encodings.context2int)) +
                       ' unique features and ' +
                       str(len(encodings.speaker2int)) +
                       ' unique speakers\n')
            sys.stdout.write(summary)
            encodings.store('data/models/encoder.encodings')

        # runtime=True when resuming: avoid orthonormal initialization
        runtime = True if params.resume else False
        encoder = Encoder(params, encodings, runtime=runtime)

        if params.resume:
            sys.stdout.write('Resuming from previous checkpoint\n')
            encoder.load('data/models/rnn_encoder')

        # These two flags are only reported here; nothing else in this
        # function acts on them.
        if params.no_guided_attention:
            sys.stdout.write('Disabling guided attention\n')
        if params.no_bounds:
            sys.stdout.write(
                'Using internal stopping condition for synthesis\n')

        Trainer(encoder, trainset, devset).start_training(10, 1000, params)
Ejemplo n.º 5
0
def synthesize(speaker, input_file, output_file, params):
    """Run the encoder and the parallel vocoder for one input file.

    Steps, in order:
      1. Print ``models.vocoder.device``.
      2. Load the encodings from ``data/models/encoder.encodings`` and the
         encoder from ``data/models/rnn_encoder``.
      3. Build the input sequence with ``create_lab_input(input_file,
         speaker)`` and let the encoder generate ``mgc`` (the second value
         it returns is not used here).
      4. Hand ``mgc`` to ``_render_spectrogram`` together with
         ``output_file + '.png'``.
      5. Load a ``Vocoder`` from ``data/models/nn_vocoder`` and a
         ``ParallelVocoder`` (constructed with that vocoder) from
         ``data/models/pnn_vocoder``; synthesize under ``torch.no_grad()``
         and print the elapsed time.
      6. Write ``signal / 32768.0`` to ``output_file`` via
         ``DatasetIO.write_wave``.

    Args:
        speaker: Forwarded unchanged to ``create_lab_input``.
        input_file: Forwarded unchanged to ``create_lab_input``.
        output_file: Target passed to ``write_wave``; also the prefix of
            the ``.png`` path given to ``_render_spectrogram``.
        params: Configuration object handed to ``Encoder``, ``Vocoder`` and
            ``ParallelVocoder``; ``params.batch_size`` and
            ``params.target_sample_rate`` are read directly here.
    """
    import time

    from models.vocoder import device
    print(device)

    print("[Encoding]")
    # Dataset and Trainer are not referenced below; the imports are kept in
    # case the modules rely on import-time side effects -- TODO confirm.
    from io_modules.dataset import Dataset, Encodings
    from models.encoder import Encoder
    from trainers.encoder import Trainer

    symbol_tables = Encodings()
    symbol_tables.load('data/models/encoder.encodings')
    encoder_model = Encoder(params, symbol_tables, runtime=True)
    encoder_model.load('data/models/rnn_encoder')

    lab_sequence = create_lab_input(input_file, speaker)
    mgc, _attention = encoder_model.generate(lab_sequence)
    spectrogram_path = output_file + '.png'
    _render_spectrogram(mgc, spectrogram_path)

    print("[Vocoding]")
    from models.vocoder import ParallelVocoder
    from models.vocoder import Vocoder

    base_vocoder = Vocoder(params)
    base_vocoder.load('data/models/nn_vocoder')
    parallel_vocoder = ParallelVocoder(params, vocoder=base_vocoder)
    parallel_vocoder.load('data/models/pnn_vocoder')

    started_at = time.time()
    # NOTE: the torch import sits inside the timed region.
    import torch
    with torch.no_grad():
        signal = parallel_vocoder.synthesize(
            mgc, batch_size=params.batch_size)
    elapsed = time.time() - started_at

    sys.stdout.write(" execution time=" + str(elapsed))
    sys.stdout.write('\n')
    sys.stdout.flush()

    from io_modules.dataset import DatasetIO

    wave_io = DatasetIO()
    scaled_signal = signal / 32768.0
    wave_io.write_wave(output_file, scaled_signal,
                       params.target_sample_rate, dtype=signal.dtype)