def synthesize(speaker, input_file, output_file, params):
    print("[Encoding]")
    from io_modules.dataset import Encodings
    from models.encoder import Encoder
    encodings = Encodings()
    encodings.load('data/models/encoder.encodings')
    encoder = Encoder(params, encodings, runtime=True)
    encoder.load('data/models/rnn_encoder')

    # convert the input text/label file into the encoder's input sequence
    seq = create_lab_input(input_file, speaker)
    mgc, att = encoder.generate(seq)
    _render_spectrogram(mgc, output_file + '.png')

    print("[Vocoding]")
    from models.vocoder import Vocoder
    vocoder = Vocoder(params, runtime=True)
    vocoder.load('data/models/rnn_vocoder')

    import time
    start = time.time()
    signal = vocoder.synthesize(mgc, batch_size=1000, temperature=params.temperature)
    stop = time.time()
    sys.stdout.write(" execution time=" + str(stop - start))
    sys.stdout.write('\n')
    sys.stdout.flush()

    from io_modules.dataset import DatasetIO
    dio = DatasetIO()
    # decode the 16-bit discretized signal ('discreete' is the spelling used by the API)
    enc = dio.b16_dec(signal, discreete=True)
    dio.write_wave(output_file, enc, params.target_sample_rate)
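
# _render_spectrogram() is called above but defined elsewhere in the project. A
# minimal stand-in, assuming `mgc` is a 2-D numpy array (frames x coefficients)
# with values scaled to [0, 1]; the name and rendering style are illustrative,
# not the project's actual implementation:
def _render_spectrogram_sketch(mgc, output_file):
    import numpy as np
    import matplotlib
    matplotlib.use('Agg')  # off-screen rendering, no display required
    import matplotlib.pyplot as plt
    # transpose and flip so coefficient 0 sits at the bottom of the image
    plt.imsave(output_file, np.asarray(mgc).T[::-1], cmap='gray')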
def load_encoder(params, base_path='data/models'):
    from io_modules.dataset import Encodings
    from models.encoder import Encoder

    encodings = Encodings()
    encodings.load('%s/encoder.encodings' % base_path)
    encoder = Encoder(params, encodings, runtime=True)
    encoder.load('%s/rnn_encoder' % base_path)
    return encoder
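
# A quick usage sketch for load_encoder(): the encoder it returns can drive the
# same generate()/spectrogram pipeline used by synthesize() above. The .lab path
# and speaker identity below are hypothetical placeholders:
#
#   encoder = load_encoder(params)
#   seq = create_lab_input('samples/test.lab', 'speaker_01')
#   mgc, att = encoder.generate(seq)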
def phase_3_train_encoder(params):
    from io_modules.dataset import Dataset
    from io_modules.dataset import Encodings
    from io_modules.dataset import DatasetIO
    from models.encoder import Encoder
    from trainers.encoder import Trainer

    trainset = Dataset("data/processed/train")
    devset = Dataset("data/processed/dev")
    sys.stdout.write('Found ' + str(len(trainset.files)) + ' training files and ' +
                     str(len(devset.files)) + ' development files\n')

    encodings = Encodings()
    count = 0
    if not params.resume:
        # build the symbol/feature/speaker vocabularies from the training labels
        dio = DatasetIO()
        for train_file in trainset.files:
            count += 1
            if count % 100 == 0:
                sys.stdout.write('\r' + str(count) + '/' + str(len(trainset.files)) + ' processed files')
                sys.stdout.flush()
            lab_list = dio.read_lab(train_file + ".lab")
            for entry in lab_list:
                encodings.update(entry)
        sys.stdout.write('\r' + str(count) + '/' + str(len(trainset.files)) + ' processed files\n')
        sys.stdout.write('Found ' + str(len(encodings.char2int)) + ' unique symbols, ' +
                         str(len(encodings.context2int)) + ' unique features and ' +
                         str(len(encodings.speaker2int)) + ' unique speakers\n')
        encodings.store('data/models/encoder.encodings')
    else:
        encodings.load('data/models/encoder.encodings')

    # when resuming, build the model in runtime mode to avoid orthonormal initialization
    runtime = params.resume
    encoder = Encoder(params, encodings, runtime=runtime)
    if params.resume:
        sys.stdout.write('Resuming from previous checkpoint\n')
        encoder.load('data/models/rnn_encoder')
    if params.no_guided_attention:
        sys.stdout.write('Disabling guided attention\n')
    if params.no_bounds:
        sys.stdout.write('Using internal stopping condition for synthesis\n')
    trainer = Trainer(encoder, trainset, devset)
    trainer.start_training(10, 1000, params)
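
# phase_3_train_encoder() only reads a few attributes from `params` directly
# (resume, no_guided_attention, no_bounds); Encoder and Trainer read more. A
# minimal stand-in for experimentation, with illustrative values -- the real
# object comes from the command-line parser:
#
#   from argparse import Namespace
#   params = Namespace(resume=False, no_guided_attention=False, no_bounds=False)
#   phase_3_train_encoder(params)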
def synthesize(speaker, input_file, output_file, params, g2p=None):
    # PyTorch-based synthesis path; supersedes the DyNet-based synthesize() above
    # when both definitions live in the same module. `g2p` is accepted here to
    # match the caller below.
    from models.vocoder import device
    print(device)

    print("[Encoding]")
    from io_modules.dataset import Encodings
    from models.encoder import Encoder
    encodings = Encodings()
    encodings.load('data/models/encoder.encodings')
    encoder = Encoder(params, encodings, runtime=True)
    encoder.load('data/models/rnn_encoder')

    seq = create_lab_input(input_file, speaker)
    mgc, att = encoder.generate(seq)
    _render_spectrogram(mgc, output_file + '.png')

    print("[Vocoding]")
    from models.vocoder import ParallelVocoder
    from models.vocoder import Vocoder
    vocoder = Vocoder(params)
    vocoder.load('data/models/nn_vocoder')
    pvocoder = ParallelVocoder(params, vocoder=vocoder)
    pvocoder.load('data/models/pnn_vocoder')

    import time
    start = time.time()
    import torch
    with torch.no_grad():  # inference only -- no gradients needed
        signal = pvocoder.synthesize(mgc, batch_size=params.batch_size)
    stop = time.time()
    sys.stdout.write(" execution time=" + str(stop - start))
    sys.stdout.write('\n')
    sys.stdout.flush()

    from io_modules.dataset import DatasetIO
    dio = DatasetIO()
    # scale the 16-bit integer signal to [-1, 1] before writing
    dio.write_wave(output_file, signal / 32768.0, params.target_sample_rate, dtype=signal.dtype)
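
# create_lab_input() is used by both synthesize() variants but not shown in this
# file. A rough sketch of its likely shape, assuming DatasetIO.read_lab() (used
# in phase_3_train_encoder above) parses the input into per-symbol entries and
# that each entry can be tagged with the target speaker -- both assumptions:
#
#   def create_lab_input(txt_file, speaker_ident):
#       from io_modules.dataset import DatasetIO
#       dio = DatasetIO()
#       seq = dio.read_lab(txt_file)
#       for entry in seq:
#           entry.speaker = speaker_ident
#       return seq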
elif not params.txt_file:
    print("Input file is mandatory")
elif not params.output_file:
    print("Output file is mandatory")

memory = int(params.memory)
# for compatibility we have to add this parameter
params.learning_rate = 0.0001
dynet_config.set(mem=memory, random_seed=9)
if params.gpu:
    dynet_config.set_gpu()

if params.g2p is not None:
    # load the optional grapheme-to-phoneme front-end
    from models.g2p import G2P
    from io_modules.encodings import Encodings
    g2p_encodings = Encodings()
    g2p_encodings.load(params.g2p + '.encodings')
    g2p = G2P(g2p_encodings)
    g2p.load(params.g2p + '-bestAcc.network')
    if exists(params.g2p + '.lexicon'):
        g2p.load_lexicon(params.g2p + '.lexicon')
else:
    g2p = None

synthesize(params.speaker, params.txt_file, params.output_file, params, g2p=g2p)
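
# Typical invocation of this synthesis entry point (the script name and flag
# spellings are assumptions inferred from the params attributes read above):
#
#   python synthesis.py --speaker=SPEAKER_ID --txt-file=input.txt \
#          --output-file=output.wav --memory=4096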