def synth_devset(self, max_size=-1):
    sys.stdout.write('\tSynthesizing devset\n')
    file_index = 1
    for file in self.devset.files:
        sys.stdout.write("\t\t" + str(file_index) + "/" + str(len(self.devset.files)) + " processing file " + file)
        sys.stdout.flush()
        file_index += 1
        lab_file = file + ".lab"
        dio = DatasetIO()
        lab = dio.read_lab(lab_file)
        phones = lab  # [entry.phoneme for entry in lab]
        mgc_file = file + ".mgc.npy"
        mgc = np.load(mgc_file)
        import time
        start = time.time()
        style_probs = self.vocoder.compute_gold_style_probs(mgc)
        style_file = 'data/output/' + file[file.rfind('/') + 1:] + ".style"
        f = open(style_file, 'w')
        for value in style_probs.value():
            f.write(str(value) + ' ')
        f.write('\n')
        f.close()
        mgc, att = self.vocoder.generate(phones, max_size=max_size, style_probs=style_probs.npvalue())
        self.array2file(mgc, 'data/output/' + file[file.rfind('/') + 1:] + '.mgc')
        att = [a.value() for a in att]
        new_att = np.zeros((len(att), len(phones) + 2, 3), dtype=np.uint8)
        for ii in range(len(phones) + 2):
            for jj in range(len(att)):
                val = np.clip(int(att[jj][ii] * 255), 0, 255)
                new_att[jj, ii, 0] = val
                new_att[jj, ii, 1] = val
                new_att[jj, ii, 2] = val
        from PIL import Image
        img = Image.fromarray(new_att, 'RGB')
        img.save('data/output/' + file[file.rfind('/') + 1:] + 'att.png')
        output_file = 'data/output/' + file[file.rfind('/') + 1:] + '.png'
        bitmap = np.zeros((mgc.shape[1], mgc.shape[0], 3), dtype=np.uint8)
        for x in range(mgc.shape[0]):
            for y in range(mgc.shape[1]):
                val = mgc[x, y]
                color = np.clip(val * 255, 0, 255)
                bitmap[mgc.shape[1] - y - 1, x] = [color, color, color]
                # bitmap[y, x] = [color, color, color]
        # scipy.misc.toimage was removed in scipy 1.2; PIL renders the same bitmap
        img = Image.fromarray(bitmap, 'RGB')
        img.save(output_file)
        stop = time.time()
        sys.stdout.write(" execution time=" + str(stop - start))
        sys.stdout.write('\n')
        sys.stdout.flush()
def synth_devset(self, batch_size, target_sample_rate, sample=True, temperature=1.0):
    sys.stdout.write('\tSynthesizing devset\n')
    file_index = 1
    for file in self.devset.files[:5]:
        sys.stdout.write("\t\t" + str(file_index) + "/" + str(len(self.devset.files)) + " processing file " + file + "\n")
        sys.stdout.flush()
        file_index += 1
        mgc_file = file + ".mgc.npy"
        mgc = np.load(mgc_file)
        import time
        start = time.time()
        synth = self.vocoder.synthesize(mgc, batch_size)
        stop = time.time()
        sys.stdout.write(" execution time=" + str(stop - start))
        sys.stdout.write('\n')
        sys.stdout.flush()
        dio = DatasetIO()
        output_file = 'data/output/' + file[file.rfind('/') + 1:] + '.wav'
        dio.write_wave(output_file, synth, target_sample_rate, dtype=np.int16)
def synthesize(speaker, input_file, output_file, params):
    print("[Encoding]")
    from io_modules.dataset import Dataset
    from io_modules.dataset import Encodings
    from models.encoder import Encoder
    from trainers.encoder import Trainer
    encodings = Encodings()
    encodings.load('data/models/encoder.encodings')
    encoder = Encoder(params, encodings, runtime=True)
    encoder.load('data/models/rnn_encoder')
    seq = create_lab_input(input_file, speaker)
    mgc, att = encoder.generate(seq)
    _render_spectrogram(mgc, output_file + '.png')
    print("[Vocoding]")
    from models.vocoder import Vocoder
    from trainers.vocoder import Trainer
    vocoder = Vocoder(params, runtime=True)
    vocoder.load('data/models/rnn_vocoder')
    import time
    start = time.time()
    signal = vocoder.synthesize(mgc, batch_size=1000, temperature=params.temperature)
    stop = time.time()
    sys.stdout.write(" execution time=" + str(stop - start))
    sys.stdout.write('\n')
    sys.stdout.flush()
    from io_modules.dataset import DatasetIO
    dio = DatasetIO()
    enc = dio.b16_dec(signal, discreete=True)
    dio.write_wave(output_file, enc, params.target_sample_rate)
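# A minimal driver sketch for synthesize() above, assuming it lives at module
# level next to create_lab_input/_render_spectrogram. SimpleNamespace stands in
# for the project's parsed CLI params; only temperature and target_sample_rate
# are read directly by synthesize() - the Encoder/Vocoder constructors likely
# consume further attributes (mgc_order, learning_rate, ...) not listed here,
# and the speaker name and paths are hypothetical.
if __name__ == '__main__':
    from types import SimpleNamespace
    params = SimpleNamespace(temperature=0.7, target_sample_rate=16000)
    synthesize(speaker='anca', input_file='data/input.txt',
               output_file='data/output/sample.wav', params=params)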
def phase_3_train_encoder(params):
    from io_modules.dataset import Dataset
    from models.encoder import Encoder
    from trainers.encoder import Trainer
    trainset = Dataset("data/processed/train")
    devset = Dataset("data/processed/dev")
    sys.stdout.write('Found ' + str(len(trainset.files)) + ' training files and ' + str(len(devset.files)) + ' development files\n')
    character2int = {}
    for train_file in trainset.files:
        from io_modules.dataset import DatasetIO
        dio = DatasetIO()
        lab_list = dio.read_lab(train_file + ".txt")
        for entry in lab_list:
            if entry.phoneme not in character2int:
                character2int[entry.phoneme] = len(character2int)
    sys.stdout.write('Found ' + str(len(character2int)) + ' unique phonemes\n')
    # open with an explicit encoding instead of mixing bytes and str
    f = open('data/models/encoder.chars', 'w', encoding='utf-8')
    for char in character2int:
        f.write(char + '\t' + str(character2int[char]) + '\n')
    f.close()
    encoder = Encoder(params, len(character2int), character2int)
    if params.resume:
        sys.stdout.write('Resuming from previous checkpoint\n')
        encoder.load('data/models/rnn_encoder')
    trainer = Trainer(encoder, trainset, devset)
    trainer.start_training(10, 1000)
def write_signal_to_file(signal, output_file, params):
    from io_modules.dataset import DatasetIO
    dio = DatasetIO()
    dio.write_wave(output_file, signal, params.target_sample_rate, dtype=signal.dtype)
def start_training(self, itt_no_improve, batch_size, params):
    epoch = 1
    left_itt = itt_no_improve
    dio = DatasetIO()
    if params.no_bounds:
        max_mgc = -1
    else:
        max_mgc = 1000
    self.synth_devset(max_size=max_mgc)
    self.vocoder.store('data/models/rnn_encoder')
    # self.synth_devset(batch_size, target_sample_rate)
    while left_itt > 0:
        sys.stdout.write("Starting epoch " + str(epoch) + "\n")
        sys.stdout.write("Shuffling training data\n")
        from random import shuffle
        shuffle(self.trainset.files)
        file_index = 1
        total_loss = 0
        for file in self.trainset.files:
            sys.stdout.write("\t" + str(file_index) + "/" + str(len(self.trainset.files)) + " processing file " + file)
            sys.stdout.flush()
            mgc_file = file + ".mgc.npy"
            mgc = np.load(mgc_file)
            lab_file = file + ".lab"
            lab = dio.read_lab(lab_file)
            phones = lab
            file_index += 1
            import time
            start = time.time()
            if len(mgc) < 2000:
                loss = self.vocoder.learn(phones, mgc, guided_att=not params.no_guided_attention)
            else:
                sys.stdout.write(' too long, skipping')
                loss = 0
            total_loss += loss
            stop = time.time()
            sys.stdout.write(' avg loss=' + str(loss) + " execution time=" + str(stop - start))
            sys.stdout.write('\n')
            sys.stdout.flush()
            if file_index % 200 == 0:
                self.synth_devset(max_size=max_mgc)
                self.vocoder.store('data/models/rnn_encoder')
        self.synth_devset(max_size=max_mgc)
        self.vocoder.store('data/models/rnn_encoder')
        epoch += 1
def synth_devset(self, max_size=-1):
    if self.mean is None:
        self.mean = np.load('data/models/mean.npy')
        self.stdev = np.load('data/models/stdev.npy')
    sys.stdout.write('\tSynthesizing devset\n')
    file_index = 1
    for file in self.devset.files[:5]:
        sys.stdout.write("\t\t" + str(file_index) + "/" + str(len(self.devset.files)) + " processing file " + file)
        sys.stdout.flush()
        file_index += 1
        lab_file = file + ".txt"
        dio = DatasetIO()
        lab = dio.read_lab(lab_file)
        phones = [entry.phoneme for entry in lab]
        import time
        start = time.time()
        mgc, att = self.vocoder.generate(phones, max_size=max_size)
        # write the denormalized spectrogram once and keep the normalized copy
        # for rendering (the original denormalized twice, then re-normalized)
        self.array2file(self._denormalize(mgc, mean=self.mean, stdev=self.stdev),
                        'data/output/' + file[file.rfind('/') + 1:] + '.mgc')
        att = [a.value() for a in att]
        new_att = np.zeros((len(att), len(phones) + 2, 3), dtype=np.uint8)
        for ii in range(len(phones) + 2):
            for jj in range(len(att)):
                val = np.clip(int(att[jj][ii] * 255), 0, 255)
                new_att[jj, ii, 0] = val
                new_att[jj, ii, 1] = val
                new_att[jj, ii, 2] = val
        from PIL import Image
        img = Image.fromarray(new_att, 'RGB')
        img.save('data/output/' + file[file.rfind('/') + 1:] + 'att.png')
        output_file = 'data/output/' + file[file.rfind('/') + 1:] + '.png'
        bitmap = np.zeros((mgc.shape[1], mgc.shape[0], 3), dtype=np.uint8)
        for x in range(mgc.shape[0]):
            for y in range(mgc.shape[1]):
                val = mgc[x, y]
                color = np.clip(val * 255, 0, 255)
                bitmap[y, x] = [color, color, color]
        # scipy.misc.toimage was removed in scipy 1.2; PIL renders the same bitmap
        img = Image.fromarray(bitmap, 'RGB')
        img.save(output_file)
        stop = time.time()
        sys.stdout.write(" execution time=" + str(stop - start))
        sys.stdout.write('\n')
        sys.stdout.flush()
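# The _normalize/_denormalize helpers used above are not shown in this
# section. A minimal sketch of how they could sit on the trainer class,
# assuming the per-frame z-scoring the standalone scripts later in this
# section apply (mgc[x] = (mgc[x] - mean) / stdev), here vectorized via
# numpy broadcasting:
def _normalize(self, mgc, mean, stdev):
    # z-score each spectral frame with the corpus statistics
    return (mgc - mean) / stdev

def _denormalize(self, mgc, mean, stdev):
    # inverse of _normalize
    return mgc * stdev + mean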
def phase_3_train_encoder(params):
    from io_modules.dataset import Dataset
    from io_modules.dataset import Encodings
    from models.encoder import Encoder
    from trainers.encoder import Trainer
    trainset = Dataset("data/processed/train")
    devset = Dataset("data/processed/dev")
    sys.stdout.write('Found ' + str(len(trainset.files)) + ' training files and ' + str(len(devset.files)) + ' development files\n')
    encodings = Encodings()
    count = 0
    if not params.resume:
        for train_file in trainset.files:
            count += 1
            if count % 100 == 0:
                sys.stdout.write('\r' + str(count) + '/' + str(len(trainset.files)) + ' processed files')
                sys.stdout.flush()
            from io_modules.dataset import DatasetIO
            dio = DatasetIO()
            lab_list = dio.read_lab(train_file + ".lab")
            for entry in lab_list:
                encodings.update(entry)
        sys.stdout.write('\r' + str(count) + '/' + str(len(trainset.files)) + ' processed files\n')
        sys.stdout.write('Found ' + str(len(encodings.char2int)) + ' unique symbols, ' + str(len(encodings.context2int)) + ' unique features and ' + str(len(encodings.speaker2int)) + ' unique speakers\n')
        encodings.store('data/models/encoder.encodings')
    else:
        encodings.load('data/models/encoder.encodings')
    if params.resume:
        runtime = True  # avoid orthonormal initialization
    else:
        runtime = False
    encoder = Encoder(params, encodings, runtime=runtime)
    if params.resume:
        sys.stdout.write('Resuming from previous checkpoint\n')
        encoder.load('data/models/rnn_encoder')
    if params.no_guided_attention:
        sys.stdout.write('Disabling guided attention\n')
    if params.no_bounds:
        sys.stdout.write('Using internal stopping condition for synthesis\n')
    trainer = Trainer(encoder, trainset, devset)
    trainer.start_training(10, 1000, params)
def start_training(self, itt_no_improve, batch_size, target_sample_rate, params=None):
    epoch = 1
    left_itt = itt_no_improve
    dio = DatasetIO()
    self._render_devset()
    sys.stdout.write("\n")
    # self.synth_devset(batch_size, target_sample_rate)
    self.vocoder.store(self.target_output_path)
    num_files = 0
    while left_itt > 0:
        sys.stdout.write("Starting epoch " + str(epoch) + "\n")
        sys.stdout.write("Shuffling training data\n")
        from random import shuffle
        shuffle(self.trainset.files)
        file_index = 1
        total_loss = 0
        for file in self.trainset.files:
            num_files += 1
            sys.stdout.write("\t" + str(file_index) + "/" + str(len(self.trainset.files)) + " processing file " + file + '\n')
            sys.stdout.flush()
            wav_file = file + ".orig.wav"
            mgc_file = file + ".mgc.npy"
            mgc = np.load(mgc_file)
            file_index += 1
            data, sample_rate = dio.read_wave(wav_file)
            # wave_disc = data * 32768
            wave_disc = np.array(data, dtype=np.float32)
            import time
            start = time.time()
            loss = self.vocoder.learn(wave_disc, mgc, batch_size)
            total_loss += loss
            stop = time.time()
            sys.stdout.write(' avg loss=' + str(loss) + " execution time=" + str(stop - start))
            sys.stdout.write('\n')
            sys.stdout.flush()
            if file_index % 5000 == 0:
                self.vocoder.store(self.target_output_path)
                self.synth_devset(batch_size, target_sample_rate)
        self.vocoder.store(self.target_output_path)
        self.synth_devset(batch_size, target_sample_rate)
        epoch += 1
def start_training(self, itt_no_improve, batch_size, target_sample_rate):
    epoch = 1
    left_itt = itt_no_improve
    dio = DatasetIO()
    self._render_devset()
    sys.stdout.write("\n")
    self.vocoder.store('data/models/rnn_vocoder')
    while left_itt > 0:
        sys.stdout.write("Starting epoch " + str(epoch) + "\n")
        sys.stdout.write("Shuffling training data\n")
        from random import shuffle
        shuffle(self.trainset.files)
        file_index = 1
        total_loss = 0
        for file in self.trainset.files:
            sys.stdout.write("\t" + str(file_index) + "/" + str(len(self.trainset.files)) + " processing file " + file)
            sys.stdout.flush()
            wav_file = file + ".orig.wav"
            mgc_file = file + ".mgc.npy"
            mgc = np.load(mgc_file)
            file_index += 1
            data, sample_rate = dio.read_wave(wav_file)
            if self.use_ulaw:
                [wave_disc, ulaw_cont] = dio.ulaw_encode(data)
            else:
                wave_disc = dio.b16_enc(data)
            import time
            start = time.time()
            loss = self.vocoder.learn(wave_disc, mgc, batch_size)
            total_loss += loss
            stop = time.time()
            sys.stdout.write(' avg loss=' + str(loss) + " execution time=" + str(stop - start))
            sys.stdout.write('\n')
            sys.stdout.flush()
            if file_index % 50 == 0:
                self.synth_devset(batch_size, target_sample_rate)
                self.vocoder.store('data/models/rnn_vocoder')
        self.synth_devset(batch_size, target_sample_rate)
        self.vocoder.store('data/models/rnn_vocoder')
        epoch += 1
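# dio.ulaw_encode is called above but not defined in this section. A minimal
# sketch of standard mu-law companding (mu = 255), assuming `data` is a float
# signal in [-1, 1]; the project's DatasetIO may differ in scaling details.
# It returns the same pair the caller unpacks: the discretized levels for the
# vocoder's softmax and the continuous companded signal.
import numpy as np

def ulaw_encode(data, mu=255):
    data = np.clip(np.asarray(data, dtype=np.float64), -1.0, 1.0)
    # continuous companded signal in [-1, 1]
    ulaw_cont = np.sign(data) * np.log1p(mu * np.abs(data)) / np.log1p(mu)
    # discretize to 256 levels
    wave_disc = ((ulaw_cont + 1) / 2 * mu + 0.5).astype(np.int32)
    return wave_disc, ulaw_cont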
def __init__(self, params, wavenet):
    self.params = params
    self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
    self.RECEPTIVE_SIZE = 512  # this means 16ms
    self.MGC_SIZE = params.mgc_order
    self.dio = DatasetIO()
    self.vocoder = MelVocoder()
    self.wavenet = wavenet
    self.network = ParallelVocoderNetwork(receptive_field=self.RECEPTIVE_SIZE, filter_size=64).to(device)
    self.trainer = torch.optim.Adam(self.network.parameters(), lr=self.params.learning_rate)
def __init__(self, params, model=None, runtime=False):
    self.params = params
    self.HIDDEN_SIZE = [1000, 1000]
    self.FFT_SIZE = 513
    self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
    self.FILTER_SIZE = 128
    self.sparse = False
    if model is None:
        self.model = dy.Model()
    else:
        self.model = model

    def make_mlp():
        # identical stacks feed the excitation, filter and V/UV heads
        input_size = self.params.mgc_order  # self.UPSAMPLE_COUNT + self.params.mgc_order
        hidden_w = []
        hidden_b = []
        for layer_size in self.HIDDEN_SIZE:
            hidden_w.append(self.model.add_parameters((layer_size, input_size)))
            hidden_b.append(self.model.add_parameters((layer_size)))
            input_size = layer_size
        return [hidden_w, hidden_b], input_size

    self.mlp_excitation, input_size = make_mlp()
    self.mlp_filter, _ = make_mlp()
    self.mlp_vuv, _ = make_mlp()
    self.excitation_w = self.model.add_parameters((self.UPSAMPLE_COUNT + self.FILTER_SIZE - 1, input_size))
    self.excitation_b = self.model.add_parameters((self.UPSAMPLE_COUNT + self.FILTER_SIZE - 1))
    self.filter_w = self.model.add_parameters((self.FILTER_SIZE, input_size))
    self.filter_b = self.model.add_parameters((self.FILTER_SIZE))
    self.vuv_w = self.model.add_parameters((1, input_size))
    self.vuv_b = self.model.add_parameters((1))
    self.trainer = dy.AdamTrainer(self.model, alpha=params.learning_rate)
    self.dio = DatasetIO()
    self.vocoder = MelVocoder()
def __init__(self, params, model=None, runtime=False):
    self.params = params
    self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
    self.RECEPTIVE_SIZE = 512  # this means 16ms
    self.dio = DatasetIO()
    self.vocoder = MelVocoder()
    self.network = VocoderNetwork(receptive_field=self.RECEPTIVE_SIZE, filter_size=256).to(device)
    self.trainer = torch.optim.Adam(self.network.parameters(), lr=self.params.learning_rate)
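# UPSAMPLE_COUNT, used by all the vocoder constructors here, is the number of
# audio samples one spectral frame must be upsampled to: the 12.5 ms frame
# shift times the sample rate. A quick sanity check, assuming a 16 kHz target
# sample rate:
assert int(12.5 * 16000 / 1000) == 200  # 200 samples generated per mel frame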
def synthesize(speaker, input_file, output_file, params):
    from models.vocoder import device
    print(device)
    print("[Encoding]")
    from io_modules.dataset import Dataset
    from io_modules.dataset import Encodings
    from models.encoder import Encoder
    from trainers.encoder import Trainer
    encodings = Encodings()
    encodings.load('data/models/encoder.encodings')
    encoder = Encoder(params, encodings, runtime=True)
    encoder.load('data/models/rnn_encoder')
    seq = create_lab_input(input_file, speaker)
    mgc, att = encoder.generate(seq)
    _render_spectrogram(mgc, output_file + '.png')
    print("[Vocoding]")
    from models.vocoder import ParallelVocoder
    from models.vocoder import Vocoder
    vocoder = Vocoder(params)
    vocoder.load('data/models/nn_vocoder')
    pvocoder = ParallelVocoder(params, vocoder=vocoder)
    pvocoder.load('data/models/pnn_vocoder')
    import time
    start = time.time()
    import torch
    with torch.no_grad():
        signal = pvocoder.synthesize(mgc, batch_size=params.batch_size)
    stop = time.time()
    sys.stdout.write(" execution time=" + str(stop - start))
    sys.stdout.write('\n')
    sys.stdout.flush()
    from io_modules.dataset import DatasetIO
    dio = DatasetIO()
    dio.write_wave(output_file, signal / 32768.0, params.target_sample_rate, dtype=signal.dtype)
def __init__(self, params, model=None, runtime=False):
    self.params = params
    self.HIDDEN_SIZE = [1000, 1000]
    self.FFT_SIZE = 513
    self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
    self.FILTER_SIZE = 128
    self.sparse = False
    self.dio = DatasetIO()
    self.vocoder = MelVocoder()
    self.network = VocoderNetwork(self.params.mgc_order, self.UPSAMPLE_COUNT).to(device)
    self.trainer = torch.optim.Adam(self.network.parameters(), lr=self.params.learning_rate)
def start_training(self, itt_no_improve, batch_size, target_sample_rate, params=None):
    epoch = 1
    left_itt = itt_no_improve
    dio = DatasetIO()
    self._render_devset()
    sys.stdout.write("\n")
    if self.vocoder.sparse:
        print("Setting sparsity at: " + str(params.sparsity_step) + "%")
        sparsity = params.sparsity_step
        self.vocoder.rnnFine.set_sparsity(float(sparsity) / 100)
        self.vocoder.rnnCoarse.set_sparsity(float(sparsity) / 100)
    if self.vocoder.sparse:
        self.vocoder.store('data/models/rnn_vocoder_sparse')
    else:
        self.vocoder.store('data/models/rnn_vocoder')
    num_files = 0
    while left_itt > 0:
        sys.stdout.write("Starting epoch " + str(epoch) + "\n")
        sys.stdout.write("Shuffling training data\n")
        from random import shuffle
        shuffle(self.trainset.files)
        file_index = 1
        total_loss = 0
        for file in self.trainset.files:
            num_files += 1
            # only sparse models schedule sparsity increases; without this
            # guard, `sparsity` would be referenced before assignment
            if self.vocoder.sparse and num_files == params.sparsity_increase:
                sparsity += params.sparsity_step
                num_files = 0
                if sparsity <= params.sparsity_target:
                    print("Setting sparsity at " + str(sparsity) + "%")
                    self.vocoder.rnnFine.set_sparsity(float(sparsity) / 100)
                    self.vocoder.rnnCoarse.set_sparsity(float(sparsity) / 100)
                else:
                    sparsity = params.sparsity_target
            sys.stdout.write("\t" + str(file_index) + "/" + str(len(self.trainset.files)) + " processing file " + file)
            sys.stdout.flush()
            wav_file = file + ".orig.wav"
            mgc_file = file + ".mgc.npy"
            mgc = np.load(mgc_file)
            file_index += 1
            data, sample_rate = dio.read_wave(wav_file)
            if self.use_ulaw:
                [wave_disc, ulaw_cont] = dio.ulaw_encode(data)
            else:
                wave_disc = dio.b16_enc(data)
            import time
            start = time.time()
            loss = self.vocoder.learn(wave_disc, mgc, batch_size)
            total_loss += loss
            stop = time.time()
            sys.stdout.write(' avg loss=' + str(loss) + " execution time=" + str(stop - start))
            sys.stdout.write('\n')
            sys.stdout.flush()
            if file_index % 50 == 0:
                self.synth_devset(batch_size, target_sample_rate)
                if self.vocoder.sparse:
                    self.vocoder.store('data/models/rnn_vocoder_sparse')
                else:
                    self.vocoder.store('data/models/rnn_vocoder')
        self.synth_devset(batch_size, target_sample_rate)
        if self.vocoder.sparse:
            self.vocoder.store('data/models/rnn_vocoder_sparse')
        else:
            self.vocoder.store('data/models/rnn_vocoder')
        epoch += 1
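# set_sparsity() is called on the coarse/fine RNNs above but not defined in
# this section. A minimal sketch of the usual magnitude-based pruning behind
# sparse WaveRNN-style vocoders; the project's implementation may prune in
# blocks or exclude biases. `weights` is a hypothetical dense weight matrix.
import numpy as np

def set_sparsity(weights, sparsity):
    # zero out the fraction `sparsity` of weights with smallest magnitude
    flat = np.abs(weights).ravel()
    k = int(sparsity * flat.size)
    if k > 0:
        threshold = np.partition(flat, k - 1)[k - 1]
        weights[np.abs(weights) <= threshold] = 0.0
    return weights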
# the original snippet assumes `currentdir` points at this script's directory;
# a typical definition and the needed imports are added so the example runs
import os
import sys
currentdir = os.path.dirname(os.path.abspath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)


def _normalize(mgc, mean, stdev):
    for x in range(mgc.shape[0]):
        mgc[x] = (mgc[x] - mean) / stdev
    return mgc


if __name__ == '__main__':
    # import dynet
    import cube_runtime
    import numpy as np
    cube_runtime.print_version()
    cube_runtime.load_vocoder('../data/models/rnn_vocoder')
    mean = np.load('../data/models/mean.npy')
    stdev = np.load('../data/models/stdev.npy')
    mgc = np.load('../data/processed/dev/anca_dcnews_0127.orig.mgc.npy')
    # mgc = np.zeros((390, 60), dtype=np.double)
    mgc = _normalize(mgc, mean, stdev)
    mgc = mgc.copy(order='C')
    x = cube_runtime.vocode(mgc, stdev, mean, 0.8)
    from io_modules.dataset import DatasetIO
    dio = DatasetIO()
    enc = dio.b16_to_float(x, discreete=True)
    output_file = 'test.wav'
    dio.write_wave(output_file, enc, 16000)
    print(x)
def start_training(self, itt_no_improve, batch_size):
    epoch = 1
    left_itt = itt_no_improve
    dio = DatasetIO()
    sys.stdout.write("Computing mean and standard deviation for spectral parameters\n")
    file_index = 1
    mean = None
    stdev = None
    count = 0
    min_db = None
    max_db = None
    for file in self.trainset.files:
        sys.stdout.write("\r\tFile " + str(file_index) + "/" + str(len(self.trainset.files)))
        sys.stdout.flush()
        mgc_file = file + ".mgc.npy"
        mgc = np.load(mgc_file)
        if mean is None:
            mean = np.zeros((mgc.shape[1]))
            stdev = np.zeros((mgc.shape[1]))
        for frame in mgc:
            mean += frame
            max_val = frame[np.argmax(frame)]
            min_val = frame[np.argmin(frame)]
            if min_db is None or min_val < min_db:
                min_db = min_val
            if max_db is None or max_val > max_db:
                max_db = max_val
        count += mgc.shape[0]
        file_index += 1
    mean /= count
    file_index = 1
    for file in self.trainset.files:
        sys.stdout.write("\r\tFile " + str(file_index) + "/" + str(len(self.trainset.files)))
        sys.stdout.flush()
        mgc_file = file + ".mgc.npy"
        mgc = np.load(mgc_file)
        for frame in mgc:
            stdev += np.power((frame - mean), 2)
        file_index += 1
    stdev /= count
    stdev = np.sqrt(stdev)
    self.mean = mean
    self.stdev = stdev
    self.min_db = min_db
    self.max_db = max_db
    self._render_devset()
    sys.stdout.write("\n")
    print('mean =', mean)
    print('stdev =', stdev)
    print('min_db =', min_db)
    print('max_db =', max_db)
    self.synth_devset(batch_size)
    np.save('data/models/mean_encoder', self.mean)
    np.save('data/models/stdev_encoder', self.stdev)
    with open('data/models/min_max_encoder', 'w') as f:
        f.write(str(min_db) + ' ' + str(max_db) + '\n')
    self.vocoder.store('data/models/rnn_encoder')
    # self.synth_devset(batch_size, target_sample_rate)
    while left_itt > 0:
        sys.stdout.write("Starting epoch " + str(epoch) + "\n")
        sys.stdout.write("Shuffling training data\n")
        from random import shuffle
        shuffle(self.trainset.files)
        file_index = 1
        total_loss = 0
        for file in self.trainset.files:
            sys.stdout.write("\t" + str(file_index) + "/" + str(len(self.trainset.files)) + " processing file " + file)
            sys.stdout.flush()
            mgc_file = file + ".mgc.npy"
            mgc = np.load(mgc_file)
            lab_file = file + ".txt"
            lab = dio.read_lab(lab_file)
            phones = [entry.phoneme for entry in lab]
            # custom normalization - we are now using binary divergence
            mgc = self._normalize(mgc, mean, stdev)
            file_index += 1
            import time
            start = time.time()
            if len(mgc) < 1000:
                loss = self.vocoder.learn(phones, mgc)
            else:
                sys.stdout.write(' too long, skipping')
                loss = 0
            total_loss += loss
            stop = time.time()
            sys.stdout.write(' avg loss=' + str(loss) + " execution time=" + str(stop - start))
            sys.stdout.write('\n')
            sys.stdout.flush()
            if file_index % 200 == 0:
                self.synth_devset(batch_size)
                self.vocoder.store('data/models/rnn_encoder')
        self.synth_devset(batch_size)
        self.vocoder.store('data/models/rnn_encoder')
        epoch += 1
def phase_1_prepare_corpus(params):
    from os import listdir
    from os.path import isfile, join
    from os.path import exists
    train_files_tmp = [
        f for f in listdir(params.train_folder)
        if isfile(join(params.train_folder, f))
    ]
    dev_files_tmp = [
        f for f in listdir(params.dev_folder)
        if isfile(join(params.dev_folder, f))
    ]
    sys.stdout.write("Scanning training files...")
    sys.stdout.flush()
    final_list = []
    for file in train_files_tmp:
        base_name = file[:-4]
        lab_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        if exists(join(params.train_folder, lab_name)) and exists(join(params.train_folder, wav_name)):
            if base_name not in final_list:
                final_list.append(base_name)
    train_files = final_list
    sys.stdout.write(" found " + str(len(train_files)) + " valid training files\n")
    sys.stdout.write("Scanning development files...")
    sys.stdout.flush()
    final_list = []
    for file in dev_files_tmp:
        base_name = file[:-4]
        lab_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        if exists(join(params.dev_folder, lab_name)) and exists(join(params.dev_folder, wav_name)):
            if base_name not in final_list:
                final_list.append(base_name)
    dev_files = final_list
    sys.stdout.write(" found " + str(len(dev_files)) + " valid development files\n")
    from io_modules.dataset import DatasetIO
    from io_modules.vocoder import MelVocoder
    from shutil import copyfile
    import pysptk
    dio = DatasetIO()
    vocoder = MelVocoder()
    base_folder = params.train_folder
    for index in range(len(train_files)):
        sys.stdout.write("\r\tprocessing file " + str(index + 1) + "/" + str(len(train_files)))
        sys.stdout.flush()
        base_name = train_files[index]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        spc_name = base_name + '.png'
        lab_name = base_name + '.lab'
        # LAB - copy or create
        if exists(join(base_folder, lab_name)):
            copyfile(join(base_folder, lab_name), join('data/processed/train', lab_name))
        else:
            create_lab_file(join(base_folder, txt_name), join('data/processed/train', lab_name))
        # TXT
        copyfile(join(base_folder, txt_name), join('data/processed/train', txt_name))
        # WAVE
        data, sample_rate = dio.read_wave(join(base_folder, wav_name), sample_rate=params.target_sample_rate)
        mgc = vocoder.melspectrogram(data, sample_rate=params.target_sample_rate, num_mels=params.mgc_order)
        # SPECT
        render_spectrogram(mgc, join('data/processed/train', spc_name))
        dio.write_wave(join('data/processed/train', base_name + '.orig.wav'), data, sample_rate)
        array2file(mgc, join('data/processed/train', base_name + '.mgc'))
    sys.stdout.write('\n')
    base_folder = params.dev_folder
    for index in range(len(dev_files)):
        sys.stdout.write("\r\tprocessing file " + str(index + 1) + "/" + str(len(dev_files)))
        sys.stdout.flush()
        base_name = dev_files[index]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        spc_name = base_name + '.png'
        lab_name = base_name + '.lab'
        # LAB - copy or create
        if exists(join(base_folder, lab_name)):
            copyfile(join(base_folder, lab_name), join('data/processed/dev', lab_name))
        else:
            create_lab_file(join(base_folder, txt_name), join('data/processed/dev', lab_name))
        # TXT
        copyfile(join(base_folder, txt_name), join('data/processed/dev/', txt_name))
        # WAVE
        data, sample_rate = dio.read_wave(join(base_folder, wav_name), sample_rate=params.target_sample_rate)
        mgc = vocoder.melspectrogram(data, sample_rate=params.target_sample_rate, num_mels=params.mgc_order)
        # SPECT
        render_spectrogram(mgc, join('data/processed/dev', spc_name))
        dio.write_wave(join('data/processed/dev', base_name + '.orig.wav'), data, sample_rate)
        array2file(mgc, join('data/processed/dev', base_name + '.mgc'))
    sys.stdout.write('\n')
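# array2file is called throughout this section but not shown. Since the
# training code reloads features with np.load(file + ".mgc.npy"), a minimal
# sketch is just a numpy save; this is an assumption about the project's
# actual helper.
import numpy as np

def array2file(a, filename):
    # np.save appends the .npy extension, yielding e.g. <base>.mgc.npy
    np.save(filename, a)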
# as in the previous standalone script, `currentdir` is assumed to be this
# script's directory and the needed imports are added
import os
import sys
currentdir = os.path.dirname(os.path.abspath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)


def _normalize(mgc, mean, stdev):
    for x in range(mgc.shape[0]):
        mgc[x] = (mgc[x] - mean) / stdev
    return mgc


if __name__ == '__main__':
    # import dynet
    import cube_runtime
    from io_modules.dataset import DatasetIO
    import numpy as np
    cube_runtime.print_version()
    cube_runtime.load_vocoder('../data/models/rnn_vocoder_sparse')
    mgc = np.load('../test.mgc.npy')
    # mgc = np.zeros((390, 60), dtype=np.double)
    mgc = mgc.copy(order='C')
    x = cube_runtime.vocode(mgc, 0.8)
    dio = DatasetIO()
    # enc = dio.b16_to_float(x, discreete=True)
    enc = np.array(x, dtype='int16')
    output_file = 'test.wav'
    dio.write_wave(output_file, enc, 16000, dtype=np.int16)
    print(x)
def phase_1_prepare_corpus(params):
    from os import listdir
    from os.path import isfile, join
    from os.path import exists
    train_files_tmp = [
        f for f in listdir(params.train_folder)
        if isfile(join(params.train_folder, f))
    ]
    if params.dev_folder is not None:
        dev_files_tmp = [
            f for f in listdir(params.dev_folder)
            if isfile(join(params.dev_folder, f))
        ]
    else:
        dev_files_tmp = []
    if params.g2p is not None:
        from models.g2p import G2P
        from io_modules.encodings import Encodings
        g2p_encodings = Encodings()
        g2p_encodings.load(params.g2p + '.encodings')
        g2p = G2P(g2p_encodings)
        g2p.load(params.g2p + '-bestAcc.network')
        if exists(params.g2p + '.lexicon'):
            g2p.load_lexicon(params.g2p + '.lexicon')
    else:
        g2p = None
    sys.stdout.write("Scanning training files...")
    sys.stdout.flush()
    final_list = []
    for file in train_files_tmp:
        base_name = file[:-4]
        lab_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        if exists(join(params.train_folder, lab_name)) and exists(join(params.train_folder, wav_name)):
            if base_name not in final_list:
                final_list.append(base_name)
    train_files = final_list
    sys.stdout.write(" found " + str(len(train_files)) + " valid training files\n")
    sys.stdout.write("Scanning development files...")
    sys.stdout.flush()
    final_list = []
    for file in dev_files_tmp:
        base_name = file[:-4]
        lab_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        if exists(join(params.dev_folder, lab_name)) and exists(join(params.dev_folder, wav_name)):
            if base_name not in final_list:
                final_list.append(base_name)
    dev_files = final_list
    sys.stdout.write(" found " + str(len(dev_files)) + " valid development files\n")
    from io_modules.dataset import DatasetIO
    from io_modules.vocoder import MelVocoder
    from shutil import copyfile
    dio = DatasetIO()
    vocoder = MelVocoder()
    base_folder = params.train_folder
    total_files = 0
    for index in range(len(train_files)):
        total_files += 1
        sys.stdout.write("\r\tprocessing file " + str(index + 1) + "/" + str(len(train_files)))
        sys.stdout.flush()
        base_name = train_files[index]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        spc_name = base_name + '.png'
        lab_name = base_name + '.lab'
        tgt_txt_name = txt_name
        tgt_spc_name = spc_name
        tgt_lab_name = lab_name
        if params.prefix is not None:
            tgt_txt_name = params.prefix + "_{:05d}".format(total_files) + '.txt'
            tgt_spc_name = params.prefix + "_{:05d}".format(total_files) + '.png'
            tgt_lab_name = params.prefix + "_{:05d}".format(total_files) + '.lab'
        # LAB - copy or create
        if exists(join(base_folder, lab_name)):
            copyfile(join(base_folder, lab_name), join('data/processed/train', tgt_lab_name))
        else:
            create_lab_file(join(base_folder, txt_name),
                            join('data/processed/train', tgt_lab_name),
                            speaker_name=params.speaker, g2p=g2p)
        # TXT
        copyfile(join(base_folder, txt_name), join('data/processed/train', tgt_txt_name))
        # WAVE
        data, sample_rate = dio.read_wave(join(base_folder, wav_name), sample_rate=params.target_sample_rate)
        mgc = vocoder.melspectrogram(data, sample_rate=params.target_sample_rate, num_mels=params.mgc_order)
        # SPECT
        render_spectrogram(mgc, join('data/processed/train', tgt_spc_name))
        if params.prefix is None:
            dio.write_wave(join('data/processed/train', base_name + '.orig.wav'), data, sample_rate)
            array2file(mgc, join('data/processed/train', base_name + '.mgc'))
        else:
            tgt_wav_name = params.prefix + "_{:05d}".format(total_files) + '.orig.wav'
            tgt_mgc_name = params.prefix + "_{:05d}".format(total_files) + '.mgc'
            dio.write_wave(join('data/processed/train', tgt_wav_name), data, sample_rate)
            array2file(mgc, join('data/processed/train', tgt_mgc_name))
    sys.stdout.write('\n')
    base_folder = params.dev_folder
    for index in range(len(dev_files)):
        total_files += 1
        sys.stdout.write("\r\tprocessing file " + str(index + 1) + "/" + str(len(dev_files)))
        sys.stdout.flush()
        base_name = dev_files[index]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        spc_name = base_name + '.png'
        lab_name = base_name + '.lab'
        tgt_txt_name = txt_name
        tgt_spc_name = spc_name
        tgt_lab_name = lab_name
        if params.prefix is not None:
            tgt_txt_name = params.prefix + "_{:05d}".format(total_files) + '.txt'
            tgt_spc_name = params.prefix + "_{:05d}".format(total_files) + '.png'
            tgt_lab_name = params.prefix + "_{:05d}".format(total_files) + '.lab'
        # LAB - copy or create
        if exists(join(base_folder, lab_name)):
            copyfile(join(base_folder, lab_name), join('data/processed/dev', tgt_lab_name))
        else:
            create_lab_file(join(base_folder, txt_name),
                            join('data/processed/dev', tgt_lab_name),
                            speaker_name=params.speaker, g2p=g2p)
        # TXT
        copyfile(join(base_folder, txt_name), join('data/processed/dev', tgt_txt_name))
        # WAVE
        data, sample_rate = dio.read_wave(join(base_folder, wav_name), sample_rate=params.target_sample_rate)
        mgc = vocoder.melspectrogram(data, sample_rate=params.target_sample_rate, num_mels=params.mgc_order)
        # SPECT
        render_spectrogram(mgc, join('data/processed/dev', tgt_spc_name))
        if params.prefix is None:
            dio.write_wave(join('data/processed/dev', base_name + '.orig.wav'), data, sample_rate)
            array2file(mgc, join('data/processed/dev', base_name + '.mgc'))
        else:
            tgt_wav_name = params.prefix + "_{:05d}".format(total_files) + '.orig.wav'
            tgt_mgc_name = params.prefix + "_{:05d}".format(total_files) + '.mgc'
            dio.write_wave(join('data/processed/dev', tgt_wav_name), data, sample_rate)
            array2file(mgc, join('data/processed/dev', tgt_mgc_name))
    sys.stdout.write('\n')