def __init__(self, params, wavenet):
    self.params = params
    self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
    self.RECEPTIVE_SIZE = 512  # this means 16ms
    self.MGC_SIZE = params.mgc_order
    self.dio = DatasetIO()
    self.vocoder = MelVocoder()
    self.wavenet = wavenet
    self.network = ParallelVocoderNetwork(receptive_field=self.RECEPTIVE_SIZE,
                                          filter_size=64).to(device)
    self.trainer = torch.optim.Adam(self.network.parameters(),
                                    lr=self.params.learning_rate)
def __init__(self, params, model=None, runtime=False):
    self.params = params
    self.HIDDEN_SIZE = [1000, 1000]
    self.FFT_SIZE = 513
    self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
    self.FILTER_SIZE = 128
    self.sparse = False
    if model is None:
        self.model = dy.Model()
    else:
        self.model = model

    # excitation MLP
    input_size = self.params.mgc_order  # self.UPSAMPLE_COUNT + self.params.mgc_order
    hidden_w = []
    hidden_b = []
    for layer_size in self.HIDDEN_SIZE:
        hidden_w.append(self.model.add_parameters((layer_size, input_size)))
        hidden_b.append(self.model.add_parameters((layer_size)))
        input_size = layer_size
    self.mlp_excitation = [hidden_w, hidden_b]

    # filter MLP
    input_size = self.params.mgc_order  # self.UPSAMPLE_COUNT + self.params.mgc_order
    hidden_w = []
    hidden_b = []
    for layer_size in self.HIDDEN_SIZE:
        hidden_w.append(self.model.add_parameters((layer_size, input_size)))
        hidden_b.append(self.model.add_parameters((layer_size)))
        input_size = layer_size
    self.mlp_filter = [hidden_w, hidden_b]

    # voiced/unvoiced MLP
    input_size = self.params.mgc_order  # self.UPSAMPLE_COUNT + self.params.mgc_order
    hidden_w = []
    hidden_b = []
    for layer_size in self.HIDDEN_SIZE:
        hidden_w.append(self.model.add_parameters((layer_size, input_size)))
        hidden_b.append(self.model.add_parameters((layer_size)))
        input_size = layer_size
    self.mlp_vuv = [hidden_w, hidden_b]

    # output projections
    self.excitation_w = self.model.add_parameters(
        (self.UPSAMPLE_COUNT + self.FILTER_SIZE - 1, input_size))
    self.excitation_b = self.model.add_parameters(
        (self.UPSAMPLE_COUNT + self.FILTER_SIZE - 1))
    self.filter_w = self.model.add_parameters((self.FILTER_SIZE, input_size))
    self.filter_b = self.model.add_parameters((self.FILTER_SIZE))
    self.vuv_w = self.model.add_parameters((1, input_size))
    self.vuv_b = self.model.add_parameters((1))

    self.trainer = dy.AdamTrainer(self.model, alpha=params.learning_rate)
    self.dio = DatasetIO()
    self.vocoder = MelVocoder()
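# A minimal sketch (not part of the original code) of the size arithmetic behind the
# excitation/filter heads above: the excitation head emits UPSAMPLE_COUNT + FILTER_SIZE - 1
# values and the filter head emits FILTER_SIZE taps, presumably so that a valid-mode
# convolution of the two yields exactly UPSAMPLE_COUNT samples per 12.5 ms frame.
# The concrete numbers below assume a 16 kHz target sample rate purely for illustration.
import numpy as np

UPSAMPLE_COUNT = int(12.5 * 16000 / 1000)  # 200 samples per 12.5 ms frame at 16 kHz
FILTER_SIZE = 128

excitation = np.random.randn(UPSAMPLE_COUNT + FILTER_SIZE - 1)  # 327 values
taps = np.random.randn(FILTER_SIZE)
frame = np.convolve(excitation, taps, mode='valid')
assert frame.shape[0] == UPSAMPLE_COUNT  # exactly one frame of audio samples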
def __init__(self, params, model=None, runtime=False):
    self.params = params
    self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
    self.RECEPTIVE_SIZE = 512  # this means 16ms
    self.dio = DatasetIO()
    self.vocoder = MelVocoder()
    self.network = VocoderNetwork(receptive_field=self.RECEPTIVE_SIZE,
                                  filter_size=256).to(device)
    self.trainer = torch.optim.Adam(self.network.parameters(),
                                    lr=self.params.learning_rate)
def __init__(self, params, model=None, runtime=False):
    self.params = params
    self.HIDDEN_SIZE = [1000, 1000]
    self.FFT_SIZE = 513
    self.UPSAMPLE_COUNT = int(12.5 * params.target_sample_rate / 1000)
    self.FILTER_SIZE = 128
    self.sparse = False
    self.dio = DatasetIO()
    self.vocoder = MelVocoder()
    self.network = VocoderNetwork(self.params.mgc_order,
                                  self.UPSAMPLE_COUNT).to(device)
    self.trainer = torch.optim.Adam(self.network.parameters(),
                                    lr=self.params.learning_rate)
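# A minimal usage sketch (assumptions, not original code): the constructors above only
# read three fields of `params` - target_sample_rate, mgc_order and learning_rate - so an
# argparse-style namespace is enough to instantiate a trainer. The class name
# `VocoderTrainer` below is a hypothetical placeholder for whichever class owns the
# constructor being used.
from types import SimpleNamespace

params = SimpleNamespace(target_sample_rate=16000, mgc_order=80, learning_rate=1e-4)
# trainer = VocoderTrainer(params)  # would pick UPSAMPLE_COUNT = 200 samples per frame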
def phase_1_prepare_corpus(params):
    from os import listdir
    from os.path import isfile, join
    from os.path import exists

    train_files_tmp = [
        f for f in listdir(params.train_folder)
        if isfile(join(params.train_folder, f))
    ]
    dev_files_tmp = [
        f for f in listdir(params.dev_folder)
        if isfile(join(params.dev_folder, f))
    ]

    sys.stdout.write("Scanning training files...")
    sys.stdout.flush()
    final_list = []
    for file in train_files_tmp:
        base_name = file[:-4]  # strip the 4-character extension
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        if exists(join(params.train_folder, txt_name)) and exists(
                join(params.train_folder, wav_name)):
            if base_name not in final_list:
                final_list.append(base_name)
    train_files = final_list
    sys.stdout.write(" found " + str(len(train_files)) +
                     " valid training files\n")

    sys.stdout.write("Scanning development files...")
    sys.stdout.flush()
    final_list = []
    for file in dev_files_tmp:
        base_name = file[:-4]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        if exists(join(params.dev_folder, txt_name)) and exists(
                join(params.dev_folder, wav_name)):
            if base_name not in final_list:
                final_list.append(base_name)
    dev_files = final_list
    sys.stdout.write(" found " + str(len(dev_files)) +
                     " valid development files\n")

    from io_modules.dataset import DatasetIO
    from io_modules.vocoder import MelVocoder
    from shutil import copyfile
    import pysptk

    dio = DatasetIO()
    vocoder = MelVocoder()

    base_folder = params.train_folder
    for index in range(len(train_files)):
        sys.stdout.write("\r\tprocessing file " + str(index + 1) + "/" +
                         str(len(train_files)))
        sys.stdout.flush()
        base_name = train_files[index]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        spc_name = base_name + '.png'
        lab_name = base_name + '.lab'

        # LAB - copy or create
        if exists(join(base_folder, lab_name)):
            copyfile(join(base_folder, lab_name),
                     join('data/processed/train', lab_name))
        else:
            create_lab_file(join(base_folder, txt_name),
                            join('data/processed/train', lab_name))
        # TXT
        copyfile(join(base_folder, txt_name),
                 join('data/processed/train', txt_name))
        # WAVE
        data, sample_rate = dio.read_wave(
            join(base_folder, wav_name),
            sample_rate=params.target_sample_rate)
        mgc = vocoder.melspectrogram(data,
                                     sample_rate=params.target_sample_rate,
                                     num_mels=params.mgc_order)
        # SPECT
        render_spectrogram(mgc, join('data/processed/train', spc_name))
        dio.write_wave(join('data/processed/train', base_name + '.orig.wav'),
                       data, sample_rate)
        array2file(mgc, join('data/processed/train', base_name + '.mgc'))
    sys.stdout.write('\n')

    base_folder = params.dev_folder
    for index in range(len(dev_files)):
        sys.stdout.write("\r\tprocessing file " + str(index + 1) + "/" +
                         str(len(dev_files)))
        sys.stdout.flush()
        base_name = dev_files[index]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        spc_name = base_name + '.png'
        lab_name = base_name + '.lab'

        # LAB - copy or create
        if exists(join(base_folder, lab_name)):
            copyfile(join(base_folder, lab_name),
                     join('data/processed/dev', lab_name))
        else:
            create_lab_file(join(base_folder, txt_name),
                            join('data/processed/dev', lab_name))
        # TXT
        copyfile(join(base_folder, txt_name),
                 join('data/processed/dev', txt_name))
        # WAVE
        data, sample_rate = dio.read_wave(
            join(base_folder, wav_name),
            sample_rate=params.target_sample_rate)
        mgc = vocoder.melspectrogram(data,
                                     sample_rate=params.target_sample_rate,
                                     num_mels=params.mgc_order)
        # SPECT
        render_spectrogram(mgc, join('data/processed/dev', spc_name))
        dio.write_wave(join('data/processed/dev', base_name + '.orig.wav'),
                       data, sample_rate)
        array2file(mgc, join('data/processed/dev', base_name + '.mgc'))
    sys.stdout.write('\n')
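# A minimal invocation sketch for the function above (hypothetical values, not original
# code): it only needs a params object carrying train_folder, dev_folder,
# target_sample_rate and mgc_order, and it expects matching <name>.txt/<name>.wav pairs
# in each folder plus existing data/processed/train and data/processed/dev output dirs.
import os
from types import SimpleNamespace

os.makedirs('data/processed/train', exist_ok=True)
os.makedirs('data/processed/dev', exist_ok=True)
params = SimpleNamespace(train_folder='corpus/train', dev_folder='corpus/dev',
                         target_sample_rate=16000, mgc_order=80)
# phase_1_prepare_corpus(params)  # writes .lab/.txt/.orig.wav/.mgc/.png per utterance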
def phase_1_prepare_corpus(params):
    from os import listdir
    from os.path import isfile, join
    from os.path import exists

    train_files_tmp = [
        f for f in listdir(params.train_folder)
        if isfile(join(params.train_folder, f))
    ]
    if params.dev_folder is not None:
        dev_files_tmp = [
            f for f in listdir(params.dev_folder)
            if isfile(join(params.dev_folder, f))
        ]
    else:
        dev_files_tmp = []

    # optional grapheme-to-phoneme model used when creating LAB files
    if params.g2p is not None:
        from models.g2p import G2P
        from io_modules.encodings import Encodings
        g2p_encodings = Encodings()
        g2p_encodings.load(params.g2p + '.encodings')
        g2p = G2P(g2p_encodings)
        g2p.load(params.g2p + '-bestAcc.network')
        if exists(params.g2p + '.lexicon'):
            g2p.load_lexicon(params.g2p + '.lexicon')
    else:
        g2p = None

    sys.stdout.write("Scanning training files...")
    sys.stdout.flush()
    final_list = []
    for file in train_files_tmp:
        base_name = file[:-4]  # strip the 4-character extension
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        if exists(join(params.train_folder, txt_name)) and exists(
                join(params.train_folder, wav_name)):
            if base_name not in final_list:
                final_list.append(base_name)
    train_files = final_list
    sys.stdout.write(" found " + str(len(train_files)) +
                     " valid training files\n")

    sys.stdout.write("Scanning development files...")
    sys.stdout.flush()
    final_list = []
    for file in dev_files_tmp:
        base_name = file[:-4]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        if exists(join(params.dev_folder, txt_name)) and exists(
                join(params.dev_folder, wav_name)):
            if base_name not in final_list:
                final_list.append(base_name)
    dev_files = final_list
    sys.stdout.write(" found " + str(len(dev_files)) +
                     " valid development files\n")

    from io_modules.dataset import DatasetIO
    from io_modules.vocoder import MelVocoder
    from shutil import copyfile

    dio = DatasetIO()
    vocoder = MelVocoder()

    base_folder = params.train_folder
    total_files = 0
    for index in range(len(train_files)):
        total_files += 1
        sys.stdout.write("\r\tprocessing file " + str(index + 1) + "/" +
                         str(len(train_files)))
        sys.stdout.flush()
        base_name = train_files[index]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        spc_name = base_name + '.png'
        lab_name = base_name + '.lab'

        tgt_txt_name = txt_name
        tgt_spc_name = spc_name
        tgt_lab_name = lab_name
        if params.prefix is not None:
            tgt_txt_name = params.prefix + "_{:05d}".format(total_files) + '.txt'
            tgt_spc_name = params.prefix + "_{:05d}".format(total_files) + '.png'
            tgt_lab_name = params.prefix + "_{:05d}".format(total_files) + '.lab'

        # LAB - copy or create
        if exists(join(base_folder, lab_name)):
            copyfile(join(base_folder, lab_name),
                     join('data/processed/train', tgt_lab_name))
        else:
            create_lab_file(join(base_folder, txt_name),
                            join('data/processed/train', tgt_lab_name),
                            speaker_name=params.speaker,
                            g2p=g2p)
        # TXT
        copyfile(join(base_folder, txt_name),
                 join('data/processed/train', tgt_txt_name))
        # WAVE
        data, sample_rate = dio.read_wave(
            join(base_folder, wav_name),
            sample_rate=params.target_sample_rate)
        mgc = vocoder.melspectrogram(data,
                                     sample_rate=params.target_sample_rate,
                                     num_mels=params.mgc_order)
        # SPECT
        render_spectrogram(mgc, join('data/processed/train', tgt_spc_name))
        if params.prefix is None:
            dio.write_wave(
                join('data/processed/train', base_name + '.orig.wav'),
                data, sample_rate)
            array2file(mgc, join('data/processed/train', base_name + '.mgc'))
        else:
            tgt_wav_name = params.prefix + "_{:05d}".format(total_files) + '.orig.wav'
            tgt_mgc_name = params.prefix + "_{:05d}".format(total_files) + '.mgc'
            dio.write_wave(join('data/processed/train', tgt_wav_name),
                           data, sample_rate)
            array2file(mgc, join('data/processed/train', tgt_mgc_name))
    sys.stdout.write('\n')

    base_folder = params.dev_folder
    for index in range(len(dev_files)):
        total_files += 1
        sys.stdout.write("\r\tprocessing file " + str(index + 1) + "/" +
                         str(len(dev_files)))
        sys.stdout.flush()
        base_name = dev_files[index]
        txt_name = base_name + '.txt'
        wav_name = base_name + '.wav'
        spc_name = base_name + '.png'
        lab_name = base_name + '.lab'

        tgt_txt_name = txt_name
        tgt_spc_name = spc_name
        tgt_lab_name = lab_name
        if params.prefix is not None:
            tgt_txt_name = params.prefix + "_{:05d}".format(total_files) + '.txt'
            tgt_spc_name = params.prefix + "_{:05d}".format(total_files) + '.png'
            tgt_lab_name = params.prefix + "_{:05d}".format(total_files) + '.lab'

        # LAB - copy or create
        if exists(join(base_folder, lab_name)):
            copyfile(join(base_folder, lab_name),
                     join('data/processed/dev', tgt_lab_name))
        else:
            create_lab_file(join(base_folder, txt_name),
                            join('data/processed/dev', tgt_lab_name),
                            speaker_name=params.speaker,
                            g2p=g2p)
        # TXT
        copyfile(join(base_folder, txt_name),
                 join('data/processed/dev', tgt_txt_name))
        # WAVE
        data, sample_rate = dio.read_wave(
            join(base_folder, wav_name),
            sample_rate=params.target_sample_rate)
        mgc = vocoder.melspectrogram(data,
                                     sample_rate=params.target_sample_rate,
                                     num_mels=params.mgc_order)
        # SPECT
        render_spectrogram(mgc, join('data/processed/dev', tgt_spc_name))
        if params.prefix is None:
            dio.write_wave(
                join('data/processed/dev', base_name + '.orig.wav'),
                data, sample_rate)
            array2file(mgc, join('data/processed/dev', base_name + '.mgc'))
        else:
            tgt_wav_name = params.prefix + "_{:05d}".format(total_files) + '.orig.wav'
            tgt_mgc_name = params.prefix + "_{:05d}".format(total_files) + '.mgc'
            dio.write_wave(join('data/processed/dev', tgt_wav_name),
                           data, sample_rate)
            array2file(mgc, join('data/processed/dev', tgt_mgc_name))
    sys.stdout.write('\n')
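# A minimal invocation sketch for the extended variant above (hypothetical values, not
# original code): on top of the fields used by the basic version, it reads params.g2p
# (path prefix of a trained grapheme-to-phoneme model, or None), params.speaker (speaker
# name passed to create_lab_file) and params.prefix; when prefix is set, every output is
# renamed <prefix>_<counter>.<ext> with a counter shared across the train and dev loops,
# so names never collide between the two sets.
from types import SimpleNamespace

params = SimpleNamespace(train_folder='corpus/train', dev_folder='corpus/dev',
                         target_sample_rate=16000, mgc_order=80,
                         g2p=None, speaker='anonymous', prefix='spk')
# phase_1_prepare_corpus(params)  # writes e.g. data/processed/train/spk_00001.lab
print('spk' + "_{:05d}".format(7) + '.lab')  # naming pattern -> spk_00007.lab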