def test_init_e0_load(self):
    # Scenario: epochs=0 with GPU requested and no model type given, so the
    # trainer has to load the existing model copied from the fixtures.
    run_name = "test_init_e0_load"
    fixture_model = os.path.join("integration", "fixtures",
                                 "test_model_in409_out67.nn")

    hparams = self._get_hparams()
    hparams.use_gpu = True
    hparams.epochs = 0
    # Use a test-specific output directory (function name appended).
    hparams.out_dir = os.path.join(hparams.out_dir, run_name)
    hparams.model_type = None

    # The test expects the CPU fallback warning while the trainer is created.
    with unittest.mock.patch.object(ModelTrainer.logger,
                                    "warning") as warn_mock:
        trainer = self._get_trainer(hparams)
        warn_mock.assert_called_with(
            "No CUDA device available, use CPU mode instead.")

    # Place the fixture model where the trainer looks for saved networks.
    networks_dir = os.path.join(hparams.out_dir, hparams.networks_dir)
    makedirs_safe(networks_dir)
    shutil.copyfile(fixture_model,
                    os.path.join(networks_dir, hparams.model_name))

    trainer.init(hparams)
    self.assertIsNotNone(trainer.model_handler.model)

    # Clean up everything the test wrote.
    shutil.rmtree(hparams.out_dir)
def _get_synth_dir(hparams: ExtendedHParams,
                   use_model_name: bool = True,
                   epoch: int = None,
                   step: int = None) -> os.PathLike:
    """
    Select the directory for synthesised files and make sure it exists.

    An explicitly set hparams.synth_dir is used as is. Otherwise the path is
    assembled as <out_dir or current dir>/[<model_name>/]<SYNTH_SUB_DIR>/
    followed by an optional "e<epoch>" or "s<step>" component.

    :param hparams:         Hyper-parameter container queried through has_value().
    :param use_model_name:  If True and a model name is set, add it as a sub-directory.
    :param epoch:           Epoch number appended as "e<epoch>"; takes precedence over step.
    :param step:            Step number appended as "s<step>" when no epoch is given.
    :return:                The selected directory path.
    """
    if hparams.has_value("synth_dir"):
        synth_dir = hparams.synth_dir
    else:
        root = hparams.out_dir if hparams.has_value("out_dir") \
            else os.path.curdir
        parts = [root]

        if use_model_name and hparams.has_value("model_name"):
            parts.append(hparams.model_name)
        parts.append(Synthesiser.SYNTH_SUB_DIR)

        if epoch is not None:
            parts.append("e" + str(epoch))
        elif step is not None:
            parts.append("s" + str(step))

        synth_dir = os.path.join(*parts)

    makedirs_safe(synth_dir)
    logging.info("Selected {} as synthesis directory.".format(synth_dir))

    return synth_dir
def synthesize(self, file_id_list, synth_output, hparams):
    """
    Turn network outputs into audio with the vocoder named in hparams.synth_vocoder.

    Ids may contain a directory part (e.g. "speaker/utterance"); the
    corresponding sub-directories are created below hparams.synth_dir before
    the vocoder runs.

    :param file_id_list:  Iterable of ids which are about to be synthesised.
    :param synth_output:  Dictionary of features handed to the vocoder.
    :param hparams:       Hyper-parameter container, synth_dir and synth_vocoder are read here.
    :raises NotImplementedError: For the "80_SSRN_English_GL" vocoder.
    """
    # Create speaker subdirectories if necessary.
    # os.path.split() always returns a (head, tail) pair, so checking its
    # length can never detect a directory part. Use the head directly.
    sub_dirs = {os.path.dirname(id_name) for id_name in file_id_list}
    sub_dirs.discard("")  # Ids without a directory part need no sub-directory.
    for sub_dir in sorted(sub_dirs):
        makedirs_safe(os.path.join(hparams.synth_dir, sub_dir))

    if hparams.synth_vocoder == "WORLD":
        Synthesiser.run_world_synth(synth_output, hparams)
    # elif hparams.synth_vocoder == "STRAIGHT":  # Add further vocoders here.

    elif hparams.synth_vocoder == "r9y9wavenet_mulaw_16k_world_feats_English":
        Synthesiser.run_r9y9wavenet_mulaw_world_feats_synth(
            synth_output, hparams)

    elif hparams.synth_vocoder == "raw":
        # The features in the synth_output dictionary are raw waveforms and
        # can be written directly to the file.
        Synthesiser.run_raw_synth(synth_output, hparams)

    elif hparams.synth_vocoder == "80_SSRN_English_GL":
        # Use a pre-trained spectrogram super resolution network for English
        # and Griffin-Lim. The features in the synth_output should be mfbanks.
        raise NotImplementedError()  # TODO

    elif hparams.synth_vocoder == "r9y9wavenet":
        # Synthesise with a pre-trained r9y9 WaveNet. The hyper-parameters
        # have to match the model.
        Synthesiser.run_wavenet_vocoder(synth_output, hparams)
    # NOTE: Any other vocoder name is silently ignored (unchanged behaviour).
def main():
    """
    Command line entry point: run SingleChannelNoiseReduction on all listed files.

    Reads the ids from the given file id list, creates the output directory
    and hands everything to SingleChannelNoiseReduction.process_list().
    """
    logging.basicConfig(level=logging.DEBUG)

    arg_parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_parser.add_argument("-w", "--dir_wav",
                            dest="dir_wav", type=str, required=True,
                            help="Directory containing the wav files.")
    arg_parser.add_argument("-o", "--dir_out",
                            dest="dir_out", type=str, required=True,
                            help="Directory to save the trimmed files.")
    arg_parser.add_argument("-f", "--file_id_list",
                            dest="file_id_list", type=str, required=True,
                            help="Full path to file containing the ids.")
    arg_parser.add_argument("--format",
                            dest="format", type=str, required=False,
                            default='wav',
                            help="Format of the audio file, e.g. WAV.")
    cli_args = arg_parser.parse_args()

    # One id per line, surrounding whitespace and line endings are removed.
    with open(cli_args.file_id_list) as id_file:
        ids = [line.strip(' \t\n\r') for line in id_file.readlines()]

    # Create output directory if missing.
    makedirs_safe(cli_args.dir_out)

    # Start the noise reduction.
    noise_reducer = SingleChannelNoiseReduction()
    noise_reducer.process_list(ids, cli_args.dir_wav, cli_args.dir_out,
                               cli_args.format)
def synth(self, hparams, ids_input):
    """
    Run the model on the given ids in synthesis mode and time the run.

    :param hparams:    Hyper-parameter container; synth_dir, batch_size_synth and
                       synth_gen_figure are read here.
    :param ids_input:  Can be full path to file with ids, list of ids, or one id.
    :return:           (Dictionary of network outputs,
                        dictionary of post-processed network outputs)
    """
    # trainer.init() has to be called before synthesising.
    assert self.model_handler is not None
    # A directory to store the generated audio files has to be set.
    assert hparams.synth_dir is not None
    makedirs_safe(hparams.synth_dir)

    id_list = ModelTrainer._input_to_str_list(ids_input)
    listed_ids = ", ".join(str(id_name) for id_name in id_list)
    self.logger.info("Start synthesising [{0}]".format(listed_ids))

    time_begin = timer()
    model_output, model_output_post = self._forward_batched(
        hparams,
        id_list,
        hparams.batch_size_synth,
        load_target=False,
        synth=True,
        benchmark=False,
        gen_figure=hparams.synth_gen_figure)
    elapsed_seconds = timer() - time_begin

    self.logger.info('Synthesis time for {} sample(s): {}'.format(
        len(id_list), timedelta(seconds=elapsed_seconds)))

    return model_output, model_output_post
def save_to_file(self, filename):
    """
    Save the generated plot to filename, creating its directory first.

    Only logs an error when no plot was generated yet (self.plt is None).

    :param filename:  Path of the figure file to write.
    """
    if self.plt is None:
        logging.error(
            "No generated plot exists, please run 'gen_plot()' first.")
        return

    figure_dir = os.path.dirname(filename)
    makedirs_safe(figure_dir)
    self.plt.savefig(filename, bbox_inches=0)
    logging.info("Figure saved as " + filename)
def test_train_exponential_decay(self):
    # Checks the learning rate after one trained epoch with an "Exponential"
    # scheduler (gamma=0.9) in two settings:
    #   1. with the default hparams, where the decay is expected to account
    #      for the artificially set trainer.total_epoch,
    #   2. with hparams.use_saved_learning_rate=False, where the decay is
    #      expected to start from scratch.
    # NOTE(review): Both checks compare floats with assertEqual (exact
    #               equality) - confirm this is robust on all platforms.

    # logging.basicConfig(level=logging.INFO)
    hparams = self._get_hparams()
    hparams.out_dir = os.path.join(
        hparams.out_dir,
        "test_train_exponential_decay")  # Add function name to path.
    hparams.epochs = 1
    hparams.model_type = None  # No model type, the fixture model below is used.

    # Copy the fixture model to the place where saved networks are stored.
    target_dir = os.path.join(hparams.out_dir, hparams.networks_dir)
    makedirs_safe(target_dir)
    shutil.copyfile(
        os.path.join("integration", "fixtures", "test_model_in409_out67.nn"),
        os.path.join(target_dir, hparams.model_name))
    # hparams.model_type = "RNNDYN-1_RELU_32-1_FC_{}".format(3 * hparams.num_coded_sps + 7)

    hparams.seed = 1234
    hparams.optimiser_args["lr"] = 0.001
    hparams.scheduler_type = "Exponential"
    hparams.scheduler_args["gamma"] = 0.9

    trainer = self._get_trainer(hparams)
    trainer.init(hparams)
    for group in trainer.model_handler.optimiser.param_groups:
        group.setdefault('initial_lr',
                         hparams.optimiser_args["lr"]
                         )  # Add missing initial_lr to all groups.
    trainer.total_epoch = 10  # Artificially set the total number higher to compute the decay.

    trainer.train(hparams)
    # Expected: gamma applied 11 * len(id_list_train) times to the initial lr,
    # i.e. the 10 preset epochs plus the one trained here.
    expected_lr = hparams.optimiser_args["lr"] * hparams.scheduler_args[
        "gamma"]**(11 * len(trainer.id_list_train))
    self.assertEqual(
        expected_lr, trainer.model_handler.optimiser.param_groups[0]["lr"],
        "Exponential decay was not computed based on total number of epochs. "
        "Which should be the case when hparams.use_saved_learning_rate=True."
    )

    # Try again with reset learning rate.
    trainer = self._get_trainer(hparams)
    hparams.use_saved_learning_rate = False
    trainer.init(hparams)
    for group in trainer.model_handler.optimiser.param_groups:
        group.setdefault('initial_lr',
                         hparams.optimiser_args["lr"]
                         )  # Add missing initial_lr to all groups.
    trainer.total_epoch = 10  # Artificially set the total number higher to compute the decay.

    trainer.train(hparams)
    # Expected: gamma applied only len(id_list_train) times, the preset
    # total_epoch must not influence the decay.
    expected_lr = hparams.optimiser_args["lr"] * hparams.scheduler_args[
        "gamma"]**len(trainer.id_list_train)
    self.assertEqual(
        expected_lr, trainer.model_handler.optimiser.param_groups[0]["lr"],
        "Exponential decay was not reset for this training loop, "
        "which should be the case when hparam.use_saved_learning_rate=False."
    )

    # Remove everything the test wrote.
    shutil.rmtree(hparams.out_dir)
def process_file(self, file, dir_audio, dir_out):
    """
    Apply self.highpass_filter to one audio file and save the result.

    :param file:       File name (may include sub-directories) relative to dir_audio.
    :param dir_audio:  Directory containing the input file.
    :param dir_out:    Directory where the filtered file is written under the same name.
    :return:           The filtered samples as returned by self.highpass_filter.
    """
    source_path = os.path.join(dir_audio, file)
    samples, sample_rate = soundfile.read(source_path)

    filtered = self.highpass_filter(samples, sample_rate)

    # Mirror the relative location of the input below dir_out.
    target_path = os.path.join(dir_out, file)
    makedirs_safe(os.path.dirname(target_path))
    soundfile.write(target_path, filtered, samplerate=sample_rate)

    return filtered
def save_checkpoint(self,
                    model_path: Union[str, os.PathLike],
                    best_loss: np.ndarray = np.inf,
                    epoch: int = None,
                    step: int = None,
                    save_as_best_model: bool = False,
                    save_as_epoch: bool = True,
                    save_as_last_model: bool = False,
                    save_as_step: bool = True):
    """
    Save model, optimiser and scheduler state into the directory model_path.

    The files are named params_<suffix>, optimiser_<suffix> and
    scheduler_<suffix>, where the suffix is selected in this order:
    "best", "last", "e<epoch>", "s<step>". The model configuration is always
    written to config.json in the same directory.

    :param model_path:          Directory in which the checkpoint files are stored.
    :param best_loss:           Stored alongside the optimiser state.
    :param epoch:               Epoch stored in the checkpoints, also used for the suffix.
    :param step:                Step stored in the checkpoints, also used for the suffix.
    :param save_as_best_model:  Use the suffix "best".
    :param save_as_epoch:       Allow the epoch suffix when epoch is given.
    :param save_as_last_model:  Use the suffix "last".
    :param save_as_step:        Allow the step suffix when step is given.
    :raises NotImplementedError: If no suffix can be selected with the given flags.
    """
    assert save_as_best_model or save_as_last_model or step is not None \
        or epoch is not None, "Epoch or step needs to be given."
    assert model_path is not None, "Given model_path cannot be None."

    if save_as_best_model:
        suffix = "best"
    elif save_as_last_model:
        suffix = "last"
    elif epoch is not None and save_as_epoch:
        suffix = "e{}".format(epoch)
    elif step is not None and save_as_step:
        suffix = "s{}".format(step)
    else:
        raise NotImplementedError()

    self.logger.info("Save {} checkpoint to {}.".format(suffix, model_path))
    makedirs_safe(model_path)

    config_json = self.model.get_config_as_json()
    # TODO: Dump hparams in it as well?
    with open(os.path.join(model_path, "config.json"), "w") as f:
        f.write(config_json)

    # The state dict is fetched once; the former else-branch fetched the
    # same state dict a second time without any effect.
    params = self.model.state_dict()
    if self.ema:
        # Update only the parameters which are shadowed.
        params.update(self.ema.shadow)
        self.logger.info("Updated checkpoint with EMA model parameters {}."
                         .format(", ".join(self.ema.shadow.keys())))

    checkpoint = {"params": params, "epoch": epoch, "step": step}
    torch.save(checkpoint, os.path.join(model_path, "params_" + suffix))

    if self.optimiser is not None:
        opt_params = self.optimiser.state_dict()
        checkpoint = {"params": opt_params, "epoch": epoch, "step": step,
                      "best_loss": best_loss}
        torch.save(checkpoint,
                   os.path.join(model_path, "optimiser_" + suffix))

    if self.scheduler is not None:
        scheduler_params = self.scheduler.state_dict()
        checkpoint = {"params": scheduler_params, "epoch": epoch,
                      "step": step}
        torch.save(checkpoint,
                   os.path.join(model_path, "scheduler_" + suffix))
def process_file(self, file, dir_audio, dir_out):
    """
    Remove the DC offset of one audio file, scale it to self.ref_rms and save it.

    The scaling factor is sqrt(len(raw) * ref_rms^2 / sum(raw^2)). Signals
    without any energy after mean removal (e.g. digital silence) cannot be
    scaled and are written unscaled; previously the division by zero turned
    all their samples into NaN.

    :param file:       File name (may include sub-directories) relative to dir_audio.
    :param dir_audio:  Directory containing the input file.
    :param dir_out:    Directory where the normalised file is written under the same name.
    :return:           The processed samples.
    """
    raw, fs = soundfile.read(os.path.join(dir_audio, file))

    raw -= raw.mean()  # Remove DC offset.
    energy = float((raw**2).sum())
    if energy > 0.0:
        # NOTE(review): len(raw) counts frames, not samples - presumably the
        #               input is single channel; confirm for multi-channel.
        raw *= math.sqrt(len(raw) * self.ref_rms**2 / energy)
    else:
        # Local import keeps this block independent of the file's imports.
        import logging
        logging.warning(
            "File {} contains no signal energy, skipping RMS scaling.".format(
                file))

    out_file = os.path.join(dir_out, file)
    makedirs_safe(os.path.dirname(out_file))
    soundfile.write(out_file, raw, samplerate=fs)

    return raw
def process_file(self, file, dir_audio, dir_out, silence_threshold_db=-50,
                 hop_size_ms=None):
    """
    Trim silence at both ends of one audio file but keep self.min_silence_ms of it.

    Silence is detected with librosa.effects.trim. If less than
    self.min_silence_ms of silence exists at one end, a warning is logged and
    nothing is cut at that end.

    :param file:                  File name (may include sub-directories) relative to dir_audio.
    :param dir_audio:             Directory containing the input file.
    :param dir_out:               Directory where the trimmed file is written under the same name.
    :param silence_threshold_db:  Threshold in dB, its absolute value is given to librosa as top_db.
    :param hop_size_ms:           Hop size in ms, defaults to min(self.min_silence_ms, 32).
    :return:                      The trimmed samples.
    """
    raw, fs = soundfile.read(os.path.join(dir_audio, file))

    window_length = AudioProcessing.fs_to_frame_length(fs)
    if hop_size_ms is None:
        hop_size_ms = min(self.min_silence_ms, 32)

    _, indices = librosa.effects.trim(
        raw,
        top_db=abs(silence_threshold_db),
        frame_length=window_length,
        hop_length=int(fs / 1000 * hop_size_ms))

    # Detected silence at both ends in milliseconds.
    silence_before_ms = indices[0] / fs * 1000
    silence_after_ms = (len(raw) - indices[1]) / fs * 1000

    # Front: cut everything except min_silence_ms, or nothing if the audio
    # starts too early.
    if silence_before_ms < self.min_silence_ms:
        # TODO: Find a robust way to create silence so that alignment still
        #       works (maybe concat mirrored segments).
        logging.warning(
            "File {} has only {} ms of silence in the beginning.".format(
                file, silence_before_ms))
        cut_before_ms = 0
    else:
        cut_before_ms = silence_before_ms - self.min_silence_ms

    # End: same rule, see TODO above.
    if silence_after_ms < self.min_silence_ms:
        logging.warning(
            "File {} has only {} ms of silence in the end.".format(
                file, silence_after_ms))
        cut_after_ms = 0
    else:
        cut_after_ms = silence_after_ms - self.min_silence_ms

    first_sample = int(cut_before_ms * fs / 1000)
    stop_sample = int(-cut_after_ms * fs / 1000 - 1)  # Negative, counted from the end.
    trimmed_raw = raw[first_sample:stop_sample]

    out_file = os.path.join(dir_out, file)
    makedirs_safe(os.path.dirname(out_file))
    soundfile.write(out_file, trimmed_raw, samplerate=fs)

    return trimmed_raw
def gen_data(dir_in, file_questions, dir_out=None, file_id_list="",
             id_list=None, return_dict=False):
    """
    Create question labels from HTK labels with HTSLabelNormalisation.

    :param dir_in:          Directory containing the HTK labels.
    :param file_questions:  Full file path to the question file.
    :param dir_out:         Directory to store the question labels, created if missing.
                            If None, no directory is created.
    :param file_id_list:    Name of the file containing the ids. Its base name (without
                            extension) is used to name the normalisation parameters.
    :param id_list:         The utterance ids to process, only their base name is used.
                            If None, all .lab files in dir_in are processed under the name "all".
    :param return_dict:     If true, all processed labels are returned as first output.
    :return:                The two normalisation parameters, preceded by the processed
                            labels if return_dict is true.
    """
    if id_list is None:
        # No ids given: take every .lab file in dir_in.
        label_files = glob.glob(os.path.join(dir_in, "*.lab"))
        id_list = [os.path.splitext(os.path.basename(label_file))[0]
                   for label_file in label_files]
        file_id_list_name = "all"
    else:
        file_id_list_name = os.path.splitext(
            os.path.basename(file_id_list))[0]
        # Ignore full path.
        id_list = ['{}'.format(os.path.basename(element))
                   for element in id_list]

    if dir_out is not None:
        makedirs_safe(dir_out)

    label_operator = HTSLabelNormalisation(file_questions)

    if not return_dict:
        norm_params = label_operator.perform_normalisation(
            file_id_list_name, id_list, dir_in, dir_out, return_dict=False)
        return norm_params[0], norm_params[1]

    label_dict, norm_params = label_operator.perform_normalisation(
        file_id_list_name, id_list, dir_in, dir_out, return_dict=True)
    return label_dict, norm_params[0], norm_params[1]
def process_file(self, file, dir_audio, dir_out):
    """
    Process one audio file with self.eng.runme and save the result.

    The samples are converted with self.nparray_to_matlab and transposed by
    self.eng before being handed to self.eng.runme (presumably a Matlab
    engine; confirm against the class setup).

    :param file:       File name (may include sub-directories) relative to dir_audio.
    :param dir_audio:  Directory containing the input file.
    :param dir_out:    Directory where the processed file is written under the same name.
    :return:           The output of self.eng.runme.
    """
    source_path = os.path.join(dir_audio, file)
    noisy, sample_rate = soundfile.read(source_path)

    noisy_converted = self.nparray_to_matlab(noisy)
    noisy_transposed = self.eng.transpose(noisy_converted)
    enhanced = self.eng.runme(noisy_transposed, sample_rate)

    target_path = os.path.join(dir_out, file)
    makedirs_safe(os.path.dirname(target_path))
    soundfile.write(target_path, enhanced, samplerate=sample_rate)

    return enhanced
def test_init_load(self):
    # Scenario: no model type is given, so trainer.init() has to load the
    # existing model copied from the fixtures.
    run_name = "test_init_load"
    fixture_model = os.path.join("integration", "fixtures",
                                 "test_model_in409_out67.nn")

    hparams = self._get_hparams()
    # Use a test-specific output directory (function name appended).
    hparams.out_dir = os.path.join(hparams.out_dir, run_name)
    hparams.model_type = None

    # Place the fixture model where the trainer looks for saved networks.
    networks_dir = os.path.join(hparams.out_dir, hparams.networks_dir)
    makedirs_safe(networks_dir)
    shutil.copyfile(fixture_model,
                    os.path.join(networks_dir, hparams.model_name))

    trainer = self._get_trainer(hparams)
    trainer.init(hparams)
    self.assertIsNotNone(trainer.model_handler.model)

    # Clean up everything the test wrote.
    shutil.rmtree(hparams.out_dir)
def _save_to_npz(file_path: os.PathLike, features: np.ndarray, feature_name: str) -> None: makedirs_safe(os.path.dirname(file_path)) if not file_path.endswith(".npz"): file_path += ".npz" file_path_backup = file_path + "_bak" clean_backup_file = False if os.path.isfile(file_path): saved_features = dict(np.load(file_path)) os.rename(file_path, file_path_backup) clean_backup_file = True if feature_name in saved_features: logging.info("Overriding {} in {}.".format( feature_name, file_path)) saved_features[feature_name] = features else: saved_features = {feature_name: features} try: np.savez(file_path, **saved_features) except: if os.path.isfile(file_path_backup): logging.error("Error when writing {}, restoring backup".format( file_path)) if os.path.isfile(file_path): os.remove(file_path) os.rename(file_path_backup, file_path) clean_backup_file = False else: logging.error("Error when writing {}.".format(file_path)) raise if clean_backup_file: os.remove(file_path_backup)
def main():
    """
    Warp the spectral features of fixed utterances with a range of alphas and synthesise them.

    For every id and every alpha in np.arange(-0.15, 0.2, 0.05) the sample is
    loaded, pre-processed, its coded spectral features are passed through a
    WarpingLayer, and the result is saved to
    <out_dir>/alpha_<alpha>/ and synthesised with WORLD. The time spent in the
    warping layer (forward and backward) is accumulated and printed.

    NOTE(review): The statement nesting was reconstructed from a
                  whitespace-collapsed source - confirm against the original.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "WarpingLayerTest.nn"  # Overwritten a few lines below.
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = "experiments/" + hparams.voice + "/VTLNArtificiallyWarped/"
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    # Load the normalisation parameters of the WORLD features.
    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    # NOTE(review): trainer is not used afterwards in this function -
    #               presumably constructed for its side effects; confirm.
    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        "experiments/" + hparams.voice + "/WORLD",
        "experiments/" + hparams.voice + "/questions", "ignored",
        hparams.num_questions, hparams)

    # Normalisation parameters of the spectral part only
    # (three times as many entries when deltas are added).
    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    id_list = ["p225/p225_051"]
    hparams.num_speakers = 1

    t_benchmark = 0  # Accumulated time of all warping layer calls.
    for id_name in id_list:
        for idx, alpha in enumerate(np.arange(-0.15, 0.2, 0.05)):
            out_dir = hparams.out_dir + "alpha_{0:0.2f}/".format(alpha)
            makedirs_safe(out_dir)

            sample = WorldFeatLabelGen.load_sample(
                id_name,
                os.path.join("experiments", hparams.voice, "WORLD"),
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)
            sample_pre = gen_in.preprocess_sample(sample)
            # Select the spectral features, the first columns of the sample.
            coded_sps = sample_pre[:,
                                   :hparams.num_coded_sps *
                                   (3 if hparams.add_deltas else 1)]
            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha  # One alpha per frame.

            coded_sps = coded_sps[:len(alpha_vec), None, ...].repeat(
                batch_size, 1)  # Copy data in batch dimension.
            alpha_vec = alpha_vec[:, None, None].repeat(
                batch_size, 1)  # Copy data in batch dimension.

            # Benchmark forward and backward pass of the warping layer.
            t_start = timer()
            mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                            None, (len(coded_sps), ),
                                            (len(coded_sps), ),
                                            alphas=torch.from_numpy(alpha_vec))
            mfcc_warped.sum().backward()
            t_benchmark += timer() - t_start

            assert ((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all()
                    )  # Compare results for cloned coded_sps within batch.
            # NOTE(review): alpha comes from np.arange with a float step and
            #               may never be exactly 0, so this check may never
            #               run - confirm.
            if alpha == 0:
                assert ((mfcc_warped == coded_sps).all()
                        )  # Compare results for no warping.
            # Write the warped features of the first batch entry back.
            sample_pre[:len(mfcc_warped), :hparams.num_coded_sps * (
                3 if hparams.add_deltas else 1)] = mfcc_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre)
            # Manually create samples without normalisation but with deltas.
            sample_pre = (sample_pre * gen_in.norm_params[1] +
                          gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))
            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre.tofile(
                os.path.join(out_dir,
                             id_name + WorldFeatLabelGen.ext_deltas))

            # Synthesise into the alpha specific directory.
            hparams.synth_dir = out_dir
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    # NOTE(review): idx is the last loop index, not the number of alphas, so
    #               the printed number of runs looks one alpha short - confirm.
    print("Process time for {} runs: {}".format(
        len(id_list) * idx, timedelta(seconds=t_benchmark)))
def gen_data(self,
             dir_in,
             dir_out=None,
             file_id_list=None,
             id_list=None,
             return_dict=False):
    """
    Prepare atom labels from wav files. If id_list is not None, only the ids listed there are generated,
    otherwise for each .wav file in the dir_in. Atoms are computed by the wcad algorithm. Extractions with
    50 or more atoms, or with an atom amplitude above 10, are flagged as possible failures: a warning is
    logged and they are excluded from the normalisation statistics, but their labels are still generated.
    The ids of all other extractions are written to "wcad_<file_id_list_name>.txt" next to dir_in.
    The algorithm also saves the extracted phrase component in dir_out/id_name + self.ext_phrase,
    if dir_out is not None.

    NOTE(review): The statement nesting was reconstructed from a whitespace-collapsed source - confirm
                  against the original.
    NOTE(review): The normalisation parameters are saved with os.path.join(dir_out, ...) without a check
                  for dir_out being None, although None is the default - confirm.

    :param dir_in:         Directory containing the original wav files.
    :param dir_out:        Directory where the labels are stored. If None, no labels are stored.
    :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                           this name to differentiate parameters between subsets.
    :param id_list:        The list of utterances to process.
                           Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                           If None, all wav files in dir_in are used.
    :param return_dict:    If True, returns an OrderedDict of all samples as first output.
    :return:               Returns mean, std_dev, min, max of atoms, preceded by the label dictionary
                           if return_dict is True.
    """

    # Fill file_id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
    if id_list is None:
        id_list = list()
        filenames = glob.glob(os.path.join(dir_in, "*.wav"))
        for filename in filenames:
            id_list.append(os.path.splitext(os.path.basename(filename))[0])
        file_id_list_name = "all"
    else:
        file_id_list_name = os.path.splitext(
            os.path.basename(file_id_list))[0]

    if dir_out is not None:
        makedirs_safe(dir_out)

    if return_dict:
        label_dict = OrderedDict()

    mean_std_ext_atom = MeanStdDevExtractor()
    min_max_ext_atom = MinMaxExtractor()
    # The phrase extractors are only referenced by commented-out code below.
    mean_std_ext_phrase = MeanStdDevExtractor()
    min_max_ext_phrase = MinMaxExtractor()

    # Compute Atoms.
    from wcad import WaveInput, PitchExtractor, MultiphraseExtractor, DictionaryGenerator, AtomExtrator, ModelCreator, ModelSaver, Params, Paths
    correct_utts = list()  # Ids for which the extraction looks plausible.
    self.logger.info("Create atom labels for " +
                     "[{0}]".format(", ".join(str(i) for i in id_list)))
    for id_name in id_list:
        self.logger.debug("Create atom labels for " + id_name)

        # Wcad has to be called in its root directory, therefore a change dir operation is necessary.
        # NOTE(review): The working directory is not restored if the extraction raises - confirm
        #               whether this is acceptable.
        cwd = os.getcwd()
        os.chdir(self.wcad_root)
        args = [dir_in + "/" + id_name + ".wav", dir_out]
        print(args)
        params = Params()
        # Overwrite the possible theta values by selected values.
        params.local_atoms_thetas = self.theta_interval
        params.k = [self.k]
        # params.min_atom_amp = 0.1
        paths = Paths(args, params)

        # Start the extraction process.
        start_t = time.time()
        waveform = WaveInput(paths.wav, params).read()
        pitch = PitchExtractor(waveform, params, paths).compute()

        # Compute the phrase component.
        phrase = MultiphraseExtractor(pitch, waveform, params,
                                      paths).compute()
        phrase_curve = phrase.curve

        # Extract atoms.
        dictionary = DictionaryGenerator(params, paths).compute()
        atoms = AtomExtrator(waveform, pitch, phrase, dictionary, params,
                             paths).compute()

        # Create a model.
        model = ModelCreator(phrase, atoms, pitch).compute()
        print(('Model created in %s seconds' % (time.time() - start_t)))

        # Save the atoms.
        ModelSaver(model, params, paths).save()
        os.chdir(cwd)

        # Check if output can be correct.
        possible_extraction_failure = False
        if len(atoms) < 50 and not any(a.amp > 10 for a in atoms):
            correct_utts.append(id_name)
        else:
            self.logger.warning("Possible fail of atom extractor for " +
                                id_name + " (atoms: " + str(len(atoms)) +
                                ", frames: " + str(len(phrase_curve)) +
                                ", max: " +
                                str(max(a.amp for a in atoms)) + ").")
            possible_extraction_failure = True

        atoms.sort(key=lambda x: x.position)
        # print_atoms(atoms)

        # Get audio length needed to trim the atoms.
        duration = self.get_audio_length(id_name, dir_in,
                                         self.frame_size_ms)

        # The algorithm generates a few atoms at negative positions,
        # pad them into the first atom at positive position.
        padded_amp = 0
        padded_theta = 0
        for idx, atom in enumerate(atoms):
            if atom.position < 0:
                # Collect amplitude and theta of atoms before the start.
                padded_amp += atom.amp
                padded_theta += atom.theta
            else:
                atoms[idx].amp += padded_amp  # Pad the amplitude.
                # Average theta over this atom and the idx removed ones.
                atoms[idx].theta = (atoms[idx].theta +
                                    padded_theta) / (idx + 1)
                del atoms[:idx]  # Remove the negative atoms from the list.
                break
        # print_atoms(atoms)

        # The algorithm might also generate a few atoms beyond the last label,
        # pad them into the last label.
        padded_amp = 0
        padded_theta = 0
        for idx, atom in reversed(list(enumerate(atoms))):
            if atom.position * self.frame_size_ms > duration:
                # Collect amplitude and theta of atoms beyond the end.
                padded_amp += atom.amp
                padded_theta += atom.theta
            else:
                atoms[idx].amp += padded_amp
                # Average theta over this atom and the removed ones.
                atoms[idx].theta = (atoms[idx].theta +
                                    padded_theta) / (len(atoms) - idx)
                # "or None" keeps the full list when nothing has to be removed.
                atoms = atoms[:-(len(atoms) - idx - 1)
                              or None]  # Remove atoms beyond last label.
                break
        # print_atoms(atoms)

        # Create a label for each frame (size of frame_size_ms) with amplitude and theta of contained atoms.
        np_atom_labels = AtomLabelGen.atoms_to_labels(
            atoms, self.theta_interval, int(duration / self.frame_size_ms))

        np_atom_amps = np.sum(np_atom_labels, axis=1)

        if not possible_extraction_failure:  # Only add successful extractions to mean and std_dev computation.
            mean_std_ext_atom.add_sample(
                np_atom_amps[np_atom_amps[:, 0] != 0.0]
            )  # Only compute std_dev from atoms.
            min_max_ext_atom.add_sample(np_atom_amps)
            # mean_std_ext_phrase.add_sample(phrase_curve)
            # min_max_ext_phrase.add_sample(phrase_curve)

        if return_dict:
            label_dict[id_name] = np_atom_labels
        if dir_out is not None:
            # Save phrase, because it might be used in synthesis.
            phrase_curve.astype('float32').tofile(
                os.path.join(dir_out, id_name + self.ext_phrase))

            # Save atoms binary (float32).
            np_atom_labels.astype('float32').tofile(
                os.path.join(dir_out, id_name + self.ext_atoms))

            # Create a readable version of the atom data.
            # np.savetxt(os.path.join(dir_out, id_name + self.ext_atoms + ".txt"), np_atom_labels)

    # Manually set mean of atoms to 0, otherwise frames without atom will have an amplitude.
    # NOTE(review): np_atom_amps is only defined if at least one id was processed - confirm
    #               that an empty id_list cannot reach the else branch.
    if mean_std_ext_atom.sum_length > 0:  # Make sure at least one atom was added.
        mean_std_ext_atom.sum_frames[:] = 0.0
    else:
        mean_std_ext_atom.sum_frames = np.zeros(np_atom_amps.shape[1:])
        mean_std_ext_atom.sum_squared_frames = np.zeros(
            np_atom_amps.shape[1:])
    # Overwrite the second entry of the squared sum with a value derived from the last theta.
    mean_std_ext_atom.sum_squared_frames[
        1] = mean_std_ext_atom.sum_length * self.theta_interval[-1]

    mean_std_ext_atom.save(os.path.join(dir_out, file_id_list_name))
    min_max_ext_atom.save(os.path.join(dir_out, file_id_list_name))
    # mean_std_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))
    # min_max_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))

    mean_atoms, std_atoms = mean_std_ext_atom.get_params()
    min_atoms, max_atoms = min_max_ext_atom.get_params()
    # mean_phrase, std_phrase = mean_std_ext_phrase.get_params()
    # min_phrase, max_phrase = min_max_ext_atom.get_params()

    # Use this block to save the part of the file_id_list for which atom extraction was successful into a new file.
    if correct_utts:
        with open(
                os.path.join(
                    os.path.dirname(dir_in),
                    "wcad_" + os.path.basename(file_id_list_name) +
                    ".txt"), 'w') as f:
            f.write('\n'.join(correct_utts) + '\n')

    if return_dict:
        # Return dict of labels for all utterances.
        return label_dict, \
               mean_atoms, std_atoms, \
               min_atoms, max_atoms
        # mean_phrase, std_phrase, \
        # min_phrase, max_phrase
    else:
        return mean_atoms, std_atoms, \
               min_atoms, max_atoms
def process_file(self, file, dir_audio, dir_out, silence_threshold_db=-50,
                 hop_size_ms=None):
    """
    Cut silence from the start and end of one audio file, leaving self.min_silence_ms.

    The non-silent region is found with librosa.effects.trim. An end with
    less than self.min_silence_ms of silence is left untouched and a warning
    is logged.

    :param file:                  File name (may include sub-directories) relative to dir_audio.
    :param dir_audio:             Directory containing the input file.
    :param dir_out:               Directory where the trimmed file is written under the same name.
    :param silence_threshold_db:  Threshold in dB, its absolute value is given to librosa as top_db.
    :param hop_size_ms:           Hop size in ms, defaults to min(self.min_silence_ms, 32).
    :return:                      The trimmed samples.
    """
    audio, fs = soundfile.read(os.path.join(dir_audio, file))

    frame_length = WorldFeatLabelGen.fs_to_frame_length(fs)
    if hop_size_ms is None:
        hop_size_ms = min(self.min_silence_ms, 32)
    hop_length = int(fs / 1000 * hop_size_ms)

    _, voiced_range = librosa.effects.trim(audio,
                                           top_db=abs(silence_threshold_db),
                                           frame_length=frame_length,
                                           hop_length=hop_length)

    lead_ms = voiced_range[0] / fs * 1000
    tail_ms = (len(audio) - voiced_range[1]) / fs * 1000

    # Audio starts too early: keep the whole beginning.
    # TODO: Find a robust way to create silence so that HTK alignment still
    #       works (maybe concat mirrored segments). An earlier attempt which
    #       repeated existing silence segments was disabled.
    lead_too_short = lead_ms < self.min_silence_ms
    if lead_too_short:
        logging.warning(
            "File {} has only {} ms of silence in the beginning.".format(
                file, lead_ms))
    trim_start = 0 if lead_too_short else lead_ms - self.min_silence_ms

    # Audio ends too late: keep the whole end.
    tail_too_short = tail_ms < self.min_silence_ms
    if tail_too_short:
        logging.warning(
            "File {} has only {} ms of silence in the end.".format(
                file, tail_ms))
    trim_end = 0 if tail_too_short else tail_ms - self.min_silence_ms

    # Trim the sound, the end index is negative and counted from the back.
    slice_begin = int(trim_start * fs / 1000)
    slice_end = int(-trim_end * fs / 1000 - 1)
    trimmed_raw = audio[slice_begin:slice_end]

    # Save trimmed sound to file.
    out_file = os.path.join(dir_out, file)
    makedirs_safe(os.path.dirname(out_file))
    soundfile.write(out_file, trimmed_raw, samplerate=fs)

    return trimmed_raw
def main():
    """
    Command line entry point: remove silence from all listed audio files.

    Reads the ids from the given file id list, creates the output directory
    and hands everything to SilenceRemover.process_list().
    """
    logging.basicConfig(level=logging.DEBUG)

    arg_parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Required locations.
    arg_parser.add_argument("-w", "--dir_wav",
                            dest="dir_wav", type=str, required=True,
                            help="Directory containing the wav files.")
    arg_parser.add_argument("-o", "--dir_out",
                            dest="dir_out", type=str, required=True,
                            help="Directory to save the trimmed files.")
    arg_parser.add_argument("-f", "--file_id_list",
                            dest="file_id_list", type=str, required=True,
                            help="Full path to file containing the ids.")

    # Optional settings.
    arg_parser.add_argument("--format",
                            dest="format", type=str, required=False,
                            default='wav',
                            help="Format of the audio file, e.g. WAV.")
    arg_parser.add_argument(
        "--silence_db",
        dest="silence_threshold_db", type=int, required=False, default=-50,
        help="Threshold until which a frame is considered to be silent.")
    arg_parser.add_argument(
        "--chunk_size",
        dest="chunk_size_ms", type=int, required=False, default=10,
        help="Size of the chunk (frame size) in ms on which db is computed.")
    arg_parser.add_argument(
        "--min_silence_ms",
        dest="min_silence_ms", type=int, required=False, default=200,
        help=
        "Milliseconds of silence which are always kept in front and back of audio file."
    )

    cli_args = arg_parser.parse_args()

    # One id per line, surrounding whitespace and line endings are removed.
    with open(cli_args.file_id_list) as id_file:
        ids = [line.strip(' \t\n\r') for line in id_file.readlines()]

    # Create output directory if missing.
    makedirs_safe(cli_args.dir_out)

    # Start silence removal.
    remover = SilenceRemover(cli_args.min_silence_ms)
    remover.process_list(ids, cli_args.dir_wav, cli_args.dir_out,
                         cli_args.format, cli_args.silence_threshold_db,
                         cli_args.chunk_size_ms)
def main():
    """Create samples with artificial alpha for each phoneme.

    The voice name is read from ``sys.argv[1]``. For every utterance of the
    hard-coded speaker in the adaptation id list, the normalised coded
    spectral features are warped by a WarpingLayer with a fixed random alpha
    per phoneme. The warped features are written to
    ``experiments/<voice>/WORLD_artificially_warped/cmp_mgc<num_coded_sps>``,
    the first few utterances are synthesised with WORLD for a manual check,
    and the normalisation files of the original features are copied next to
    the warped ones.

    :raises IndexError:           If no voice is given on the command line.
    :raises NotImplementedError:  If the question set size is unknown.
    :raises ValueError:           If no id of the speaker is found or if the
                                  warped features contain nan values.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    alpha_range = 0.2
    num_phonemes = 70
    num_random_alphas = 7  # Only used by the commented alternative below.
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme. The seed makes the mapping reproducible.
    np.random.seed(42)
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = ((np.random.rand(num_phonemes) - 0.5) * 2 * alpha_range)

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice, "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    print("Create artificially warped MGCs for {} in {} for {} questions, {} random alphas, and an alpha range of {}."
          .format(hparams.voice,
                  hparams.out_dir,
                  hparams.num_questions,
                  len(np.unique(phonemes_to_alpha_tensor)),
                  alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(os.path.join("experiments", hparams.voice, "WORLD"),
                                   os.path.join("experiments", hparams.voice, "questions"),
                                   "ignored",
                                   hparams.num_questions,
                                   hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check warping manually.

    # Number of leading feature columns which hold the coded spectrum (with deltas if used).
    num_sp_features = hparams.num_coded_sps * (3 if hparams.add_deltas else 1)
    sp_mean = gen_in.norm_params[0][:num_sp_features]
    sp_std_dev = gen_in.norm_params[1][:num_sp_features]
    wl = WarpingLayer((hparams.num_coded_sps,), (hparams.num_coded_sps,), hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        # The builtin int is used as dtype because the np.int alias was
        # removed from NumPy.
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=int)
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=int)
        else:
            raise NotImplementedError("Unknown question set with {} questions.".format(questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(questions, indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    id_list_path = os.path.join(hparams.data_dir, "file_id_list_{}_adapt.txt".format(hparams.voice))
    with open(id_list_path) as f:
        # Keep only the ids of the selected speaker and trim line endings.
        id_list = [s.strip(' \t\n\r') for s in f if speaker in s]
    if not id_list:
        # Fail early with a clear message instead of a division by zero in the final report.
        raise ValueError("No ids of speaker {} found in {}.".format(speaker, id_list_path))

    out_dir = hparams.out_dir
    dir_warped = os.path.join(out_dir, "cmp_mgc" + str(hparams.num_coded_sps))
    makedirs_safe(out_dir)
    makedirs_safe(dir_warped)

    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):
        sample = WorldFeatLabelGen.load_sample(id_name,
                                               os.path.join("experiments", hparams.voice, "WORLD"),
                                               add_deltas=True,
                                               num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :num_sp_features]

        questions = QuestionLabelGen.load_sample(id_name,
                                                 os.path.join("experiments", hparams.voice, "questions"),
                                                 num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        # Look up the artificial alpha of the phoneme active in each frame.
        alpha_vec = phonemes_to_alpha_tensor[phoneme_indices % len(phonemes_to_alpha_tensor), None]

        coded_sps = coded_sps[:len(alpha_vec), None, ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None, None]  # Create a batch and feature dimension.

        t_start = timer()
        mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                        None,
                                        (len(coded_sps),),
                                        (len(coded_sps),),
                                        alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start
        sample_pre[:len(mfcc_warped), :num_sp_features] = mfcc_warped[:, 0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1] + gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError("Detected nan values in output features for {}.".format(id_name))

        # Compute error between warped version and original one.
        org_to_warped_mcd += metrics.melcd(sample[:, 0:hparams.num_coded_sps],
                                           sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(os.path.join(dir_warped,
                                       os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesize a few of samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}".format(
        len(id_list), timedelta(seconds=t_benchmark), org_to_warped_mcd / len(id_list)))

    # Copy normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        norm_file_name = MeanCovarianceExtractor.file_name_appendix + feature + ".bin"
        shutil.copyfile(os.path.join(gen_in.dir_labels, gen_in.dir_deltas, norm_file_name),
                        os.path.join(dir_warped, norm_file_name))
def _get_test_dir(self):
    """Return the output directory of this test class, creating it if needed.

    The directory is located next to this file and carries the name of the
    class of the instance.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    class_name = type(self).__name__
    test_dir = os.path.join(module_dir, class_name)
    makedirs_safe(test_dir)
    return test_dir
def gen_data(dir_in, dir_out=None, file_id_list="", id_list=None, return_dict=False):
    """
    Prepare durations from HTK labels (forced-aligned).
    Each numpy array has the dimension num_phonemes x PhonemeDurationLabelGen.num_states (default num_state=5).

    :param dir_in:         Directory where the HTK label files are stored (usually named label_state_align).
    :param dir_out:        Main directory where the labels and normalisation parameters are saved to.
                           If None, neither labels nor normalisation parameters are saved.
    :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                           this name to differentiate parameters between subsets.
    :param id_list:        The list of utterances to process.
                           Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                           Only the base name of each entry is used.
                           If None, the ids are taken from the .wav files found in dir_in.
    :param return_dict:    If true, returns an OrderedDict of all samples as first output.
    :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                           all processed labels in an OrderedDict followed by the two normalisation parameters.
    """

    # Fill id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
    if id_list is None:
        # NOTE(review): dir_in is documented as the HTK label directory but is searched
        # for .wav files here - confirm that audio files are stored alongside the labels.
        id_list = [os.path.splitext(os.path.basename(filename))[0]
                   for filename in glob.glob(os.path.join(dir_in, "*.wav"))]
        file_id_list_name = "all"
    else:
        file_id_list_name = os.path.splitext(os.path.basename(file_id_list))[0]
        id_list = ['{}'.format(os.path.basename(element)) for element in id_list]  # Ignore full path.

    # Create directories in dir_out if it is given.
    if dir_out is not None:
        makedirs_safe(dir_out)

    # Create the return dictionary if required.
    if return_dict:
        label_dict = OrderedDict()

    # Create normalisation computation units.
    norm_params_ext_dur = MeanStdDevExtractor()

    logging.info("Extract phoneme durations for " + "[{0}]".format(", ".join(str(i) for i in id_list)))
    for file_name in id_list:
        logging.debug("Extract phoneme durations from " + file_name)

        # The first two columns of each label line are the start and end time of a state.
        with open(os.path.join(dir_in, file_name + PhonemeDurationLabelGen.ext_phonemes), 'r') as f:
            htk_labels = [line.rstrip('\n').split()[:2] for line in f]
            timings = np.array(htk_labels, dtype=np.float32) / PhonemeDurationLabelGen.min_phoneme_length
            dur = timings[:, 1] - timings[:, 0]
            # One row per phoneme, one column per state.
            dur = dur.reshape(-1, PhonemeDurationLabelGen.num_states).astype(np.float32)

        if return_dict:
            label_dict[file_name] = dur
        if dir_out is not None:
            dur.tofile(os.path.join(dir_out, file_name + PhonemeDurationLabelGen.ext_durations))

        # Add sample to normalisation computation unit.
        norm_params_ext_dur.add_sample(dur)

    # Save mean and std dev of all features. Nothing can be saved without an
    # output directory; os.path.join would raise a TypeError for dir_out=None.
    if dir_out is not None:
        norm_params_ext_dur.save(os.path.join(dir_out, file_id_list_name))

    # Get normalisation parameters.
    norm_first, norm_second = norm_params_ext_dur.get_params()

    if return_dict:
        # Return dict of labels for all utterances.
        return label_dict, norm_first, norm_second
    else:
        return norm_first, norm_second
def setUpClass(cls):
    """Create the class specific output directory and store it in cls.out_dir.

    The directory is located next to this file and is named after the class
    of a freshly created instance of cls.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    class_name = type(cls()).__name__
    cls.out_dir = os.path.join(module_dir, class_name)
    # Create class name directory.
    makedirs_safe(cls.out_dir)
def run_world_synth(synth_output, hparams):
    """
    Run the WORLD synthesize method for all given outputs and save the audio files.

    Each file is named <basename of id>[_<model_name>]<synth_file_suffix>_WORLD.<synth_ext>.
    It is always written as wav first and converted with PyDub afterwards if another
    format is requested.

    :param synth_output:   Dictionary from utterance id to the features of that utterance.
                           The features are split by WorldFeatLabelGen.convert_to_world_features
                           with contains_deltas=False.
    :param hparams:        Hyper-parameter container. The following parameters are used:
                           synth_fs, num_coded_sps, sp_type, do_post_filtering,
                           model_name, synth_file_suffix, synth_ext,
                           synth_dir / out_dir (first one which is not None is the save
                           directory, otherwise the current directory is used),
                           preemphasize, f0_silence_threshold, lf0_zero (optional,
                           forwarded to WorldFeatLabelGen.world_features_to_raw if present).
    """
    fft_size = pyworld.get_cheaptrick_fft_size(hparams.synth_fs)

    save_dir = hparams.synth_dir if hparams.synth_dir is not None\
        else hparams.out_dir if hparams.out_dir is not None\
        else os.path.curdir
    # Create the directory the files are actually written to. This has to be
    # save_dir, because hparams.synth_dir can be None when a fallback is used.
    makedirs_safe(save_dir)

    # Optional vocoder arguments are the same for all utterances.
    vocoder_args = dict()
    for attr in "preemphasize", "f0_silence_threshold", "lf0_zero":
        if hasattr(hparams, attr):
            vocoder_args[attr] = getattr(hparams, attr)

    model_suffix = "_" + hparams.model_name if hparams.model_name is not None else ""

    for id_name, output in synth_output.items():
        logging.info("Synthesise {} with the WORLD vocoder.".format(id_name))

        coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(
            output,
            contains_deltas=False,
            num_coded_sps=hparams.num_coded_sps)
        amp_sp = WorldFeatLabelGen.decode_sp(
            coded_sp,
            hparams.sp_type,
            hparams.synth_fs,
            post_filtering=hparams.do_post_filtering).astype(np.double, copy=False)

        waveform = WorldFeatLabelGen.world_features_to_raw(
            amp_sp, lf0, vuv, bap,
            fs=hparams.synth_fs,
            n_fft=fft_size,
            **vocoder_args)

        # Always save as wav file first and convert afterwards if necessary.
        file_path = os.path.join(save_dir, "{}{}{}{}".format(os.path.basename(id_name),
                                                             model_suffix,
                                                             hparams.synth_file_suffix,
                                                             "_WORLD"))
        soundfile.write(file_path + ".wav", waveform, hparams.synth_fs)

        # Use PyDub for special audio formats.
        if hparams.synth_ext.lower() != 'wav':
            as_wave = pydub.AudioSegment.from_wav(file_path + ".wav")
            exported = as_wave.export(file_path + "." + hparams.synth_ext, format=hparams.synth_ext)
            exported.close()
            # The intermediate wav file is not needed any more.
            os.remove(file_path + ".wav")
def run_DM_AM(hparams, input_strings):
    """
    A function for TTS with a pre-trained duration and acoustic model.

    All intermediate files (text, labels, durations, questions) are created in a
    temporary directory which is removed again when the function returns.
    The given hparams container is modified in place (among others out_dir,
    model_path, model_name, load_from_checkpoint, batch_size_test, test_set_perc,
    val_set_perc, phoneme_label_type and output_norm_params_file_prefix are overwritten).

    :param hparams:            Hyper-parameter container. The following parameters are used:
                               front_end:                    Full path to the makeLabels.sh script in scripts/tts_frontend, depends on the language.
                               festival_dir:                 Full path to the directory with the festival bin/ folder.
                               front_end_accent (optional):  Give an accent to the front_end, used in tts_frontend.
                               duration_labels_dir:          Full path to the folder containing the normalisation parameters used to train the duration model.
                               duration_norm_file_name (optional):  Used as output_norm_params_file_prefix of the duration model if present.
                               file_symbol_dict:             A file containing all the used phonemes (has been used to train the duration model, usually mono_phone.list).
                               duration_model:               Full path to the pre-trained duration model.
                               num_phoneme_states:           Number of states per phoneme, for each a duration is predicted by the duration model.
                               question_file:                Full path to question file used to train the acoustic model.
                               question_labels_norm_file:    Full path to normalisation file of questions used to train the acoustic model.
                               world_features_dir:           Passed as first argument to the AcousticModelTrainer.
                               num_questions:                Number of questions which form the input dimension to the acoustic model.
                               acoustic_model:               Full path to acoustic model.
                               synth_dir:                    Only read for the final log message.
    :param input_strings:      Sequence of texts to synthesise. The text at index i is processed with the id "synth<i>".
    :return:                   Always 0.
    """
    # Create a temporary directory to store all files.
    with tempfile.TemporaryDirectory() as tmp_dir_name:
        # tmp_dir_name = os.path.realpath("TMP")
        # makedirs_safe(tmp_dir_name)
        # NOTE(review): out_dir points into the temporary directory from here on;
        # presumably hparams.synth_dir has to point elsewhere for the audio to
        # survive the clean-up - confirm against the callers.
        hparams.out_dir = tmp_dir_name
        print("Created temporary directory", tmp_dir_name)
        id_list = ["synth" + str(idx) for idx in range(len(input_strings))]

        # Write the text to synthesise into a single synth.txt file with ids.
        # Each line has the form "<id>\t<text>".
        utts_file = os.path.join(tmp_dir_name, "synth.txt")
        with open(utts_file, "w") as text_file:
            for idx, text in enumerate(input_strings):
                text_file.write("synth{}\t{}\n".format(idx, text))  # TODO: Remove parenthesis etc.

        # Call the front end on the synth.txt file.
        # The accent is only passed if it is set, the output directory is always the last argument.
        front_end_arguments = [hparams.front_end, hparams.festival_dir, utts_file]
        if hasattr(hparams, "front_end_accent") and hparams.front_end_accent is not None:
            front_end_arguments.append(hparams.front_end_accent)
        front_end_arguments.append(tmp_dir_name)
        subprocess.check_call(front_end_arguments)

        # Remove durations from mono labels.
        # The labels/mono directory is moved to mono_no_align, then each file is
        # rewritten so that only every third whitespace separated token is kept.
        dir_mono_no_align = os.path.join(tmp_dir_name, "mono_no_align")
        dir_mono = os.path.join(tmp_dir_name, "labels", "mono")

        if os.path.isdir(dir_mono_no_align):
            shutil.rmtree(dir_mono_no_align)
        os.rename(dir_mono, dir_mono_no_align)

        for id_name in id_list:
            with open(os.path.join(dir_mono_no_align, id_name + ".lab"), "r") as f:
                old = f.read()
                monophones = old.split()[2::3]
            with open(os.path.join(dir_mono_no_align, id_name + ".lab"), "w") as f:
                f.write("\n".join(monophones))

        # Run duration model.
        # All utterances are processed in one test batch, no data is held out.
        hparams.batch_size_test = len(input_strings)
        hparams.test_set_perc = 0.0
        hparams.val_set_perc = 0.0
        hparams.phoneme_label_type = "mono_no_align"
        hparams.output_norm_params_file_prefix = hparams.duration_norm_file_name if hasattr(
            hparams, "duration_norm_file_name") else None
        duration_model_trainer = DurationModelTrainer(os.path.join(tmp_dir_name, "mono_no_align"),
                                                      hparams.duration_labels_dir,
                                                      id_list,
                                                      hparams.file_symbol_dict,
                                                      hparams)
        assert hparams.duration_model is not None, "Path to duration model in hparams.duration_model is needed."
        hparams.model_path = hparams.duration_model
        hparams.model_name = os.path.basename(hparams.duration_model)

        # Predict durations. Durations are already converted to multiples of hparams.min_phoneme_length.
        hparams.load_from_checkpoint = True
        duration_model_trainer.init(hparams)
        _, output_dict_post = duration_model_trainer.forward(hparams, id_list)
        hparams.output_norm_params_file_prefix = None  # Reset again.

        # Write duration to full labels.
        # Every full label is expanded to num_phoneme_states lines of the form
        # "<start>\t<end>\t<label>[<state + 2>]" with accumulated predicted timings.
        dir_full = os.path.join(tmp_dir_name, "labels", "full")
        dir_label_state_align = os.path.join(tmp_dir_name, "labels", "label_state_align")
        makedirs_safe(dir_label_state_align)
        for id_name in id_list:
            with open(os.path.join(dir_full, id_name + ".lab"), "r") as f:
                full = f.read().split()[2::3]
            with open(os.path.join(dir_label_state_align, id_name + ".lab"), "w") as f:
                current_time = 0
                timings = output_dict_post[id_name]
                for idx, monophone in enumerate(full):
                    for state in range(hparams.num_phoneme_states):
                        next_time = current_time + int(timings[idx, state])
                        f.write("{}\t{}\t{}[{}]\n".format(current_time, next_time, monophone, state + 2))
                        current_time = next_time

        # Generate questions from HTK full labels.
        QuestionLabelGen.gen_data(dir_label_state_align,
                                  hparams.question_file,
                                  dir_out=tmp_dir_name,
                                  file_id_list="synth",
                                  id_list=id_list,
                                  return_dict=False)

        # Run acoustic model and synthesise.
        shutil.copy2(hparams.question_labels_norm_file, tmp_dir_name + "/min-max.bin")  # Get normalisation parameters in same directory.
        acoustic_model_trainer = AcousticModelTrainer(hparams.world_features_dir,
                                                      tmp_dir_name,
                                                      id_list,
                                                      hparams.num_questions,
                                                      hparams)
        assert hparams.acoustic_model is not None, "Path to acoustic model in hparams.acoustic_model is needed."
        hparams.model_path = hparams.acoustic_model
        hparams.model_name = os.path.basename(hparams.acoustic_model)
        hparams.load_from_checkpoint = True
        acoustic_model_trainer.init(hparams)
        hparams.model_name = ""  # No suffix in synthesised files.
        _, output_dict_post = acoustic_model_trainer.synth(hparams, id_list)

        logging.info("Synthesized files are in {}.".format(hparams.synth_dir))

    return 0
dir_questions = "questions" dir_world = os.path.realpath("WORLD") thetas = np.arange(0.03, 0.155, 0.03) dir_atoms = "wcad-" + "_".join(map("{:.3f}".format, thetas)) if extract_features: # Generate labels. # # shutil.rmtree(dir_labels) # makedirs_safe(dir_labels) logging.warning("Label files are not recreated.") # TODO: Possible implementation at TTSModel.run_DM_AM(). # Generate durations logging.info("Create duration files.") shutil.rmtree(dir_dur) makedirs_safe(dir_dur) PhonemeDurationLabelGen.gen_data(dir_labels, dir_dur, id_list=id_list) # Generate questions. logging.info("Create question files.") shutil.rmtree(dir_questions) makedirs_safe(dir_questions) QuestionLabelGen.gen_data(dir_labels, "questions-en-radio_dnn_400.hed", dir_questions, id_list=id_list) # Generate WORLD features. logging.info("Create WORLD files.") shutil.rmtree(dir_world) makedirs_safe(dir_world)
def setUpClass(cls):
    """Prepare the shared test fixtures of the class.

    Creates the output directory given by the hyper-parameters of a fresh
    instance and stores the list of test ids in cls.id_list.
    """
    instance = cls()
    class_hparams = cls._get_hparams(instance)
    # Create class name directory.
    makedirs_safe(class_hparams.out_dir)

    # Load test data
    cls.id_list = cls._get_id_list()
def main():
    """Train a VTLN model on top of a pre-trained baseline and synthesise samples.

    Sets up the hyper-parameters, copies the pre-net model of the baseline
    experiment into the network directory of this experiment, runs training
    and benchmark, and finally synthesises a few utterances of the train,
    val and test subsets with the WORLD vocoder.
    """
    logging.basicConfig(level=logging.INFO)

    hparams = VTLNTrainer.create_hparams()  # TODO: Parse input for hparams.

    # General parameters.
    hparams.num_questions = 609
    hparams.voice = "English"
    hparams.work_dir = os.path.realpath(os.path.join("experiments", hparams.voice))
    hparams.data_dir = os.path.realpath("database")
    hparams.out_dir = os.path.join(hparams.work_dir, "VTLNModel")
    hparams.num_speakers = 33
    hparams.speaker_emb_dim = 128
    hparams.frame_size_ms = 5
    hparams.seed = 1234
    hparams.num_coded_sps = 30
    hparams.add_deltas = True

    # Training parameters.
    hparams.epochs = 15
    hparams.use_gpu = True
    hparams.train_pre_net = True
    hparams.dropout = 0.05
    hparams.batch_size_train = 2
    hparams.batch_size_val = hparams.batch_size_train
    hparams.batch_size_benchmark = hparams.batch_size_train
    hparams.grad_clip_norm_type = 2
    hparams.grad_clip_max_norm = 1.0
    hparams.use_saved_learning_rate = False
    hparams.optimiser_args["lr"] = 0.001
    hparams.optimiser_type = "Adam"
    hparams.scheduler_type = "Plateau"
    hparams.scheduler_args["patience"] = 5
    hparams.start_with_test = True
    hparams.epochs_per_checkpoint = 5
    hparams.save_final_model = True
    hparams.use_best_as_final_model = True

    # Model selection; set model_type to None to load an existing model instead.
    hparams.model_type = "VTLN"
    hparams.model_name = "VTLN-emb_all.nn"
    hparams.pre_net_model_name = "Bds-emb_all-dropout05-lr001.nn"
    hparams.pass_embs_to_pre_net = True
    hparams.f_get_emb_index = (vctk_utils.id_name_to_speaker_English,)

    # Training.
    # The pre-net is taken from the baseline experiment and has to be located
    # in the network directory of this experiment.
    nn_dir = os.path.join(hparams.out_dir, "nn")
    makedirs_safe(nn_dir)
    pre_net_source = os.path.join(hparams.work_dir, "BaselineModel", "nn", hparams.pre_net_model_name)
    pre_net_target = os.path.join(nn_dir, hparams.pre_net_model_name)
    logging.info("Copy {} to {}.".format(pre_net_source, pre_net_target))
    shutil.copyfile(pre_net_source, pre_net_target)

    trainer = VTLNTrainer(hparams)
    trainer.init(hparams)
    trainer.train(hparams)
    trainer.benchmark(hparams)

    # Synthesis of a few utterances per subset.
    # hparams.synth_gen_figure = False
    hparams.synth_vocoder = "WORLD"
    synth_list = {
        "train": ["p225/p225_010", "p226/p226_010", "p239/p239_010"],
        "val": ["p225/p225_051", "p226/p226_009", "p239/p239_066"],
        "test": ["p225/p225_033", "p226/p226_175", "p239/p239_056"],
    }
    # with open(os.path.join(hparams.data_dir, "file_id_list_English_listening_test.txt" + sys.argv[1])) as f:
    #     id_list_val = f.readlines()
    # synth_list["val"] = [s.strip(' \t\n\r') for s in id_list_val]  # Trim line endings in-place.

    for subset, subset_ids in synth_list.items():
        # The subset name becomes part of the file names of the synthesised audio.
        hparams.synth_file_suffix = "_" + str(subset)
        trainer.synth(hparams, subset_ids)
def main():
    """Warp the spectral features of test utterances with a range of fixed alphas.

    For each utterance in the hard-coded id list and each alpha in
    np.arange(-0.2, 0.21, 0.05) the normalised coded spectral features are
    warped by an AllPassWarpModel. The warped features (with deltas, without
    normalisation) are written to
    ``experiments/English/VTLNArtificiallyWarped/alpha_<alpha>`` and
    synthesised with the WORLD vocoder. The time spent in the forward and
    backward pass of the model is accumulated and reported at the end.

    :raises ValueError:  If the warped features contain nan values.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "AllPassWarpModelTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = os.path.join("experiments", hparams.voice, "VTLNArtificiallyWarped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "all_pass_warp_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    # hparams.add_hparam("warp_matrix_size", hparams.num_coded_sps)
    hparams.alpha_ranges = [0.2, ]

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps,
                               num_bap=hparams.num_bap)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer("experiments/" + hparams.voice + "/WORLD",
                                   "experiments/" + hparams.voice + "/questions",
                                   "ignored",
                                   hparams.num_questions,
                                   hparams)

    # Number of leading feature columns which hold the coded spectrum (with deltas if used).
    num_sp_features = hparams.num_coded_sps * (3 if hparams.add_deltas else 1)
    sp_mean = gen_in.norm_params[0][:num_sp_features]
    sp_std_dev = gen_in.norm_params[1][:num_sp_features]
    all_pass_warp_model = AllPassWarpModel((hparams.num_coded_sps,), (hparams.num_coded_sps,), hparams)
    all_pass_warp_model.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    # id_list = ["p225/p225_051", "p277/p277_012", "p278/p278_012", "p279/p279_012"]
    id_list = ["p225/p225_051"]

    t_benchmark = 0
    # Count the warpings explicitly; the last loop index is one less than the
    # number of alphas and must not be used as the number of runs.
    num_runs = 0
    for id_name in id_list:
        sample = WorldFeatLabelGen.load_sample(id_name,
                                               os.path.join("experiments", hparams.voice, "WORLD"),
                                               add_deltas=True,
                                               num_coded_sps=hparams.num_coded_sps,
                                               num_bap=hparams.num_bap,
                                               sp_type=hparams.sp_type)
        sample_pre = gen_in.preprocess_sample(sample)
        # Work on a copy because sample_pre is overwritten with the warped features below.
        coded_sps = sample_pre[:, :num_sp_features].copy()
        coded_sps = coded_sps[:, None, ...].repeat(batch_size, 1)  # Copy data in batch dimension.

        for alpha in np.arange(-0.2, 0.21, 0.05):
            out_dir = os.path.join(hparams.out_dir, "alpha_{0:0.2f}".format(alpha))
            makedirs_safe(out_dir)

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha
            alpha_vec = alpha_vec[:, None].repeat(batch_size, 1)  # Copy data in batch dimension.

            # The benchmark covers the forward and the backward pass.
            t_start = timer()
            sp_warped, (_, nn_alpha) = all_pass_warp_model(torch.from_numpy(coded_sps.copy()),
                                                           None,
                                                           (len(coded_sps),),
                                                           (len(coded_sps),),
                                                           alphas=torch.tensor(alpha_vec, requires_grad=True))
            sp_warped.sum().backward()
            t_benchmark += timer() - t_start
            num_runs += 1
            # assert((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all())  # Compare results for cloned coded_sps within batch.
            if np.isclose(alpha, 0):
                assert np.isclose(sp_warped.detach().cpu().numpy(), coded_sps).all()  # Compare no warping results.
            sample_pre[:len(sp_warped), :num_sp_features] = sp_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre, apply_mlpg=False)
            # Manually create samples without normalisation but with deltas.
            sample_pre_with_deltas = (sample_pre * gen_in.norm_params[1] + gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre_with_deltas).any():
                raise ValueError("Detected nan values in output features for {}.".format(id_name))

            # Save warped features. The id can contain a speaker sub-directory.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre_with_deltas.tofile(os.path.join(out_dir, id_name + "." + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            # sample_no_deltas = WorldFeatLabelGen.convert_from_world_features(*WorldFeatLabelGen.convert_to_world_features(sample, contains_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap))
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    total_time = timedelta(seconds=t_benchmark)
    print("Process time for {} runs: {}, average: {}".format(num_runs,
                                                             total_time,
                                                             total_time / num_runs))
def gen_data(self, dir_in, dir_out=None, file_id_list="", id_list=None, add_deltas=False, return_dict=False):
    """
    Prepare LF0 and V/UV features from audio files.
    If add_delta is false each numpy array has the dimension num_frames x 2 [f0, vuv],
    otherwise the deltas and double deltas are added between the features resulting in
    num_frames x 4 [lf0(3*1), vuv].

    :param dir_in:         Directory where the .wav files are stored for each utterance to process.
    :param dir_out:        Main directory where the labels and normalisation parameters are saved to subdirectories.
                           If None, neither labels nor normalisation parameters are saved.
    :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                           this name to differentiate parameters between subsets.
    :param id_list:        The list of utterances to process.
                           Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                           If None, all .wav files in dir_in are used.
    :param add_deltas:     Add deltas and double deltas to all features except vuv.
    :param return_dict:    If true, returns an OrderedDict of all samples as first output.
    :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                           all processed labels in an OrderedDict followed by the two normalisation parameters.
    """

    # Fill id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
    if id_list is None:
        id_list = [os.path.splitext(os.path.basename(filename))[0]
                   for filename in glob.glob(os.path.join(dir_in, "*.wav"))]
        file_id_list_name = "all"
    else:
        file_id_list_name = os.path.splitext(os.path.basename(file_id_list))[0]

    # Create directories in dir_out if it is given.
    if dir_out is not None:
        if add_deltas:
            makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_deltas))
        else:
            makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_lf0))
            makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_vuv))

    # Create the return dictionary if required.
    if return_dict:
        label_dict = OrderedDict()

    # Create normalisation computation units.
    norm_params_ext_lf0 = MeanStdDevExtractor()
    # norm_params_ext_vuv = MeanStdDevExtractor()
    norm_params_ext_deltas = MeanStdDevExtractor()

    logging.info("Extract WORLD LF0 features for " + "[{0}]".format(", ".join(str(i) for i in id_list)))

    for file_name in id_list:
        logging.debug("Extract WORLD LF0 features from " + file_name)

        # Load audio file and extract features.
        audio_name = os.path.join(dir_in, file_name + ".wav")
        raw, fs = soundfile.read(audio_name)
        _f0, t = pyworld.dio(raw, fs)  # Raw pitch extraction. TODO: Use magphase here?
        f0 = pyworld.stonemask(raw, _f0, t, fs)  # Pitch refinement.

        # Compute lf0 and vuv information.
        # Frames at or below the silence threshold are set to lf0_zero before interpolation.
        lf0 = np.log(f0, dtype=np.float32)
        lf0[lf0 <= math.log(LF0LabelGen.f0_silence_threshold)] = LF0LabelGen.lf0_zero
        lf0, vuv = interpolate_lin(lf0)

        if add_deltas:
            # Compute the deltas and double deltas for all features.
            lf0_deltas, lf0_double_deltas = compute_deltas(lf0)

            # Combine them to a single feature sample.
            labels = np.concatenate((lf0, lf0_deltas, lf0_double_deltas, vuv), axis=1)

            # Save into return dictionary and/or file.
            if return_dict:
                label_dict[file_name] = labels
            if dir_out is not None:
                labels.tofile(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_name + LF0LabelGen.ext_deltas))

            # Add sample to normalisation computation unit.
            norm_params_ext_deltas.add_sample(labels)
        else:
            # Save into return dictionary and/or file.
            if return_dict:
                label_dict[file_name] = np.concatenate((lf0, vuv), axis=1)
            if dir_out is not None:
                lf0.tofile(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_name + LF0LabelGen.ext_lf0))
                vuv.astype(np.float32).tofile(os.path.join(dir_out, LF0LabelGen.dir_vuv, file_name + LF0LabelGen.ext_vuv))

            # Add sample to normalisation computation unit.
            norm_params_ext_lf0.add_sample(lf0)
            # norm_params_ext_vuv.add_sample(vuv)

    # Save mean and std dev of all features (only possible with an output directory,
    # os.path.join would raise a TypeError for dir_out=None) and get the
    # normalisation parameters.
    if not add_deltas:
        if dir_out is not None:
            norm_params_ext_lf0.save(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_id_list_name))
            # norm_params_ext_vuv.save(os.path.join(dir_out, LF0LabelGen.dir_vuv, file_id_list_name))

        norm_lf0 = norm_params_ext_lf0.get_params()
        # norm_vuv = norm_params_ext_vuv.get_params()
        # The vuv feature is appended with fixed parameters 0.0 and 1.0.
        norm_first = np.concatenate((norm_lf0[0], (0.0,)), axis=0)
        norm_second = np.concatenate((norm_lf0[1], (1.0,)), axis=0)
    else:
        # Manually set vuv normalisation parameters. This is done independently of
        # dir_out because it also determines the returned parameters.
        norm_params_ext_deltas.sum_frames[-1] = 0.0  # Mean = 0.0
        norm_params_ext_deltas.sum_squared_frames[-1] = norm_params_ext_deltas.sum_length  # Variance = 1.0
        if dir_out is not None:
            norm_params_ext_deltas.save(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_id_list_name))

        norm_first, norm_second = norm_params_ext_deltas.get_params()

    if return_dict:
        # Return dict of labels for all utterances.
        return label_dict, norm_first, norm_second
    else:
        return norm_first, norm_second