def synthesize(self, file_id_list, synth_output, hparams):
    # Create speaker subdirectories if necessary.
    for id_name in file_id_list:
        path_split = id_name.split(os.sep)  # os.path.split always returns a 2-tuple, so split on the separator instead.
        if len(path_split) > 1:
            makedirs_safe(os.path.join(hparams.synth_dir, *path_split[:-1]))

    if hparams.synth_vocoder == "WORLD":
        self.run_world_synth(synth_output, hparams)
    # elif hparams.synth_vocoder == "STRAIGHT":  # Add further vocoders here.
    elif hparams.synth_vocoder == "r9y9wavenet_quantized_16k_world_feats_English":
        # If no path is given, use the pre-trained model.
        if not hasattr(hparams, "synth_vocoder_path") or hparams.synth_vocoder_path is None:
            parent_dirs = os.path.realpath(__file__).split(os.sep)
            # main_dir is expected to be defined at module level.
            dir_itts = str.join(os.sep, parent_dirs[:parent_dirs.index(main_dir) + 1])
            hparams.synth_vocoder_path = os.path.join(dir_itts, "misc", "pretrained",
                                                      "r9y9wavenet_quantized_16k_world_feats_English.nn")

        # Default quantization is with mu=255.
        if not hasattr(hparams, "mu") or hparams.mu is None:
            hparams.mu = 255

        # The pre-trained vocoder runs at 16 kHz; save and restore the original output rate.
        org_frame_rate_output_Hz = hparams.frame_rate_output_Hz if hasattr(hparams, 'frame_rate_output_Hz') else None
        hparams.frame_rate_output_Hz = 16000
        self.run_r9y9wavenet_quantized_16k_world_feats_synth(synth_output, hparams)
        hparams.frame_rate_output_Hz = org_frame_rate_output_Hz
def save_to_file(self, filename):
    if self.plt is None:
        logging.error("No generated plot exists, please run 'gen_plot()' first.")
    else:
        makedirs_safe(os.path.dirname(filename))
        self.plt.savefig(filename, bbox_inches=0)
        logging.info("Figure saved as " + filename)
def synth(self, hparams, ids_input):
    assert hparams.synth_dir is not None  # Directory to store the generated audio files has to be set.
    makedirs_safe(hparams.synth_dir)

    id_list = ModelTrainer._input_to_str_list(ids_input)
    self.logger.info("Start synthesising [{0}]".format(", ".join(str(i) for i in id_list)))
    t_start = timer()
    model_output, model_output_post = self._forward_batched(hparams,
                                                            id_list,
                                                            hparams.batch_size_synth,
                                                            load_target=False,
                                                            synth=True,
                                                            benchmark=False,
                                                            gen_figure=hparams.synth_gen_figure)
    t_synth = timer() - t_start  # Renamed from t_training; it measures synthesis time.
    self.logger.info('Synthesis time for {} sample(s): {}'.format(len(id_list), timedelta(seconds=t_synth)))

    return model_output, model_output_post
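# A minimal usage sketch for synth (hypothetical hparams values; assumes `trainer` is an
# initialised ModelTrainer subclass and the listed utterance ids exist in the database):
#
#     hparams.synth_dir = os.path.join(hparams.out_dir, "synth")
#     hparams.batch_size_synth = 1
#     hparams.synth_gen_figure = False
#     _, output_post = trainer.synth(hparams, ["p225_001", "p225_002"])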
def gen_data(dir_in, file_questions, dir_out=None, file_id_list=None, id_list=None, return_dict=False):
    """
    Generate question labels from HTK labels.

    :param dir_in:         Directory containing the HTK labels.
    :param file_questions: Full file path to the question file.
    :param dir_out:        Directory to store the question labels. If None, labels are not saved.
    :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                           this name to differentiate parameters between subsets.
    :param id_list:        The list of utterances to process. Should have the form uttId1 \\n uttId2 \\n ... \\n uttIdN.
                           If None, all .lab files in dir_in are used.
    :param return_dict:    If True, returns an OrderedDict of all samples as first output.
    :return:               Returns two normalisation parameters as tuple. If return_dict is True, it returns
                           all processed labels in an OrderedDict followed by the two normalisation parameters.
    """
    # Fill id_list by .lab files in dir_in if not given and set an appropriate file_id_list_name.
    if id_list is None:
        id_list = list()
        filenames = glob.glob(os.path.join(dir_in, "*.lab"))
        for filename in filenames:
            id_list.append(os.path.splitext(os.path.basename(filename))[0])
        file_id_list_name = "all"
    else:
        file_id_list_name = os.path.splitext(os.path.basename(file_id_list))[0]
        id_list = ['{}'.format(os.path.basename(element)) for element in id_list]  # Ignore full path.

    # Create directories in dir_out if it is given.
    if dir_out is not None:
        makedirs_safe(dir_out)

    # Get the question generation class.
    label_operator = HTSLabelNormalisation(file_questions)
    if return_dict:
        label_dict, norm_params = label_operator.perform_normalisation(file_id_list_name, id_list, dir_in, dir_out,
                                                                       return_dict=True)
        # norm_params = (samples_min, samples_max)
        return label_dict, norm_params[0], norm_params[1]
    else:
        norm_params = label_operator.perform_normalisation(file_id_list_name, id_list, dir_in, dir_out,
                                                           return_dict=False)
        return norm_params[0], norm_params[1]
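# A hedged usage sketch (hypothetical paths and class name; assumes this function is
# exposed as a static method of a question label generator class, here called
# QuestionLabelGen, and that the question file follows the HTS format):
#
#     norm_min, norm_max = QuestionLabelGen.gen_data(
#         dir_in="database/labels/label_state_align",
#         file_questions="questions/questions-en-radio_dnn_416.hed",
#         dir_out="database/questions",
#         file_id_list="database/file_id_list.txt")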
def init(self, hparams):
    # Create and initialize model.
    self.logger.info("Create ModelHandler.")
    self.model_handler = ModelHandlerPyTorch(hparams)
    self.logger.info("Model handler ready.")
    self.logger.info("CPU memory: " + str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e3) + " MB.")
    if hparams.use_gpu:
        self.logger.info("GPU memory: " + str(get_gpu_memory_map()) + " MB.")

    # Create the necessary directories.
    makedirs_safe(os.path.join(hparams.out_dir, hparams.networks_dir, hparams.checkpoints_dir))
    # Create the default model path if not set.
    if hparams.model_dir is None:
        hparams.model_dir = os.path.join(hparams.out_dir, hparams.networks_dir)
    model_path = os.path.join(hparams.model_dir, hparams.model_name)

    if hparams.epochs <= 0:
        # Try to load the model. If it doesn't exist, create a new one and save it.
        # Return the loaded/created model, because no training was requested.
        try:
            self.model_handler.load_model(model_path,
                                          hparams.use_gpu,
                                          hparams.optimiser_args["lr"]
                                          if hparams.optimiser_args["lr"] is not None
                                          else hparams.learning_rate)
        except FileNotFoundError:
            if hparams.model_type is None:
                self.logger.error("Model does not exist at {} and you didn't give model_type to create a new one."
                                  .format(model_path))
                raise  # This will rethrow the last exception.
            else:
                self.logger.warning("Model does not exist at {}. Creating a new one instead and saving it."
                                    .format(model_path))
                dim_in, dim_out = self.dataset_train.get_dims()
                self.model_handler.create_model(hparams, dim_in, dim_out)
                self.model_handler.save_model(model_path)

        self.logger.info("Model ready.")
        return self.model_handler

    if hparams.model_type is None:
        self.model_handler.load_model(model_path,
                                      hparams.use_gpu,
                                      hparams.optimiser_args["lr"]
                                      if hparams.optimiser_args["lr"] is not None
                                      else hparams.learning_rate)
    else:
        dim_in, dim_out = self.dataset_train.get_dims()
        self.model_handler.create_model(hparams, dim_in, dim_out)

    self.logger.info("Model ready.")
def run_world_synth(self, synth_output, hparams):
    """Run the WORLD vocoder to synthesise the given features."""
    fft_size = pyworld.get_cheaptrick_fft_size(hparams.synth_fs)

    save_dir = hparams.synth_dir if hparams.synth_dir is not None \
        else hparams.out_dir if hparams.out_dir is not None \
        else os.path.curdir
    for id_name, output in synth_output.items():
        logging.info("Synthesise {} with the WORLD vocoder.".format(id_name))

        coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(output,
                                                                              contains_deltas=False,
                                                                              num_coded_sps=hparams.num_coded_sps)
        ln_sp = pysptk.mgc2sp(np.ascontiguousarray(coded_sp, dtype=np.float64),
                              alpha=WorldFeatLabelGen.mgc_alpha,
                              gamma=0.0,
                              fftlen=fft_size)
        # sp = np.exp(sp.real * 2.0)
        # sp.imag = sp.imag * 180.0 / np.pi
        sp = np.exp(ln_sp.real)
        sp = np.power(sp / 32768.0, 2)
        # sp = np.exp(np.power(sp.real, 2))
        # sp = pyworld.decode_spectral_envelope(np.ascontiguousarray(coded_sp, np.float64), self.synth_fs, fft_size)  # Cepstral version.

        f0 = np.exp(lf0, dtype=np.float64)
        vuv[f0 < WorldFeatLabelGen.f0_silence_threshold] = 0  # WORLD throws an error for too small f0 values.
        f0[vuv == 0] = 0.0
        ap = pyworld.decode_aperiodicity(np.ascontiguousarray(bap.reshape(-1, 1), np.float64),
                                         hparams.synth_fs,
                                         fft_size)

        waveform = pyworld.synthesize(f0, sp, ap, hparams.synth_fs)
        waveform = waveform.astype(np.float32, copy=False)  # Does in-place conversion, if possible.

        # Always save as wav file first and convert afterwards if necessary.
        # Note: the format string had three placeholders but five arguments; fixed to five placeholders.
        wav_file_path = os.path.join(save_dir,
                                     "{}{}{}{}{}".format(os.path.basename(id_name),
                                                         "_" + hparams.model_name if hparams.model_name is not None else "",
                                                         hparams.synth_file_suffix,
                                                         "_WORLD",
                                                         ".wav"))
        makedirs_safe(save_dir)  # Use save_dir consistently; wav_file_path is created inside it.
        soundfile.write(wav_file_path, waveform, hparams.synth_fs)

        # Use PyDub for special audio formats.
        if hparams.synth_ext.lower() != 'wav':
            as_wave = pydub.AudioSegment.from_wav(wav_file_path)
            as_wave.export(os.path.join(save_dir, id_name + "." + hparams.synth_ext), format=hparams.synth_ext)
            os.remove(wav_file_path)
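# A hedged example of calling run_world_synth directly (hypothetical values; assumes
# `predicted_features` is a num_frames x (num_coded_sps + 3) matrix of WORLD features
# without deltas, as produced by WorldFeatLabelGen.gen_data):
#
#     hparams.synth_fs = 16000
#     hparams.num_coded_sps = 60
#     hparams.synth_ext = "wav"
#     trainer.run_world_synth({"p225_001": predicted_features}, hparams)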
parent_dirs = os.path.realpath(__file__).split(os.sep)
dir_itts = str.join(os.sep, parent_dirs[:parent_dirs.index("IdiapTTS") + 1])
sys.path.append(dir_itts)  # Adds the IdiapTTS folder to the path, required to work on the grid.

from misc.utils import makedirs_safe

"""
Script that down-samples a list of audio files.

python down_sampling.py <dir_audio> <dir_out> <file_id_list> <target_sampling_rate>
"""

# Read which files to process.
dir_audio = sys.argv[1]
dir_out = sys.argv[2]
file_id_list = sys.argv[3]
target_sampling_rate = int(sys.argv[4])

with open(file_id_list) as f:
    id_list = f.readlines()
# Trim entries in-place.
id_list[:] = [s.strip(' \t\n\r') for s in id_list]

for file_id in id_list:
    full_path_in = os.path.join(dir_audio, file_id + ".wav")
    print("Downsample " + full_path_in)
    sound = AudioSegment.from_file(full_path_in)
    sound = sound.set_frame_rate(target_sampling_rate)  # Was hard-coded to 16000; use the requested target rate.
    full_path_out = os.path.join(dir_out, file_id + ".wav")
    makedirs_safe(os.path.dirname(full_path_out))
    sound.export(full_path_out, format="wav")
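# Example invocation (hypothetical paths), following the usage line documented above:
#
#     python down_sampling.py database/wav database/wav_16kHz database/file_id_list.txt 16000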
def gen_data(self, dir_in, dir_out=None, file_id_list=None, id_list=None, return_dict=False):
    """
    Prepare atom labels from wav files. If id_list is not None, only the ids listed there are generated,
    otherwise one for each .wav file in dir_in. Atoms are computed by the wcad algorithm. Utterances with
    50 or more atoms, or with an atom amplitude above 10, are flagged as possible extraction failures and
    excluded from the normalisation statistics. A new file_id_list containing only the successfully
    processed utterances is written next to dir_in; the current file_id_list is not substituted by it.
    The algorithm also saves the extracted phrase component in dir_out/id_name.phrase, if dir_out is not None.

    :param dir_in:       Directory containing the original wav files.
    :param dir_out:      Directory where the labels are stored. If None, no labels are stored.
    :param file_id_list: Name of the file containing the ids. Normalisation parameters are saved using
                         this name to differentiate parameters between subsets.
    :param id_list:      The list of utterances to process. Should have the form uttId1 \\n uttId2 \\n ... \\n uttIdN.
                         If None, all wav files in dir_in are used.
    :param return_dict:  If True, returns an OrderedDict of all samples as first output.
    :return:             Returns mean=0.0, std_dev, min, max of atoms.
    """
    # Fill id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
    if id_list is None:
        id_list = list()
        filenames = glob.glob(os.path.join(dir_in, "*.wav"))
        for filename in filenames:
            id_list.append(os.path.splitext(os.path.basename(filename))[0])
        file_id_list_name = "all"
    else:
        file_id_list_name = os.path.splitext(os.path.basename(file_id_list))[0]

    if dir_out is not None:
        makedirs_safe(dir_out)

    if return_dict:
        label_dict = OrderedDict()
    mean_std_ext_atom = MeanStdDevExtractor()
    min_max_ext_atom = MinMaxExtractor()
    mean_std_ext_phrase = MeanStdDevExtractor()
    min_max_ext_phrase = MinMaxExtractor()

    # Compute atoms.
    from wcad import (WaveInput, PitchExtractor, MultiphraseExtractor, DictionaryGenerator,
                      AtomExtrator, ModelCreator, ModelSaver, Params, Paths)
    correct_utts = list()
    self.logger.info("Create atom labels for [{0}]".format(", ".join(str(i) for i in id_list)))
    for id_name in id_list:
        self.logger.debug("Create atom labels for " + id_name)

        # wcad has to be called from its root directory, therefore a change dir operation is necessary.
        cwd = os.getcwd()
        os.chdir(self.wcad_root)
        args = [os.path.join(dir_in, id_name + ".wav"), dir_out]
        params = Params()
        # Overwrite the possible theta values by the selected values.
        params.local_atoms_thetas = self.theta_interval
        params.k = [self.k]
        # params.min_atom_amp = 0.1
        paths = Paths(args, params)

        # Start the extraction process.
        start_t = time.time()
        waveform = WaveInput(paths.wav, params).read()
        pitch = PitchExtractor(waveform, params, paths).compute()
        # Compute the phrase component.
        phrase = MultiphraseExtractor(pitch, waveform, params, paths).compute()
        phrase_curve = phrase.curve
        # Extract atoms.
        dictionary = DictionaryGenerator(params, paths).compute()
        atoms = AtomExtrator(waveform, pitch, phrase, dictionary, params, paths).compute()
        # Create a model.
        model = ModelCreator(phrase, atoms, pitch).compute()
        self.logger.debug('Model created in %s seconds' % (time.time() - start_t))
        # Save the atoms.
        ModelSaver(model, params, paths).save()
        os.chdir(cwd)

        # Check if the output can be correct.
        possible_extraction_failure = False
        if len(atoms) < 50 and not any(a.amp > 10 for a in atoms):
            correct_utts.append(id_name)
        else:
            self.logger.warning("Possible fail of atom extractor for " + id_name
                                + " (atoms: " + str(len(atoms))
                                + ", frames: " + str(len(phrase_curve))
                                + ", max: " + str(max(a.amp for a in atoms)) + ").")
            possible_extraction_failure = True

        atoms.sort(key=lambda x: x.position)
        # print_atoms(atoms)

        # Get the audio length needed to trim the atoms.
        duration = self.get_audio_length(id_name, dir_in, self.frame_size_ms)

        # The algorithm generates a few atoms at negative positions,
        # pad them into the first atom at a positive position.
        padded_amp = 0
        padded_theta = 0
        for idx, atom in enumerate(atoms):
            if atom.position < 0:
                padded_amp += atom.amp
                padded_theta += atom.theta
            else:
                atoms[idx].amp += padded_amp  # Pad the amplitude.
                atoms[idx].theta = (atoms[idx].theta + padded_theta) / (idx + 1)  # Average the thetas.
                del atoms[:idx]  # Remove the negative atoms from the list.
                break
        # print_atoms(atoms)

        # The algorithm might also generate a few atoms beyond the last label,
        # pad them into the last atom within the audio length.
        padded_amp = 0
        padded_theta = 0
        for idx, atom in reversed(list(enumerate(atoms))):
            if atom.position * self.frame_size_ms > duration:
                padded_amp += atom.amp
                padded_theta += atom.theta
            else:
                atoms[idx].amp += padded_amp
                atoms[idx].theta = (atoms[idx].theta + padded_theta) / (len(atoms) - idx)
                atoms = atoms[:-(len(atoms) - idx - 1) or None]  # Remove atoms beyond the last label.
                break
        # print_atoms(atoms)

        # Create a label for each frame (of size frame_size_ms) with amplitude and theta of the contained atoms.
        np_atom_labels = AtomLabelGen.atoms_to_labels(atoms, self.theta_interval,
                                                      int(duration / self.frame_size_ms))
        np_atom_amps = np.sum(np_atom_labels, axis=1)

        if not possible_extraction_failure:  # Only add successful extractions to mean and std_dev computation.
            mean_std_ext_atom.add_sample(np_atom_amps[np_atom_amps[:, 0] != 0.0])  # Only compute std_dev from atoms.
            min_max_ext_atom.add_sample(np_atom_amps)
            # mean_std_ext_phrase.add_sample(phrase_curve)
            # min_max_ext_phrase.add_sample(phrase_curve)

        if return_dict:
            label_dict[id_name] = np_atom_labels
        if dir_out is not None:
            # Save the phrase component, because it might be used in synthesis.
            phrase_curve.astype('float32').tofile(os.path.join(dir_out, id_name + self.ext_phrase))
            # Save the atoms in binary format (float32).
            np_atom_labels.astype('float32').tofile(os.path.join(dir_out, id_name + self.ext_atoms))
            # Create a readable version of the atom data.
            # np.savetxt(os.path.join(dir_out, id_name + self.ext_atoms + ".txt"), np_atom_labels)

    # Manually set the mean of the atoms to 0, otherwise frames without an atom would have an amplitude.
    mean_std_ext_atom.sum_frames[:] = 0.0
    mean_std_ext_atom.sum_squared_frames[1] = mean_std_ext_atom.sum_length * self.theta_interval[-1]
    mean_std_ext_atom.save(os.path.join(dir_out, file_id_list_name))
    min_max_ext_atom.save(os.path.join(dir_out, file_id_list_name))
    # mean_std_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))
    # min_max_ext_phrase.save(os.path.join(dir_out, file_id_list_name + '-phrase'))

    mean_atoms, std_atoms = mean_std_ext_atom.get_params()
    min_atoms, max_atoms = min_max_ext_atom.get_params()
    # mean_phrase, std_phrase = mean_std_ext_phrase.get_params()
    # min_phrase, max_phrase = min_max_ext_atom.get_params()

    # Save the part of the file_id_list for which atom extraction was successful into a new file.
    if correct_utts:
        with open(os.path.join(os.path.dirname(dir_in),
                               "wcad_" + os.path.basename(file_id_list_name) + ".txt"), 'w') as f:
            f.write('\n'.join(correct_utts) + '\n')

    if return_dict:
        # Return the dict of labels for all utterances.
        return label_dict, \
            mean_atoms, std_atoms, \
            min_atoms, max_atoms
        # mean_phrase, std_phrase, \
        # min_phrase, max_phrase
    else:
        return mean_atoms, std_atoms, \
            min_atoms, max_atoms
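# A minimal usage sketch (hypothetical paths; assumes this method belongs to AtomLabelGen
# and that wcad_root, theta_interval, k, and frame_size_ms were set in the constructor):
#
#     generator = AtomLabelGen(...)  # Constructor arguments elided on purpose.
#     mean_atoms, std_atoms, min_atoms, max_atoms = generator.gen_data(
#         dir_in="database/wav",
#         dir_out="database/atoms",
#         file_id_list="database/file_id_list.txt")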
def process_file(self, file, dir_wav, dir_out, audio_format="wav", silence_threshold_db=-50, chunk_size_ms=10):
    sound = AudioSegment.from_file(os.path.join(dir_wav, file), format=audio_format)
    trim_start = self._detect_leading_silence(sound, silence_threshold_db, chunk_size_ms)
    trim_end = self._detect_leading_silence(sound.reverse(), silence_threshold_db, chunk_size_ms)

    # Add silence to the front if the audio starts too early.
    if trim_start < self.min_silence_ms:
        # TODO: Find a robust way to create silence so that HTK alignment still works (maybe concat mirrored segments).
        logging.warning("File {} has only {} ms of silence in the beginning.".format(file, trim_start))
        # AudioSegment.silent(duration=self.min_silence_ms - trim_start)
        # if trim_start > 0:
        #     silence = (sound[:trim_start] * (math.ceil(self.min_silence_ms / trim_start) - 1))[:self.min_silence_ms - trim_start]
        #     sound = silence + sound
        # elif trim_end > 0:
        #     silence = (sound[-trim_end:] * (math.ceil(self.min_silence_ms / trim_end) - 1))[:self.min_silence_ms - trim_end]
        #     sound = silence + sound
        # else:
        #     self.logger.warning("Cannot append silence to the front of " + file + ". No silence exists at front or end which can be copied.")
        trim_start = 0
    else:
        trim_start -= self.min_silence_ms

    # Append silence if the audio ends too late.
    if trim_end < self.min_silence_ms:
        logging.warning("File {} has only {} ms of silence in the end.".format(file, trim_end))
        # silence = AudioSegment.silent(duration=self.min_silence_ms - trim_end)
        # if trim_end > 0:
        #     silence = (sound[-trim_end:] * (math.ceil(self.min_silence_ms / trim_end) - 1))[:self.min_silence_ms - trim_end]
        #     sound = sound + silence
        # elif trim_start > 0:
        #     silence = (sound[:trim_start] * (math.ceil(self.min_silence_ms / trim_start) - 1))[:self.min_silence_ms - trim_start]
        #     sound = sound + silence
        # else:
        #     self.logger.warning("Cannot append silence to the end of " + file + ". No silence exists at front or end which can be copied.")
        trim_end = 0
    else:
        trim_end -= self.min_silence_ms

    # Trim the sound. Slice relative to the length so that trim_end == 0 keeps the full tail
    # (the previous sound[trim_start:-trim_end - 1] always cut one extra millisecond).
    trimmed_sound = sound[trim_start:len(sound) - trim_end]

    # Save the trimmed sound to file.
    out_file = os.path.join(dir_out, file)
    makedirs_safe(os.path.dirname(out_file))
    trimmed_sound.export(out_file, format=audio_format)

    return trimmed_sound
def main():
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("-w", "--dir_wav", help="Directory containing the wav files.",
                        type=str, dest="dir_wav", required=True)
    parser.add_argument("-o", "--dir_out", help="Directory to save the trimmed files.",
                        type=str, dest="dir_out", required=True)
    parser.add_argument("-f", "--file_id_list", help="Full path to the file containing the ids.",
                        type=str, dest="file_id_list", required=True)
    parser.add_argument("--format", help="Format of the audio files, e.g. wav.",
                        type=str, dest="format", required=False, default='wav')
    parser.add_argument("--silence_db", help="Threshold in dB below which a frame is considered to be silent.",
                        type=int, dest="silence_threshold_db", required=False, default=-50)
    parser.add_argument("--chunk_size", help="Size of the chunks (frame size) in ms on which the dB value is computed.",
                        type=int, dest="chunk_size_ms", required=False, default=10)
    parser.add_argument("--min_silence_ms",
                        help="Milliseconds of silence which are always kept in front and back of the audio file.",
                        type=int, dest="min_silence_ms", required=False)

    # Parse arguments.
    args = parser.parse_args()

    # Read which files to process.
    with open(args.file_id_list) as f:
        id_list = f.readlines()
    # Trim entries in-place.
    id_list[:] = [s.strip(' \t\n\r') for s in id_list]

    # Create the output directory if missing.
    makedirs_safe(args.dir_out)

    # Start the silence removal.
    silence_remover = SilenceRemover()
    if args.min_silence_ms is not None:
        silence_remover.min_silence_ms = args.min_silence_ms
    silence_remover.process_list(id_list, args.dir_wav, args.dir_out, args.format,
                                 args.silence_threshold_db, args.chunk_size_ms)
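# Example invocation (hypothetical script name and paths) using the flags defined above:
#
#     python silence_remover.py --dir_wav database/wav --dir_out database/wav_trimmed \
#         --file_id_list database/file_id_list.txt --silence_db -50 --chunk_size 10 \
#         --min_silence_ms 200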
def gen_data(self, dir_in, dir_out=None, file_id_list=None, id_list=None, add_deltas=False, return_dict=False):
    """
    Prepare WORLD features from audio files. If add_deltas is False, the labels have the dimension
    num_frames x (num_coded_sps + 3) [mgc(num_coded_sps), lf0, vuv, bap(1)], otherwise deltas and double
    deltas are added between the features, resulting in num_frames x (3*num_coded_sps + 7)
    [mgc(3*num_coded_sps), lf0(3*1), vuv, bap(3*1)].

    :param dir_in:       Directory where the .wav files are stored for each utterance to process.
    :param dir_out:      Main directory where the labels and normalisation parameters are saved to subdirectories.
                         If None, labels are not saved.
    :param file_id_list: Name of the file containing the ids. Normalisation parameters are saved using
                         this name to differentiate parameters between subsets.
    :param id_list:      The list of utterances to process. Should have the form uttId1 \\n uttId2 \\n ... \\n uttIdN.
                         If None, all .wav files in dir_in are used.
    :param add_deltas:   Add deltas and double deltas to all features except vuv.
    :param return_dict:  If True, returns an OrderedDict of all samples as first output.
    :return:             Returns two normalisation parameters as tuple. If return_dict is True, it returns
                         all processed labels in an OrderedDict followed by the two normalisation parameters.
    """
    # Fill id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
    if id_list is None:
        id_list = list()
        filenames = glob.glob(os.path.join(dir_in, "*.wav"))
        for filename in filenames:
            id_list.append(os.path.splitext(os.path.basename(filename))[0])
        file_id_list_name = "all"
    else:
        file_id_list_name = os.path.splitext(os.path.basename(file_id_list))[0]

    # Create directories in dir_out if it is given.
    if dir_out is not None:
        if add_deltas:
            makedirs_safe(os.path.join(dir_out, self.dir_deltas))
        else:
            makedirs_safe(os.path.join(dir_out, self.dir_lf0))
            makedirs_safe(os.path.join(dir_out, self.dir_vuv))
            makedirs_safe(os.path.join(dir_out, self.dir_coded_sps))
            makedirs_safe(os.path.join(dir_out, self.dir_bap))

    # Create the return dictionary if required.
    if return_dict:
        label_dict = OrderedDict()

    if add_deltas:
        # Create normalisation computation units.
        norm_params_ext_coded_sp = MeanCovarianceExtractor()
        norm_params_ext_lf0 = MeanCovarianceExtractor()
        norm_params_ext_bap = MeanCovarianceExtractor()
    else:
        # Create normalisation computation units.
        norm_params_ext_coded_sp = MeanStdDevExtractor()
        norm_params_ext_lf0 = MeanStdDevExtractor()
        # norm_params_ext_vuv = MeanStdDevExtractor()
        norm_params_ext_bap = MeanStdDevExtractor()

    logging.info("Extract WORLD{} features for [{}]".format("" if not add_deltas else " deltas",
                                                            ", ".join(str(i) for i in id_list)))
    for file_name in id_list:
        # Load the audio file and extract features.
        audio_name = os.path.join(dir_in, file_name + ".wav")
        raw, fs = soundfile.read(audio_name)
        logging.debug("Extract WORLD{} features from {} at {} Hz.".format(
            "" if not add_deltas else " deltas", file_name, fs))
        f0, sp, ap = pyworld.wav2world(raw, fs)
        file_name = os.path.basename(file_name)  # Remove the speaker directory from the id.

        # Compute lf0 and vuv information.
        lf0 = np.log(f0.clip(min=1E-10), dtype=np.float32)
        lf0[lf0 <= math.log(self.f0_silence_threshold)] = self.lf0_zero
        lf0, vuv = interpolate_lin(lf0)
        lf0 = lf0.astype(dtype=np.float32)
        vuv = vuv.astype(dtype=np.float32)

        # Throw a warning when less than 5% of all frames are voiced.
        if vuv.sum() / len(vuv) < 0.05:
            self.logger.warning("Detected only {:.0f}% [{}/{}] voiced frames in {}.".format(
                vuv.sum() / len(vuv) * 100.0, int(vuv.sum()), len(vuv), file_name))

        # Encode the spectrum into a lower dimension and the aperiodicity into one band aperiodicity.
        # coded_sp = pyworld.code_spectral_envelope(sp, fs, WorldFeatLabelGen.num_coded_sps)  # Cepstral version.
        coded_sp = np.sqrt(sp) * 32768.0
        coded_sp = np.array(pysptk.mcep(coded_sp,
                                        order=self.num_coded_sps - 1,
                                        alpha=self.mgc_alpha,
                                        eps=1.0e-8, min_det=0.0, etype=1, itype=3),
                            dtype=np.float32)
        bap = np.array(pyworld.code_aperiodicity(ap, fs), dtype=np.float32)

        if add_deltas:
            # Compute the deltas and double deltas for all features.
            lf0_deltas, lf0_double_deltas = compute_deltas(lf0)
            coded_sp_deltas, coded_sp_double_deltas = compute_deltas(coded_sp)
            bap_deltas, bap_double_deltas = compute_deltas(bap)

            coded_sp = np.concatenate((coded_sp, coded_sp_deltas, coded_sp_double_deltas), axis=1)
            lf0 = np.concatenate((lf0, lf0_deltas, lf0_double_deltas), axis=1)
            bap = np.concatenate((bap, bap_deltas, bap_double_deltas), axis=1)

            # Combine them to a single feature sample.
            labels = np.concatenate((coded_sp, lf0, vuv, bap), axis=1)

            # Save into the return dictionary and/or file.
            if return_dict:
                label_dict[file_name] = labels
            if dir_out is not None:
                labels.tofile(os.path.join(dir_out, self.dir_deltas, file_name + self.ext_deltas))
        else:
            # Save into the return dictionary and/or file.
            if return_dict:
                label_dict[file_name] = np.concatenate((coded_sp, lf0, vuv, bap), axis=1)
            if dir_out is not None:
                coded_sp.tofile(os.path.join(dir_out, self.dir_coded_sps, file_name + self.ext_coded_sp))
                lf0.tofile(os.path.join(dir_out, self.dir_lf0, file_name + self.ext_lf0))
                vuv.astype(np.float32).tofile(os.path.join(dir_out, self.dir_vuv, file_name + self.ext_vuv))
                bap.tofile(os.path.join(dir_out, self.dir_bap, file_name + self.ext_bap))

        # Add the sample to the normalisation computation units.
        norm_params_ext_coded_sp.add_sample(coded_sp)
        norm_params_ext_lf0.add_sample(lf0)
        # norm_params_ext_vuv.add_sample(vuv)
        norm_params_ext_bap.add_sample(bap)

    # Save mean and std dev of all features.
    if not add_deltas:
        norm_params_ext_coded_sp.save(os.path.join(dir_out, self.dir_coded_sps, file_id_list_name))
        norm_params_ext_lf0.save(os.path.join(dir_out, self.dir_lf0, file_id_list_name))
        # norm_params_ext_vuv.save(os.path.join(dir_out, self.dir_vuv, file_id_list_name))
        norm_params_ext_bap.save(os.path.join(dir_out, self.dir_bap, file_id_list_name))
    else:
        self.logger.info("Write norm params to {}".format(
            os.path.join(dir_out, self.dir_deltas, "_".join((file_id_list_name, self.dir_coded_sps)))))
        norm_params_ext_coded_sp.save(os.path.join(dir_out, self.dir_deltas,
                                                   "_".join((file_id_list_name, self.dir_coded_sps))))
        norm_params_ext_lf0.save(os.path.join(dir_out, self.dir_deltas,
                                              "_".join((file_id_list_name, self.dir_lf0))))
        norm_params_ext_bap.save(os.path.join(dir_out, self.dir_deltas,
                                              "_".join((file_id_list_name, self.dir_bap))))

    # Get the normalisation parameters.
    if not add_deltas:
        norm_coded_sp = norm_params_ext_coded_sp.get_params()
        norm_lf0 = norm_params_ext_lf0.get_params()
        # norm_vuv = norm_params_ext_vuv.get_params()
        norm_bap = norm_params_ext_bap.get_params()

        norm_first = np.concatenate((norm_coded_sp[0], norm_lf0[0], (0.0,), norm_bap[0]), axis=0)
        norm_second = np.concatenate((norm_coded_sp[1], norm_lf0[1], (1.0,), norm_bap[1]), axis=0)
    else:
        norm_coded_sp = norm_params_ext_coded_sp.get_params()
        norm_lf0 = norm_params_ext_lf0.get_params()
        # norm_vuv = norm_params_ext_vuv.get_params()
        norm_bap = norm_params_ext_bap.get_params()

        norm_first = (norm_coded_sp[0], norm_lf0[0], (0.0,), norm_bap[0])
        norm_second = (norm_coded_sp[1], norm_lf0[1], (1.0,), norm_bap[1])

    if return_dict:
        # Return the dict of labels for all utterances.
        return label_dict, norm_first, norm_second
    else:
        return norm_first, norm_second
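# A hedged usage sketch (hypothetical paths; assumes this method belongs to
# WorldFeatLabelGen with num_coded_sps and mgc_alpha configured on the instance):
#
#     generator = WorldFeatLabelGen(...)  # Constructor arguments elided on purpose.
#     norm_first, norm_second = generator.gen_data(
#         dir_in="database/wav",
#         dir_out="database/WORLD",
#         file_id_list="database/file_id_list.txt",
#         add_deltas=True)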
def gen_data(self, dir_in, dir_out=None, file_id_list=None, id_list=None, add_deltas=False, return_dict=False):
    """
    Prepare LF0 and V/UV features from audio files. If add_deltas is False, each numpy array has the
    dimension num_frames x 2 [lf0, vuv], otherwise deltas and double deltas are added between the
    features, resulting in num_frames x 4 [lf0(3*1), vuv].

    :param dir_in:       Directory where the .wav files are stored for each utterance to process.
    :param dir_out:      Main directory where the labels and normalisation parameters are saved to subdirectories.
                         If None, labels are not saved.
    :param file_id_list: Name of the file containing the ids. Normalisation parameters are saved using
                         this name to differentiate parameters between subsets.
    :param id_list:      The list of utterances to process. Should have the form uttId1 \\n uttId2 \\n ... \\n uttIdN.
                         If None, all .wav files in dir_in are used.
    :param add_deltas:   Add deltas and double deltas to all features except vuv.
    :param return_dict:  If True, returns an OrderedDict of all samples as first output.
    :return:             Returns two normalisation parameters as tuple. If return_dict is True, it returns
                         all processed labels in an OrderedDict followed by the two normalisation parameters.
    """
    # Fill id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
    if id_list is None:
        id_list = list()
        filenames = glob.glob(os.path.join(dir_in, "*.wav"))
        for filename in filenames:
            id_list.append(os.path.splitext(os.path.basename(filename))[0])
        file_id_list_name = "all"
    else:
        file_id_list_name = os.path.splitext(os.path.basename(file_id_list))[0]

    # Create directories in dir_out if it is given.
    if dir_out is not None:
        if add_deltas:
            makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_deltas))
        else:
            makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_lf0))
            makedirs_safe(os.path.join(dir_out, LF0LabelGen.dir_vuv))

    # Create the return dictionary if required.
    if return_dict:
        label_dict = OrderedDict()

    # Create normalisation computation units.
    norm_params_ext_lf0 = MeanStdDevExtractor()
    # norm_params_ext_vuv = MeanStdDevExtractor()
    norm_params_ext_deltas = MeanStdDevExtractor()

    logging.info("Extract WORLD LF0 features for [{0}]".format(", ".join(str(i) for i in id_list)))
    for file_name in id_list:
        logging.debug("Extract WORLD LF0 features from " + file_name)

        # Load the audio file and extract features.
        audio_name = os.path.join(dir_in, file_name + ".wav")
        raw, fs = soundfile.read(audio_name)
        _f0, t = pyworld.dio(raw, fs)  # Raw pitch extraction. TODO: Use magphase here?
        f0 = pyworld.stonemask(raw, _f0, t, fs)  # Pitch refinement.

        # Compute lf0 and vuv information.
        lf0 = np.log(f0.clip(min=1E-10), dtype=np.float32)  # Clip to avoid log(0), as in the WORLD feature extraction.
        lf0[lf0 <= math.log(LF0LabelGen.f0_silence_threshold)] = LF0LabelGen.lf0_zero
        lf0, vuv = interpolate_lin(lf0)

        if add_deltas:
            # Compute the deltas and double deltas for all features.
            lf0_deltas, lf0_double_deltas = compute_deltas(lf0)

            # Combine them to a single feature sample.
            labels = np.concatenate((lf0, lf0_deltas, lf0_double_deltas, vuv), axis=1)

            # Save into the return dictionary and/or file.
            if return_dict:
                label_dict[file_name] = labels
            if dir_out is not None:
                labels.tofile(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_name + LF0LabelGen.ext_deltas))

            # Add the sample to the normalisation computation unit.
            norm_params_ext_deltas.add_sample(labels)
        else:
            # Save into the return dictionary and/or file.
            if return_dict:
                label_dict[file_name] = np.concatenate((lf0, vuv), axis=1)
            if dir_out is not None:
                lf0.tofile(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_name + LF0LabelGen.ext_lf0))
                vuv.astype(np.float32).tofile(os.path.join(dir_out, LF0LabelGen.dir_vuv,
                                                           file_name + LF0LabelGen.ext_vuv))

            # Add the sample to the normalisation computation unit.
            norm_params_ext_lf0.add_sample(lf0)
            # norm_params_ext_vuv.add_sample(vuv)

    # Save mean and std dev of all features.
    if not add_deltas:
        norm_params_ext_lf0.save(os.path.join(dir_out, LF0LabelGen.dir_lf0, file_id_list_name))
        # norm_params_ext_vuv.save(os.path.join(dir_out, LF0LabelGen.dir_vuv, file_id_list_name))
    else:
        # Manually set the vuv normalisation parameters before saving.
        norm_params_ext_deltas.sum_frames[-1] = 0.0  # Mean = 0.0
        norm_params_ext_deltas.sum_squared_frames[-1] = norm_params_ext_deltas.sum_length  # Variance = 1.0
        norm_params_ext_deltas.save(os.path.join(dir_out, LF0LabelGen.dir_deltas, file_id_list_name))

    # Get the normalisation parameters.
    if not add_deltas:
        norm_lf0 = norm_params_ext_lf0.get_params()
        # norm_vuv = norm_params_ext_vuv.get_params()
        norm_first = np.concatenate((norm_lf0[0], (0.0,)), axis=0)
        norm_second = np.concatenate((norm_lf0[1], (1.0,)), axis=0)
    else:
        norm_first, norm_second = norm_params_ext_deltas.get_params()

    if return_dict:
        # Return the dict of labels for all utterances.
        return label_dict, norm_first, norm_second
    else:
        return norm_first, norm_second
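# A hedged usage sketch (hypothetical paths). Note that without deltas the returned tuples
# are the concatenated [lf0, vuv] means and standard deviations, with the vuv entries pinned
# to (0.0, 1.0) so that normalisation leaves the binary flag untouched:
#
#     generator = LF0LabelGen(...)  # Constructor arguments elided on purpose.
#     norm_mean, norm_std = generator.gen_data(
#         dir_in="database/wav",
#         dir_out="database/LF0",
#         file_id_list="database/file_id_list.txt")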