def compute_statistics():
    """Compute mean/std statistics of dumped train-split features for later normalization.

    Reads the ``raw-feats`` (mel), ``raw-f0`` and ``raw-energies`` ``.npy`` files
    under ``<rootdir>/train``, incrementally fits one StandardScaler per feature,
    and writes the statistics via ``save_statistics_to_file``.
    """
    config = parse_and_config()

    # Find feature files for the train split.
    # FIX: glob.glob returns files in arbitrary, OS-dependent order; the three
    # lists are consumed positionally via zip() below, so they must be sorted
    # to guarantee mel/f0/energy of the SAME utterance are paired together.
    glob_fn = lambda x: sorted(
        glob.glob(os.path.join(config["rootdir"], "train", x, "*.npy")))
    glob_mel = glob_fn("raw-feats")
    glob_f0 = glob_fn("raw-f0")
    glob_energy = glob_fn("raw-energies")
    assert (
        len(glob_mel) == len(glob_f0) == len(glob_energy)
    ), "Features, f0 and energies have different files in training split."

    logging.info(f"Computing statistics for {len(glob_mel)} files.")

    # init scaler for multiple features
    scaler_mel = StandardScaler(copy=False)
    scaler_energy = StandardScaler(copy=False)
    scaler_f0 = StandardScaler(copy=False)

    for mel, f0, energy in tqdm(zip(glob_mel, glob_f0, glob_energy),
                                total=len(glob_mel)):
        # remove outliers
        energy = remove_outlier(np.load(energy))
        f0 = remove_outlier(np.load(f0))
        # FIX: skip degenerate files whose f0/energy is all zero after outlier
        # removal — partial_fit on an empty array raises, and preprocess()
        # applies the same guard when it fits these scalers.
        if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
            logging.warning("Skipping file with all-zero f0 or energy after outlier removal.")
            continue
        # partial fitting of scalers
        scaler_mel.partial_fit(np.load(mel))
        scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
        scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

    # save statistics to file
    logging.info("Saving computed statistics.")
    scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"),
                   (scaler_f0, "_f0")]
    save_statistics_to_file(scaler_list, config)
def gen_audio_features(item, config):
    """Generate audio features and transformations.

    Args:
        item (Dict): dictionary containing the attributes to encode
            (must provide at least "audio", "utt_id", "rate").
        config (Dict): configuration dictionary.

    Returns:
        (bool): keep this sample or not.
        mel (ndarray): mel matrix in np.float32, shape (#frames + 1, #bins).
        energy (ndarray): energy audio profile (one value per mel frame).
        f0 (ndarray): fundamental frequency (one value per mel frame).
        item (Dict): dictionary containing the updated attributes.
    """
    # get info from sample.
    audio = item["audio"]
    utt_id = item["utt_id"]
    rate = item["rate"]

    # check audio properties
    assert len(audio.shape) == 1, f"{utt_id} seems to be multi-channel signal."
    assert np.abs(
        audio).max() <= 1.0, f"{utt_id} is different from 16 bit PCM."

    # check sample rate
    if rate != config["sampling_rate"]:
        audio = librosa.resample(audio, rate, config["sampling_rate"])
        logging.info(
            f"{utt_id} sampling rate is {rate}, not {config['sampling_rate']}, we resample it."
        )

    # trim silence
    if config["trim_silence"]:
        if "trim_mfa" in config and config["trim_mfa"]:
            # phoneme-alignment-based trimming also updates the text ids
            _, item["text_ids"], audio = ph_based_trim(
                config,
                utt_id,
                item["text_ids"],
                item["raw_text"],
                audio,
                config["hop_size"],
            )
            if (
                audio.__len__() < 1
            ):  # very short files can get trimmed fully if mfa didnt extract any tokens LibriTTS maybe take only longer files?
                logging.warning(
                    f"File have only silence or MFA didnt extract any token {utt_id}"
                )
                return False, None, None, None, item
        else:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"],
            )

    # resample audio if necessary
    if "sampling_rate_for_feats" in config:
        audio = librosa.resample(audio, rate, config["sampling_rate_for_feats"])
        sampling_rate = config["sampling_rate_for_feats"]
        assert (
            config["hop_size"] * config["sampling_rate_for_feats"] % rate == 0
        ), "'hop_size' must be 'int' value. Please check if 'sampling_rate_for_feats' is correct."
        hop_size = config["hop_size"] * config[
            "sampling_rate_for_feats"] // rate
    else:
        sampling_rate = config["sampling_rate"]
        hop_size = config["hop_size"]

    # get spectrogram
    D = librosa.stft(
        audio,
        n_fft=config["fft_size"],
        hop_length=hop_size,
        win_length=config["win_length"],
        window=config["window"],
        pad_mode="reflect",
    )
    S, _ = librosa.magphase(D)  # (#bins, #frames)

    # get mel basis
    fmin = 0 if config["fmin"] is None else config["fmin"]
    fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )
    mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T  # (#frames, #bins)
    mel_eos = np.zeros(shape=[1, np.shape(mel)[1]
                              ])  # (1, #bins) # represent mel for eos_token.
    mel = np.concatenate([mel, mel_eos], axis=0)  # (#frames + 1, #bins)

    # check audio and feature length
    audio_eos = np.zeros(
        shape=[hop_size])  # (hop_size) # represent audio for eos_token.
    audio = np.concatenate([audio, audio_eos], axis=-1)
    audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
    audio = audio[:len(mel) * hop_size]
    # FIX: the failure message referenced an undefined name 'hope_size',
    # which raised NameError instead of the intended AssertionError text.
    assert len(mel) * hop_size == len(
        audio), f"{len(mel) * hop_size}, {len(audio)}"

    # extract raw pitch
    _f0, t = pw.dio(
        audio.astype(np.double),
        fs=sampling_rate,
        f0_ceil=fmax,
        frame_period=1000 * hop_size / sampling_rate,
    )
    f0 = pw.stonemask(audio.astype(np.double), _f0, t, sampling_rate)
    # align pitch contour length to the mel frame count
    if len(f0) >= len(mel):
        f0 = f0[:len(mel)]
    else:
        f0 = np.pad(f0, (0, len(mel) - len(f0)))

    # extract energy
    energy = np.sqrt(np.sum(S**2, axis=0))
    energy = np.concatenate([energy, [0]],
                            axis=-1)  # # represent energy for eos_token.
    assert len(mel) == len(f0) == len(
        energy), f"{len(mel)}, {len(f0)}, {len(energy)}"

    # apply global gain
    if config["global_gain_scale"] > 0.0:
        audio *= config["global_gain_scale"]
    if np.abs(audio).max() >= 1.0:
        # FIX: logging.warn is a deprecated alias of logging.warning
        logging.warning(
            f"{utt_id} causes clipping. It is better to reconsider global gain scale value."
        )

    item["audio"] = audio
    item["mel"] = mel
    item["f0"] = remove_outlier(f0)
    item["energy"] = remove_outlier(energy)
    return True, mel, energy, f0, item
def preprocess():
    """Run preprocessing process and compute statistics for normalizing.

    Builds a train/valid split, dumps per-utterance features to
    ``<outdir>/{train,valid}``, fits StandardScalers on the train split and
    saves their statistics via ``save_statistics_to_file``.
    """
    config = parse_and_config()

    dataset_processor = {
        "ljspeech": LJSpeechProcessor,
        "kss": KSSProcessor,
        "multispeaker": ExampleMultispeaker,
    }
    dataset_cleaner = {
        "ljspeech": "english_cleaners",
        "kss": "korean_cleaners",
        "multispeaker": None,
    }

    logging.info(f"Selected '{config['dataset']}' processor.")
    processor = dataset_processor[config["dataset"]](
        config["rootdir"], cleaner_names=dataset_cleaner[config["dataset"]])

    # check output directories
    build_dir = lambda x: [
        os.makedirs(os.path.join(config["outdir"], x, y), exist_ok=True)
        for y in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]
    ]
    build_dir("train")
    build_dir("valid")

    # build train test split
    if config["dataset"] == "multispeaker":
        # stratified on the speaker label stored in the last tuple field
        train_split, valid_split, _, _ = train_test_split(
            processor.items,
            [i[-1] for i in processor.items],
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    else:
        train_split, valid_split = train_test_split(
            processor.items,
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    logging.info(f"Training items: {len(train_split)}")
    logging.info(f"Validation items: {len(valid_split)}")

    get_utt_id = lambda x: os.path.split(x[1])[-1].split(".")[0]
    train_utt_ids = [get_utt_id(x) for x in train_split]
    valid_utt_ids = [get_utt_id(x) for x in valid_split]

    # save train and valid utt_ids to track later
    np.save(os.path.join(config["outdir"], "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(config["outdir"], "valid_utt_ids.npy"), valid_utt_ids)

    # define map iterator
    def iterator_data(items_list):
        for item in items_list:
            yield processor.get_one_sample(item)

    train_iterator_data = iterator_data(train_split)
    valid_iterator_data = iterator_data(valid_split)

    partial_fn = partial(gen_audio_features, config=config)

    # FIX: the original leaked the worker pool (never closed/joined);
    # the context manager guarantees deterministic teardown.
    with Pool(config["n_cpus"]) as p:
        # preprocess train files and get statistics for normalizing
        train_map = p.imap_unordered(
            partial_fn,
            tqdm(train_iterator_data,
                 total=len(train_split),
                 desc="[Preprocessing train]"),
            chunksize=10,
        )

        # init scaler for multiple features
        scaler_mel = StandardScaler(copy=False)
        scaler_energy = StandardScaler(copy=False)
        scaler_f0 = StandardScaler(copy=False)

        id_to_remove = []
        for result, mel, energy, f0, features in train_map:
            if not result:
                id_to_remove.append(features["utt_id"])
                continue
            save_features_to_file(features, "train", config)
            # remove outliers
            energy = remove_outlier(energy)
            f0 = remove_outlier(f0)
            # partial fitting of scalers; all-zero f0/energy cannot be fitted
            if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
                id_to_remove.append(features["utt_id"])
                continue
            scaler_mel.partial_fit(mel)
            scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
            scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

        if len(id_to_remove) > 0:
            np.save(
                os.path.join(config["outdir"], "train_utt_ids.npy"),
                [i for i in train_utt_ids if i not in id_to_remove],
            )
            logging.info(
                f"removed {len(id_to_remove)} cause of too many outliers or bad mfa extraction"
            )

        # save statistics to file
        logging.info("Saving computed statistics.")
        scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"),
                       (scaler_f0, "_f0")]
        save_statistics_to_file(scaler_list, config)

        # preprocess valid files
        valid_map = p.imap_unordered(
            partial_fn,
            tqdm(valid_iterator_data,
                 total=len(valid_split),
                 desc="[Preprocessing valid]"),
            chunksize=10,
        )
        # FIX: the original ignored the keep-flag on the valid split and saved
        # failed samples whose mel/f0/energy are None; skip them and rewrite
        # valid_utt_ids just like the train loop does.
        valid_id_to_remove = []
        for result, *_, features in valid_map:
            if not result:
                valid_id_to_remove.append(features["utt_id"])
                continue
            save_features_to_file(features, "valid", config)
        if len(valid_id_to_remove) > 0:
            np.save(
                os.path.join(config["outdir"], "valid_utt_ids.npy"),
                [i for i in valid_utt_ids if i not in valid_id_to_remove],
            )
            logging.info(
                f"removed {len(valid_id_to_remove)} valid items cause of bad mfa extraction"
            )
def main():
    """Run preprocessing process.

    Command-line entry point: computes mean/scale statistics of dumped raw
    mel features, then mean/std of nonzero f0 and energy values, writing
    ``stats.npy``, ``stats_f0.npy`` and ``stats_energy.npy`` to --outdir.
    """
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features "
        "(See detail in tensorflow_tts/bin/compute_statistics.py).")
    parser.add_argument("--rootdir",
                        type=str,
                        required=True,
                        help="directory including feature files. ")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    # FIX: --outdir was declared required=True while also having default=None
    # and an explicit None-fallback below, making that fallback dead code.
    # Dropping required=True is backward-compatible (existing invocations that
    # pass --outdir behave identically) and makes the fallback reachable.
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        help="directory to save statistics "
                        "(default: dirname of --rootdir).")
    parser.add_argument("--verbose",
                        type=int,
                        default=1,
                        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG, format=log_format)
    elif args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning('Skip DEBUG/INFO messages')

    # check directory existence
    # FIX: resolve outdir BEFORE config.update(vars(args)) so config carries
    # the final outdir value rather than None.
    if args.outdir is None:
        args.outdir = os.path.dirname(args.rootdir)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # get dataset
    if config["format"] == "npy":
        mel_query = "*-raw-feats.npy"
        f0_query = "*-raw-f0.npy"
        energy_query = "*-raw-energy.npy"
        mel_load_fn = np.load
    else:
        raise ValueError("Support only npy format.")

    dataset = MelDataset(
        args.rootdir,
        mel_query=mel_query,
        mel_load_fn=mel_load_fn,
    ).create(batch_size=1)

    # calculate statistics
    scaler = StandardScaler()
    for mel, mel_length in tqdm(dataset):
        mel = mel[0].numpy()
        scaler.partial_fit(mel)

    # save to file
    stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
    np.save(os.path.join(args.outdir, "stats.npy"),
            stats.astype(np.float32),
            allow_pickle=False)

    # calculate statistic of f0
    f0_dataset = AudioDataset(
        args.rootdir,
        audio_query=f0_query,
        audio_load_fn=np.load,
    ).create(batch_size=1)

    pitch_vecs = []
    for f0, f0_length in tqdm(f0_dataset):
        f0 = f0[0].numpy()  # [T]
        f0 = remove_outlier(f0)
        pitch_vecs.append(f0)
    # statistics over voiced (nonzero) frames only
    nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in pitch_vecs])
    mean, std = np.mean(nonzeros), np.std(nonzeros)

    # save to file
    stats = np.stack([mean, std], axis=0)
    np.save(os.path.join(args.outdir, "stats_f0.npy"),
            stats.astype(np.float32),
            allow_pickle=False)

    # calculate statistic of energy
    energy_dataset = AudioDataset(
        args.rootdir,
        audio_query=energy_query,
        audio_load_fn=np.load,
    ).create(batch_size=1)

    energy_vecs = []
    for e, e_length in tqdm(energy_dataset):
        e = e[0].numpy()
        e = remove_outlier(e)
        energy_vecs.append(e)
    nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in energy_vecs])
    mean, std = np.mean(nonzeros), np.std(nonzeros)

    # save to file
    stats = np.stack([mean, std], axis=0)
    np.save(os.path.join(args.outdir, "stats_energy.npy"),
            stats.astype(np.float32),
            allow_pickle=False)
def _norm_mean_std(self, x, mean, std):
    """Mean/std-normalize ``x`` in place, keeping zero entries at zero.

    Outliers are removed first; positions that are exactly 0.0 (presumably
    unvoiced/silent frames — TODO confirm against callers) are restored to
    0.0 after normalization so they stay "missing" rather than becoming
    ``-mean/std``.
    """
    x = remove_outlier(x)
    zero_mask = np.where(x == 0.0)[0]
    normalized = (x - mean) / std
    normalized[zero_mask] = 0.0
    return normalized
def preprocess_multispeaker():
    """Run preprocessing and compute normalization statistics for the
    fixed two-speaker corpus (671 "bul" items followed by 302 "synd" items).

    Performs a per-speaker train/valid split so both speakers are represented
    in both splits, dumps features and saves scaler statistics.
    """
    config = parse_and_config()

    # DIFFERENCE
    dataset_processor = {"multispeaker": MultiSpeakerProcessor}
    logging.info(f"Selected '{config['dataset']}' processor.")
    processor = dataset_processor[config["dataset"]](
        config["rootdir"], cleaner_names="english_cleaners")

    # check output directories
    build_dir = lambda x: [
        os.makedirs(os.path.join(config["outdir"], x, y), exist_ok=True)
        for y in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]
    ]
    build_dir("train")
    build_dir("valid")

    # DIFFERENCE: items 0..670 belong to speaker "bul", the rest to "synd"
    bul_items = processor.items[:671]
    synd_items = processor.items[671:]
    assert (
        len(bul_items) == 671 and len(synd_items) == 302
    ), f"SPLIT WAS UNSUCCESSFUL bul:{len(bul_items)} synd:{len(synd_items)}"

    train_split = []
    valid_split = []
    # build train test split (independently per speaker)
    bul_train_split, bul_valid_split = train_test_split(
        bul_items,
        test_size=config["test_size"],
        random_state=42,
        shuffle=True,
    )
    synd_train_split, synd_valid_split = train_test_split(
        synd_items,
        test_size=config["test_size"],
        random_state=42,
        shuffle=True,
    )
    train_split.extend(bul_train_split)
    train_split.extend(synd_train_split)
    valid_split.extend(bul_valid_split)
    valid_split.extend(synd_valid_split)
    assert (
        len(train_split) + len(valid_split) == 973
    ), f"SPLIT WAS UNSUCCESSFUL train:{len(train_split)} valid:{len(valid_split)}"

    logging.info(f"Training items: {len(train_split)}")
    logging.info(f"Validation items: {len(valid_split)}")

    get_utt_id = lambda x: os.path.split(x[1])[-1].split(".")[0]
    train_utt_ids = [get_utt_id(x) for x in train_split]
    valid_utt_ids = [get_utt_id(x) for x in valid_split]

    # save train and valid utt_ids to track later
    np.save(os.path.join(config["outdir"], "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(config["outdir"], "valid_utt_ids.npy"), valid_utt_ids)

    # define map iterator
    def iterator_data(items_list):
        for item in items_list:
            yield processor.get_one_sample(item)

    train_iterator_data = iterator_data(train_split)
    valid_iterator_data = iterator_data(valid_split)

    partial_fn = partial(gen_audio_features, config=config)

    # FIX: the original leaked the pool; the context manager tears it down.
    with Pool(config["n_cpus"]) as p:
        # preprocess train files and get statistics for normalizing
        train_map = p.imap_unordered(
            partial_fn,
            tqdm(train_iterator_data,
                 total=len(train_split),
                 desc="[Preprocessing train]"),
            chunksize=10,
        )

        # init scaler for multiple features
        scaler_mel = StandardScaler(copy=False)
        scaler_energy = StandardScaler(copy=False)
        scaler_f0 = StandardScaler(copy=False)

        # FIX: gen_audio_features yields a 5-tuple (keep_flag, mel, energy, f0,
        # item); the original unpacked only 4 values, which raises ValueError on
        # the first result, and never checked the keep flag. Mirror preprocess().
        id_to_remove = []
        for result, mel, energy, f0, features in train_map:
            if not result:
                id_to_remove.append(features["utt_id"])
                continue
            save_features_to_file(features, "train", config)
            # remove outliers
            energy = remove_outlier(energy)
            f0 = remove_outlier(f0)
            # partial fitting of scalers; all-zero f0/energy cannot be fitted
            if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
                id_to_remove.append(features["utt_id"])
                continue
            scaler_mel.partial_fit(mel)
            scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
            scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

        if len(id_to_remove) > 0:
            np.save(
                os.path.join(config["outdir"], "train_utt_ids.npy"),
                [i for i in train_utt_ids if i not in id_to_remove],
            )
            logging.info(
                f"removed {len(id_to_remove)} cause of too many outliers or bad mfa extraction"
            )

        # save statistics to file
        logging.info("Saving computed statistics.")
        scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"),
                       (scaler_f0, "_f0")]
        save_statistics_to_file(scaler_list, config)

        # preprocess valid files
        valid_map = p.imap_unordered(
            partial_fn,
            tqdm(valid_iterator_data,
                 total=len(valid_split),
                 desc="[Preprocessing valid]"),
            chunksize=10,
        )
        for result, *_, features in valid_map:
            if not result:
                # failed samples carry None features; nothing to save
                continue
            save_features_to_file(features, "valid", config)
def generate(data):
    """Extract f0 and energy features for one sample with LPCNet-style
    16 kHz framing.

    Args:
        data (Dict): must provide "tid" (utterance id), "audio" (1-D float
            array in [-1, 1]) and "mels" (int, number of mel frames to
            align f0/energy to — presumably computed elsewhere; TODO confirm).

    Returns:
        Dict with keys "tid", "f0", "energy" (both outlier-filtered,
        length == data["mels"]).
    """
    tid = data["tid"]
    audio = data["audio"]
    mels = data["mels"]

    # If the fft size is 2048, audio with 22050Hz sample rate is processed in
    # 93 millisecond increments. If it is 512, it is processed by 23 milliseconds.
    # lpcnet spec: fft_size 320 (or 640) at 16000 Hz -> 20 ms (or 40 ms)
    fft_size = 320
    hop_size = 160
    # FIX: samplerate appeared inside a trailing comment in the original text,
    # yet it is used by pw.dio/pw.stonemask below — it must be defined.
    samplerate = 16000

    # check audio properties
    assert len(audio.shape) == 1, f"{tid} seems to be multi-channel signal."
    assert np.abs(audio).max() <= 1.0, f"{tid} is different from 16 bit PCM."

    # get spectrogram
    D = librosa.stft(
        audio,
        n_fft=fft_size,
        hop_length=hop_size,  # default: win_length // 4
        win_length=None,  # default: win_length = n_fft
        window='hann',  # default: cosine window ('hann')
        pad_mode="reflect")
    S, _ = librosa.magphase(D)  # (#bins, #frames)

    # check audio and feature length
    audio = np.pad(audio, (0, 3200), mode="edge")
    audio = audio[:mels * hop_size]
    assert mels * hop_size == len(audio)

    # extract raw pitch
    _f0, t = pw.dio(audio.astype(np.double),
                    fs=samplerate,
                    f0_ceil=7600,
                    frame_period=1000 * hop_size / samplerate)
    f0 = pw.stonemask(audio.astype(np.double), _f0, t, samplerate)
    # align pitch contour length to the requested frame count
    if len(f0) >= mels:
        f0 = f0[:mels]
    else:
        f0 = np.pad(f0, (0, mels - len(f0)))

    # extract energy, aligned the same way
    energy = np.sqrt(np.sum(S**2, axis=0))
    if len(energy) >= mels:
        energy = energy[:mels]
    else:
        energy = np.pad(energy, (0, mels - len(energy)))
    assert mels == len(f0) == len(energy)

    # remove outlier f0/energy
    f0 = remove_outlier(f0)
    energy = remove_outlier(energy)

    item = {}
    item["tid"] = tid
    item["f0"] = f0
    item["energy"] = energy
    return item