def main(config, random_seed, dist, apply_normalization, n_pad):
    """Build an IRM (ideal ratio mask) speech enhancement dataset.

    The dataset is utterance-level; the spectrogram of each noisy utterance has
    the same size as that of its corresponding clean utterance.

    Steps:
        1. Load the clean speech signals
        2. Load the noise files
        3. Superimpose the noise on the clean speech
        4. Compute the spectrograms, masks, etc.
        5. Store the noisy spectrograms and the masks separately

    Args:
        config (dict): configuration
        random_seed (int): random seed
        dist (str): output directory
        apply_normalization (bool): whether to normalize the mixture speech
        n_pad (int): number of context frames added on each side of a mixture
            frame; after expansion the center frame corresponds to one frame of
            the mask

    Dataset:
        dataset_1/
            mixture.pkl
            mask.pkl
        ...

        mixture.pkl is
            {
                "0001_babble_-5": (257, T * (n_pad * 2 + 1)),
                "0001_babble_-10": (257, T * (n_pad * 2 + 1)),
                ...
            }
        mask.pkl is
            {
                "0001_babble_-5": (257, T),
                "0001_babble_-10": (257, T),
                ...
            }
    """
    np.random.seed(random_seed)
    dist_dir = Path(dist)

    # Iterate over the dataset configurations listed in config.json
    for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1):
        dataset_dir = dist_dir / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir])
        print("=" * 12 + f" Building set {dataset_itx}: {dataset_cfg['name']} set " + "=" * 12)

        # Load the clean speech signals into a list
        clean_cfg = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_cfg["database"],
            ext=clean_cfg["ext"],
            recurse=clean_cfg["recurse"],
            limit=clean_cfg["limit"],
            offset=clean_cfg["offset"])
        random.shuffle(clean_speech_paths)
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            sr=clean_cfg["sampling_rate"],
            min_sampling=clean_cfg["min_sampling"],
        )
        print("Loaded clean speeches.")

        # Load the noise signals into a dict
        noise_cfg = dataset_cfg["noise"]
        noise_database_dir = Path(noise_cfg["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load(
                (noise_database_dir / (noise_type + ".wav")).as_posix(),
                sr=noise_cfg["sampling_rate"])
            noise_ys[noise_type] = noise_y
        print("Loaded noise.")

        # Synthesize the noisy speech
        mixture_store = {}
        mask_store = {}
        for i, clean in tqdm(enumerate(clean_ys, start=1), desc="Synthesizing noisy speech"):
            num = str(i).zfill(4)
            for snr in dataset_cfg["snr"]:
                for noise_type in noise_ys.keys():
                    basename_text = f"{num}_{noise_type}_{snr}"
                    clean, noise = corrected_the_length_of_noise_and_clean_speech(
                        clean_y=clean,
                        noise_y=noise_ys[noise_type])
                    mixture = add_noise_for_waveform(clean, noise, int(snr))

                    mixture_mag = mag(mixture)
                    clean_mag = mag(clean)
                    noise_mag = mag(noise)

                    if apply_normalization:
                        mixture_mag = input_normalization(mixture_mag)

                    mixture_mag = unfold_spectrum(mixture_mag, n_pad=n_pad)
                    mask = noise_mag / (noise_mag + clean_mag)
                    assert mixture_mag.shape[0] == mask.shape[0] == 257

                    mixture_store[basename_text] = mixture_mag
                    mask_store[basename_text] = mask

        print("Synthesis finished, storing files...")
        joblib.dump(mask_store, (dataset_dir / "mask.pkl").as_posix())
        joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix())
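# A minimal sketch of the frame-context expansion performed by unfold_spectrum
# above. Only the helper name and its (spectrum, n_pad) signature come from this
# repo; the edge-padding strategy and the (257, T * (n_pad * 2 + 1)) output
# layout are assumptions inferred from the docstring and the shape assert.
import numpy as np

def unfold_spectrum_sketch(spec, n_pad):
    """Expand an (F, T) spectrum to (F, T * (n_pad * 2 + 1)) by concatenating,
    for every frame t, the window of frames [t - n_pad, t + n_pad] along time."""
    # Replicate the border frames so the first/last frames still get full context.
    padded = np.pad(spec, ((0, 0), (n_pad, n_pad)), mode="edge")
    windows = [padded[:, t:t + 2 * n_pad + 1] for t in range(spec.shape[1])]
    return np.concatenate(windows, axis=1)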
def main(config, random_seed, dist, n_pad):
    """Build a *frequency-domain* speech enhancement dataset (log power spectrum, LPS).

    Each time step of a noisy utterance contains multiple frames; the center frame
    corresponds to one frame of the clean utterance at that time step.
    Frames before the center frame:
    Frames after the center frame:
    TODO: documentation to be completed

    Steps:
        1. Load the clean speech signals
        2. Load the noise files
        3. Superimpose the noise on the clean speech
        4. Compute the LPS features of both
        5. Expand the LPS features of the noisy speech with context frames
        6. Store the noisy and clean features separately

    Args:
        config (dict): configuration
        random_seed (int): random seed
        dist (str): output directory
        n_pad (int): context expansion size of the noisy speech

    Dataset:
        dataset_1/
            mixture.pkl
            clean.pkl
        ...

        mixture.pkl is
            {
                "0001_babble_-5": (257, T * (n_pad * 2 + 1)),
                "0001_babble_-10": (257, T * (n_pad * 2 + 1)),
                ...
            }
        clean.pkl is
            {
                "0001": (257, T),
                "0002": (257, T),
                ...
            }
    """
    np.random.seed(random_seed)
    dist_dir = Path(dist)

    # Iterate over the dataset configurations listed in config.json
    for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1):
        dataset_dir = dist_dir / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir])
        print("=" * 12 + f" Building set {dataset_itx}: {dataset_cfg['name']} set " + "=" * 12)

        # Load the clean speech signals into a list
        clean_cfg = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_cfg["database"],
            ext=clean_cfg["ext"],
            recurse=clean_cfg["recurse"],
            limit=clean_cfg["limit"],
            offset=clean_cfg["offset"])
        random.shuffle(clean_speech_paths)
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            sr=clean_cfg["sampling_rate"],
            min_sampling=clean_cfg["min_sampling"],
        )
        print("Loaded clean speeches.")

        # Load the noise signals into a dict
        noise_cfg = dataset_cfg["noise"]
        noise_database_dir = Path(noise_cfg["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load(
                (noise_database_dir / (noise_type + ".wav")).as_posix(),
                sr=noise_cfg["sampling_rate"])
            noise_ys[noise_type] = noise_y
        print("Loaded noise.")

        # Synthesize the noisy speech
        mixture_store = {}
        clean_store = {}
        for i, clean in tqdm(enumerate(clean_ys, start=1), desc="Synthesizing noisy speech"):
            num = str(i).zfill(4)
            for snr in dataset_cfg["snr"]:
                for noise_type in noise_ys.keys():
                    basename_text = f"{num}_{noise_type}_{snr}"
                    clean, noise = corrected_the_length_of_noise_and_clean_speech(
                        clean_y=clean,
                        noise_y=noise_ys[noise_type])
                    mixture = add_noise_for_waveform(clean, noise, int(snr))
                    assert len(mixture) == len(clean) == len(noise)

                    mixture_lps = lps(mixture)
                    clean_lps = lps(clean)
                    mixture_lps = unfold_spectrum(mixture_lps, n_pad=n_pad)
                    assert mixture_lps.shape[0] == clean_lps.shape[0] == 257

                    mixture_store[basename_text] = mixture_lps
                    clean_store[num] = clean_lps

        print("Synthesis finished, storing files...")
        joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix())
        joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix())
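# A minimal sketch of the lps (log power spectrum) feature used above, assuming a
# 512-point STFT, which yields the 257 frequency bins asserted in these scripts.
# The helper name lps comes from this repo; the exact STFT parameters (n_fft,
# hop length, window) and the epsilon floor are assumptions.
import librosa
import numpy as np

def lps_sketch(y, n_fft=512, hop_length=256):
    """Return the log power spectrum of waveform `y`, shape (n_fft // 2 + 1, T)."""
    spec = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    # A small floor keeps the log finite on silent frames.
    return np.log(np.abs(spec) ** 2 + 1e-12)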
def main(config, random_seed, dist):
    """Build a time-domain speech enhancement dataset.

    Steps:
        1. Load the clean speech signals
        2. Load the noise files
        3. Superimpose the noise on the clean speech
        4. Store the noisy and clean waveforms separately

    Args:
        config (dict): configuration
        random_seed (int): random seed
        dist (str): output directory

    Dataset:
        dataset_1/
            Clean/
                dataset_1_0.wav
                dataset_1_1.wav
                ...
            Noisy/
                dataset_1_0.wav
                dataset_1_1.wav
                ...
        ...

        Files with the same name under Clean/ and Noisy/ are the same utterance
        before and after the noise is added.
    """
    np.random.seed(random_seed)
    dist_dir = Path(dist)

    # Iterate over the dataset configurations listed in config.json
    for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1):
        dataset_dir = dist_dir / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir, dataset_dir / "Clean", dataset_dir / "Noisy"])
        print("=" * 12 + f" Building set {dataset_itx}: {dataset_cfg['name']} set " + "=" * 12)

        # Load the clean speech signals into a list
        clean_cfg = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_cfg["database"],
            ext=clean_cfg["ext"],
            recurse=clean_cfg["recurse"],
            limit=clean_cfg["limit"],
            offset=clean_cfg["offset"]
        )
        random.shuffle(clean_speech_paths)
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            sr=clean_cfg["sampling_rate"],
            min_sampling=clean_cfg["min_sampling"],
        )
        print("Loaded clean speeches.")

        # Load the noise signals into a dict
        noise_cfg = dataset_cfg["noise"]
        noise_database_dir = Path(noise_cfg["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load(
                (noise_database_dir / (noise_type + ".wav")).as_posix(),
                sr=noise_cfg["sampling_rate"])
            noise_ys[noise_type] = noise_y
        print("Loaded noise.")

        # Synthesize the noisy speech and write the wav pairs to disk
        n = 0
        for i, clean in tqdm(enumerate(clean_ys, start=1), desc="Synthesizing noisy speech"):
            for snr in dataset_cfg["snr"]:
                for noise_type in noise_ys.keys():
                    clean, noise = corrected_the_length_of_noise_and_clean_speech(
                        clean_y=clean,
                        noise_y=noise_ys[noise_type]
                    )
                    mixture = add_noise_for_waveform(clean, noise, int(snr))
                    assert len(mixture) == len(clean) == len(noise)

                    fname = f"{dataset_cfg['name']}_{n}.wav"
                    librosa.output.write_wav((dataset_dir / "Clean" / fname).as_posix(), clean, sr=16000)
                    librosa.output.write_wav((dataset_dir / "Noisy" / fname).as_posix(), mixture, sr=16000)
                    n += 1
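# A minimal sketch of the SNR mixing performed by add_noise_for_waveform above.
# The helper name and its (clean, noise, snr) signature come from this repo; the
# scaling rule below (scale the noise power so that the clean-to-noise power
# ratio equals the target SNR in dB) is an assumed implementation, not
# necessarily the repo's exact one.
import numpy as np

def add_noise_for_waveform_sketch(clean, noise, snr_db):
    """Mix `noise` into `clean` at the given SNR (dB); both must have equal length."""
    clean_power = np.mean(clean ** 2)
    noise_power = np.mean(noise ** 2) + 1e-12  # avoid division by zero
    # Noise power required for the target SNR, then the matching amplitude scale.
    target_noise_power = clean_power / (10 ** (snr_db / 10))
    return clean + noise * np.sqrt(target_noise_power / noise_power)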
def main(config):
    OUTPUT_DIR = Path(config["output_dir"])
    SAMPLING_RATE = config["sampling_rate"]

    for j, dataset_cfg in enumerate(config["datasets"]):
        print(f"============ Building set {j + 1}: {dataset_cfg['name']} set ============")
        dataset_dir: Path = OUTPUT_DIR / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir])

        """============ clean speeches ============"""
        clean_meta = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_meta["database"],
            ext=clean_meta["ext"],
            recurse=clean_meta["recurse"],
            limit=None,
            offset=clean_meta["offset"]
        )
        random.shuffle(clean_speech_paths)

        # When loading clean speech, the minimum_sampling parameter controls the minimum
        # number of samples a loaded utterance must have. There is no such parameter when
        # loading noise: if the noise turns out to be shorter than the speech during
        # synthesis, the noise is repeated before the noisy speech is synthesized.
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            limit=clean_meta["limit"],
            sr=SAMPLING_RATE,
            minimum_sampling=clean_meta["minimum_sampling"],
        )
        print("Loaded clean speeches.")

        """============ noise speeches ============"""
        noise_meta = dataset_cfg["noise"]
        noise_database_dir = Path(noise_meta["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_meta["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load((noise_database_dir / (noise_type + ".wav")).as_posix(), sr=SAMPLING_RATE)
            noise_ys[noise_type] = noise_y
        print("Loaded noise.")

        """============ Synthesis ============"""
        # Noisy mixtures
        for i, SNR in enumerate(dataset_cfg["SNRs"]):
            store = {}
            clean_store = {}
            for k, clean_y in tqdm(enumerate(clean_ys, 1), desc="Add noise for clean waveform"):
                for noise_type in noise_ys.keys():
                    output_wav_basename_text = f"{str(k).zfill(4)}_{noise_type}"
                    clean_y, noise_y = corrected_length(
                        clean_y=clean_y,
                        noise_y=noise_ys[noise_type]
                    )
                    noisy_y = add_noise_for_waveform(clean_y, noise_y, int(SNR))
                    assert len(noisy_y) == len(clean_y) == len(noise_y)

                    # SNR == -5 is the input of the whole model and uses 7 frames;
                    # the remaining SNRs and the clean speech are the training targets and use 1 frame.
                    if SNR == -5:
                        tmp_lps = torch.Tensor(lps(noisy_y, pad=3).T).unfold(0, 7, 1)
                        store[output_wav_basename_text] = tmp_lps.reshape(tmp_lps.shape[0], -1).numpy()
                    else:
                        store[output_wav_basename_text] = lps(noisy_y).T

                    if i == 0:
                        clean_store[output_wav_basename_text] = lps(clean_y).T

            print(f"Synthesis of dB{SNR} finished, storing NPY file...")
            if clean_store:
                print("Saving clean NPY file...")
                np.save((dataset_dir / "clean.npy").as_posix(), clean_store)
            np.save((dataset_dir / f"dB{SNR}.npy").as_posix(), store)
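# A minimal sketch of the length correction described in the comment above: if
# the noise is shorter than the clean speech, the noise is tiled until it is long
# enough and then trimmed to the clean length. The names corrected_length and
# corrected_the_length_of_noise_and_clean_speech come from this repo; the exact
# tiling-and-trimming code below is inferred from that comment, not copied from the repo.
import numpy as np

def corrected_length_sketch(clean_y, noise_y):
    """Return (clean_y, noise_y) with noise_y repeated/trimmed to len(clean_y)."""
    if len(noise_y) < len(clean_y):
        repeats = int(np.ceil(len(clean_y) / len(noise_y)))
        noise_y = np.tile(noise_y, repeats)
    return clean_y, noise_y[:len(clean_y)]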
def main(config, random_seed, dist):
    """Build a time-domain speech enhancement dataset.

    Steps:
        1. Load the clean speech signals
        2. Load the noise files
        3. Superimpose the noise on the clean speech
        4. Store the noisy and clean waveforms separately

    Args:
        config (dict): configuration
        random_seed (int): random seed
        dist (str): output directory

    Dataset:
        dataset_1/
            mixture.pkl
            clean.pkl
        ...

        mixture.pkl is
            {
                "0001_babble_-5": [signals, ...],
                "0001_babble_-10": [signals, ...],
                ...
            }
        clean.pkl is
            {
                "0001": [signals, ...],
                "0002": [signals, ...],
                ...
            }
    """
    np.random.seed(random_seed)
    dist_dir = Path(dist)

    # Iterate over the dataset configurations listed in config.json
    for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1):
        dataset_dir = dist_dir / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir])
        print("=" * 12 + f" Building set {dataset_itx}: {dataset_cfg['name']} set " + "=" * 12)

        # Load the clean speech signals into a list
        clean_cfg = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_cfg["database"],
            ext=clean_cfg["ext"],
            recurse=clean_cfg["recurse"],
            limit=clean_cfg["limit"],
            offset=clean_cfg["offset"])
        random.shuffle(clean_speech_paths)
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            sr=clean_cfg["sampling_rate"],
            min_sampling=clean_cfg["min_sampling"],
        )
        print("Loaded clean speeches.")

        # Load the noise signals into a dict
        noise_cfg = dataset_cfg["noise"]
        noise_database_dir = Path(noise_cfg["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load(
                (noise_database_dir / (noise_type + ".wav")).as_posix(),
                sr=noise_cfg["sampling_rate"])
            noise_ys[noise_type] = noise_y
        print("Loaded noise.")

        # Synthesize the noisy speech
        mixture_store = {}
        clean_store = {}
        for i, clean in tqdm(enumerate(clean_ys, start=1), desc="Synthesizing noisy speech"):
            num = str(i).zfill(4)
            for snr in dataset_cfg["snr"]:
                for noise_type in noise_ys.keys():
                    basename_text = f"{num}_{noise_type}_{snr}"
                    clean, noise = corrected_the_length_of_noise_and_clean_speech(
                        clean_y=clean,
                        noise_y=noise_ys[noise_type])
                    mixture = add_noise_for_waveform(clean, noise, int(snr))
                    assert len(mixture) == len(clean) == len(noise)

                    mixture_store[basename_text] = mixture
                    # Multiple noisy utterances are synthesized from one clean utterance,
                    # but only one copy of the clean utterance is stored.
                    clean_store[num] = clean

        print("Synthesis finished, storing files...")
        joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix())
        joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix())
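# A hypothetical usage sketch showing how the stored dictionaries could be loaded
# back for training. joblib.load and the mixture.pkl / clean.pkl names come from
# the code above; the "dataset_1" directory used here is just the example name
# from the docstring, not a fixed location.
import joblib

def _example_load_dataset(dataset_dir="dataset_1"):
    mixture_store = joblib.load(f"{dataset_dir}/mixture.pkl")  # {"0001_babble_-5": waveform, ...}
    clean_store = joblib.load(f"{dataset_dir}/clean.pkl")      # {"0001": waveform, ...}
    # Each noisy key is "<utterance id>_<noise type>_<SNR>", so the matching clean
    # waveform is recovered by splitting off the utterance id.
    pairs = []
    for key, mixture in mixture_store.items():
        utterance_id = key.split("_")[0]
        pairs.append((mixture, clean_store[utterance_id]))
    return pairs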