def _process_single_file(data):
    """Normalize one pre-extracted feature pair and dump it to disk.

    Args:
        data: Tuple of (audio_name, mel_name, audio, mel); audio/mel are
            numpy arrays and the names are the source file paths.

    NOTE(review): relies on module-level ``scaler``, ``config`` and ``args``
    being initialized by the caller (typical multiprocessing-worker
    pattern) — confirm against the surrounding script.
    """
    # parse inputs for each audio
    audio_name, mel_name, audio, mel = data

    # normalize with the globally restored scaler
    # (FIX: removed a stray no-op string literal — a leftover sklearn
    # docstring fragment that executed as a dead expression statement)
    mel = scaler.transform(mel)

    # save; the dump file name is just the basename of the source file
    if config["format"] == "hdf5":
        write_hdf5(os.path.join(args.dumpdir, f"{os.path.basename(audio_name)}"),
                   "wave", audio.astype(np.float32))
        write_hdf5(os.path.join(args.dumpdir, f"{os.path.basename(mel_name)}"),
                   "feats", mel.astype(np.float32))
    elif config["format"] == "npy":
        np.save(os.path.join(args.dumpdir, f"{os.path.basename(audio_name)}"),
                audio.astype(np.float32), allow_pickle=False)
        np.save(os.path.join(args.dumpdir, f"{os.path.basename(mel_name)}"),
                mel.astype(np.float32), allow_pickle=False)
    else:
        raise ValueError("support only hdf5 or npy format.")
def _process_single_file(data):
    """Preprocess one utterance: check, trim, extract log-mel, and dump it.

    Args:
        data: (utt_id, (fs, audio)) when reading from a kaldi-style scp,
            otherwise (path, (audio, fs)) — see the two unpack branches.

    NOTE(review): relies on module-level ``args`` and ``config`` being set
    up by the caller (typical multiprocessing-worker pattern) — confirm.
    """
    # parse inputs
    if args.scp is not None:
        utt_id, (fs, audio) = data
        audio = audio.astype(np.float32)
        audio /= (1 << (16 - 1))  # assume that wav is PCM 16 bit
    else:
        name, (audio, fs) = data
        utt_id = os.path.basename(name).replace(".wav", "")

    # sanity checks on the raw waveform
    assert len(audio.shape) == 1, \
        f"{utt_id} seems to be multi-channel signal."
    assert fs == config["sampling_rate"], \
        f"{utt_id} seems to have a different sampling rate."
    assert np.abs(audio).max() <= 1.0, \
        f"{utt_id} seems to be different from 16 bit PCM."

    # trim silence
    if config["trim_silence"]:
        audio, _ = librosa.effects.trim(audio,
                                        top_db=config["trim_threshold_in_db"],
                                        frame_length=config["trim_frame_size"],
                                        hop_length=config["trim_hop_size"])

    # extract feature
    mel = logmelfilterbank(audio, fs,
                           fft_size=config["fft_size"],
                           hop_size=config["hop_size"],
                           win_length=config["win_length"],
                           window=config["window"],
                           num_mels=config["num_mels"],
                           fmin=config["fmin"],
                           fmax=config["fmax"])

    # make sure the audio length and feature length are matched
    audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
    audio = audio[:len(mel) * config["hop_size"]]
    assert len(mel) * config["hop_size"] == len(audio)

    # apply global gain; skip the utterance entirely if it would clip
    if config["global_gain_scale"] > 0.0:
        audio *= config["global_gain_scale"]
    if np.abs(audio).max() > 1.0:
        # FIX: logging.warn is a deprecated alias of logging.warning
        logging.warning(f"{utt_id} causes clipping. "
                        f"it is better to re-consider global gain scale.")
        return

    # save
    if config["format"] == "hdf5":
        write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                   "wave", audio.astype(np.float32))
        write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                   "feats", mel.astype(np.float32))
    elif config["format"] == "npy":
        np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                audio.astype(np.float32), allow_pickle=False)
        np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                mel.astype(np.float32), allow_pickle=False)
    else:
        raise ValueError("support only hdf5 or npy format.")
def main():
    """Run preprocessing: load audio, extract log-mel features, and dump them."""
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features "
                    "(See detail in parallel_wavegan/bin/preprocess.py).")
    parser.add_argument(
        "--wav-scp", "--scp", default=None, type=str,
        help="kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--segments", default=None, type=str,
        help="kaldi-style segments file. if use, you must to specify both scp and segments.")
    parser.add_argument(
        "--rootdir", default=None, type=str,
        help="directory including wav files. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--dumpdir", type=str, required=True,
        help="directory to dump feature files.")
    parser.add_argument(
        "--config", type=str, required=True,
        help="yaml format configuration file.")
    parser.add_argument(
        "--verbose", type=int, default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('Skip DEBUG/INFO messages')

    # load config and merge the CLI arguments into it
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments: exactly one of --wav-scp / --rootdir must be given
    if (args.wav_scp is not None and args.rootdir is not None) or \
            (args.wav_scp is None and args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --wav-scp.")

    # get dataset
    if args.rootdir is not None:
        dataset = AudioDataset(
            args.rootdir, "*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )
    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )

    # check directory existence
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir, exist_ok=True)

    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):
        # check
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be multi-channel signal."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        assert fs == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config["sampling_rate"]
            hop_size = config["hop_size"]
        else:
            # NOTE(kan-bayashi): this procedure enables to train the model with different
            #   sampling rate for feature and audio, e.g., training with mel extracted
            #   using 16 kHz audio and 24 kHz audio as a target waveform
            x = librosa.resample(audio, fs, config["sampling_rate_for_feats"])
            sampling_rate = config["sampling_rate_for_feats"]
            assert config["hop_size"] * config["sampling_rate_for_feats"] % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config["hop_size"] * config["sampling_rate_for_feats"] // fs

        # extract feature
        mel = logmelfilterbank(x,
                               sampling_rate=sampling_rate,
                               hop_size=hop_size,
                               fft_size=config["fft_size"],
                               win_length=config["win_length"],
                               window=config["window"],
                               num_mels=config["num_mels"],
                               fmin=config["fmin"],
                               fmax=config["fmax"])

        # make sure the audio length and feature length are matched
        audio = np.pad(audio, (0, config["fft_size"]), mode="reflect")
        audio = audio[:len(mel) * config["hop_size"]]
        assert len(mel) * config["hop_size"] == len(audio)

        # apply global gain; skip the utterance if it would clip
        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]
        if np.abs(audio).max() >= 1.0:
            # FIX: logging.warn is a deprecated alias of logging.warning
            logging.warning(f"{utt_id} causes clipping. "
                            f"it is better to re-consider global gain scale.")
            continue

        # save
        if config["format"] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                       "wave", audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                       "feats", mel.astype(np.float32))
        elif config["format"] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32), allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32), allow_pickle=False)
        else:
            raise ValueError("support only hdf5 or npy format.")
def main():
    """Run normalization: scale dumped raw features with precomputed statistics."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py).")
    parser.add_argument("--rootdir", default=None, type=str,
                        help="directory including feature files to be normalized. "
                             "you need to specify either *-scp or rootdir.")
    parser.add_argument("--wav-scp", default=None, type=str,
                        help="kaldi-style wav.scp file. "
                             "you need to specify either *-scp or rootdir.")
    parser.add_argument("--feats-scp", default=None, type=str,
                        help="kaldi-style feats.scp file. "
                             "you need to specify either *-scp or rootdir.")
    parser.add_argument("--segments", default=None, type=str,
                        help="kaldi-style segments file.")
    parser.add_argument("--dumpdir", type=str, required=True,
                        help="directory to dump normalized feature files.")
    parser.add_argument("--stats", type=str, required=True,
                        help="statistics file.")
    parser.add_argument("--skip-wav-copy", default=False, action="store_true",
                        help="whether to skip the copy of wav files.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--ftype", default='mel', type=str,
                        help="feature type")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('Skip DEBUG/INFO messages')

    # load config and merge the CLI arguments into it
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments: exactly one of --feats-scp / --rootdir must be given
    if (args.feats_scp is not None and args.rootdir is not None) or \
            (args.feats_scp is None and args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --feats-scp.")

    # check directory existence ("" means save next to the input files)
    if args.dumpdir != "":
        if not os.path.exists(args.dumpdir):
            os.makedirs(args.dumpdir, exist_ok=True)

    # get dataset; dataset1/dataset2 are only built in the
    # rootdir + --skip-wav-copy path
    dataset1 = dataset2 = None
    if args.rootdir is not None:
        if config["format"] == "hdf5":
            audio_query, mel_query = "*.h5", "*.h5"
            audio_load_fn = lambda x: read_hdf5(x, "wave")  # NOQA
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
            # FIX: spc_query/spc_load_fn were undefined in the hdf5 branch
            # (NameError when building the spc dataset).
            # NOTE(review): hdf5 dumps keep the features under "feats"
            # regardless of type, so reuse the mel query/loader — confirm.
            spc_query = "*.h5"
            spc_load_fn = mel_load_fn
        elif config["format"] == "npy":
            audio_query, mel_query, spc_query = "*.wav.npy", "*.mel.npy", "*.spec.npy"
            audio_load_fn = np.load
            mel_load_fn = np.load
            spc_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        if not args.skip_wav_copy:
            dataset = AudioMelDataset(
                root_dir=args.rootdir,
                audio_query=audio_query,
                mel_query=mel_query,
                audio_load_fn=audio_load_fn,
                mel_load_fn=mel_load_fn,
                return_utt_id=True,
            )
        else:
            dataset1 = MelDatasetNew(
                root_dir=args.rootdir,
                mel_query=mel_query,
                mel_load_fn=mel_load_fn,
                return_utt_id=True,
            )
            dataset2 = SpcDatasetNew(
                root_dir=args.rootdir,
                spc_query=spc_query,
                spc_load_fn=spc_load_fn,
                return_utt_id=True,
            )
    else:
        if not args.skip_wav_copy:
            dataset = AudioMelSCPDataset(
                wav_scp=args.wav_scp,
                feats_scp=args.feats_scp,
                segments=args.segments,
                return_utt_id=True,
            )
        else:
            dataset = MelSCPDataset(
                feats_scp=args.feats_scp,
                return_utt_id=True,
            )

    # FIX: dataset1/dataset2 were logged and selected unconditionally,
    # raising NameError in every path that did not define them.
    if dataset1 is not None:
        logging.info(f"The number of files in mel dataset = {len(dataset1)}.")
        logging.info(f"The number of files in spc dataset = {len(dataset2)}.")
        if args.ftype == 'mel':
            dataset = dataset1
        elif args.ftype == 'spec':
            dataset = dataset2
        else:
            raise ValueError(f"unsupported --ftype {args.ftype}.")
    else:
        logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler
    scaler = StandardScaler()
    if config["format"] == "hdf5":
        scaler.mean_ = read_hdf5(args.stats, "mean")
        scaler.scale_ = read_hdf5(args.stats, "scale")
    elif config["format"] == "npy":
        scaler.mean_ = np.load(args.stats)[0]
        scaler.scale_ = np.load(args.stats)[1]
    else:
        raise ValueError("support only hdf5 or npy format.")
    # from version 0.23.0, this information is needed
    scaler.n_features_in_ = scaler.mean_.shape[0]

    # process each file
    for items in tqdm(dataset):
        if not args.skip_wav_copy:
            utt_id, audio, feat = items
        else:
            utt_id, feat, feat_file = items

        # normalize; identical to (feat - scaler.mean_) / scaler.scale_
        feat = scaler.transform(feat)

        # save
        if config["format"] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                       "feats", feat.astype(np.float32))
            if not args.skip_wav_copy:
                write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                           "wave", audio.astype(np.float32))
        elif config["format"] == "npy":
            if args.dumpdir == "":
                # save next to the source file as <name>-norm.npy
                feat_file = feat_file.replace('.npy', '')
                np.save((feat_file + "-norm.npy"),
                        feat.astype(np.float32), allow_pickle=False)
                if not args.skip_wav_copy:
                    # FIX: the message named a nonexistent flag --skip_wav_copy
                    print("Please include --skip-wav-copy in arguments")
            else:
                np.save(os.path.join(args.dumpdir, f"{utt_id}.npy"),
                        feat.astype(np.float32), allow_pickle=False)
                if not args.skip_wav_copy:
                    np.save(os.path.join(args.dumpdir, f"{utt_id}.wav.npy"),
                            audio.astype(np.float32), allow_pickle=False)
        else:
            raise ValueError("support only hdf5 or npy format.")
def main():
    """Run the normalization process: scale dumped features with saved stats."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory including feature files to be normalized. "
        "you need to specify either *-scp or rootdir.",
    )
    parser.add_argument(
        "--wav-scp",
        default=None,
        type=str,
        help="kaldi-style wav.scp file. "
        "you need to specify either *-scp or rootdir.",
    )
    parser.add_argument(
        "--feats-scp",
        default=None,
        type=str,
        help="kaldi-style feats.scp file. "
        "you need to specify either *-scp or rootdir.",
    )
    parser.add_argument(
        "--segments",
        default=None,
        type=str,
        help="kaldi-style segments file.",
    )
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.",
    )
    parser.add_argument(
        "--stats",
        type=str,
        required=True,
        help="statistics file.",
    )
    parser.add_argument(
        "--skip-wav-copy",
        default=False,
        action="store_true",
        help="whether to skip the copy of wav files.",
    )
    parser.add_argument(
        "--config", type=str, required=True, help="yaml format configuration file."
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger verbosity from --verbose (2+: DEBUG, 1: INFO, 0: WARN)
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # load config; CLI arguments override/extend the YAML entries
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments: exactly one of --feats-scp / --rootdir must be given
    if (args.feats_scp is not None and args.rootdir is not None) or (
        args.feats_scp is None and args.rootdir is None
    ):
        raise ValueError("Please specify either --rootdir or --feats-scp.")

    # check directory existence
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir)

    # get dataset: rootdir scan (hdf5 or npy dumps) vs kaldi-style scp files;
    # --skip-wav-copy selects a mel-only dataset instead of audio+mel pairs
    if args.rootdir is not None:
        if config["format"] == "hdf5":
            audio_query, mel_query = "*.h5", "*.h5"
            audio_load_fn = lambda x: read_hdf5(x, "wave")  # NOQA
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            audio_query, mel_query = "*-wave.npy", "*-feats.npy"
            audio_load_fn = np.load
            mel_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        if not args.skip_wav_copy:
            dataset = AudioMelDataset(
                root_dir=args.rootdir,
                audio_query=audio_query,
                mel_query=mel_query,
                audio_load_fn=audio_load_fn,
                mel_load_fn=mel_load_fn,
                return_utt_id=True,
            )
        else:
            dataset = MelDataset(
                root_dir=args.rootdir,
                mel_query=mel_query,
                mel_load_fn=mel_load_fn,
                return_utt_id=True,
            )
    else:
        if not args.skip_wav_copy:
            dataset = AudioMelSCPDataset(
                wav_scp=args.wav_scp,
                feats_scp=args.feats_scp,
                segments=args.segments,
                return_utt_id=True,
            )
        else:
            dataset = MelSCPDataset(
                feats_scp=args.feats_scp,
                return_utt_id=True,
            )
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler from the saved statistics file
    scaler = StandardScaler()
    if config["format"] == "hdf5":
        scaler.mean_ = read_hdf5(args.stats, "mean")
        scaler.scale_ = read_hdf5(args.stats, "scale")
    elif config["format"] == "npy":
        # row 0 = mean, row 1 = scale (see the stats-dump script)
        scaler.mean_ = np.load(args.stats)[0]
        scaler.scale_ = np.load(args.stats)[1]
    else:
        raise ValueError("support only hdf5 or npy format.")
    # from version 0.23.0, this information is needed
    scaler.n_features_in_ = scaler.mean_.shape[0]

    # process each file: normalize mel and dump (optionally copying the wave)
    for items in tqdm(dataset):
        if not args.skip_wav_copy:
            utt_id, audio, mel = items
        else:
            utt_id, mel = items

        # normalize
        mel = scaler.transform(mel)

        # save
        if config["format"] == "hdf5":
            write_hdf5(
                os.path.join(args.dumpdir, f"{utt_id}.h5"),
                "feats",
                mel.astype(np.float32),
            )
            if not args.skip_wav_copy:
                write_hdf5(
                    os.path.join(args.dumpdir, f"{utt_id}.h5"),
                    "wave",
                    audio.astype(np.float32),
                )
        elif config["format"] == "npy":
            np.save(
                os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                mel.astype(np.float32),
                allow_pickle=False,
            )
            if not args.skip_wav_copy:
                np.save(
                    os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32),
                    allow_pickle=False,
                )
        else:
            raise ValueError("support only hdf5 or npy format.")
def main():
    """Compute per-dimension mean/scale statistics of dumped raw features."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features "
        "(See detail in parallel_wavegan/bin/compute_statistics.py).")
    parser.add_argument(
        "--feats-scp",
        "--scp",
        default=None,
        type=str,
        help="kaldi-style feats.scp file. "
        "you need to specify either feats-scp or rootdir.",
    )
    parser.add_argument(
        "--rootdir",
        type=str,
        help="directory including feature files. "
        "you need to specify either feats-scp or rootdir.",
    )
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="yaml format configuration file.",
    )
    # NOTE(review): required=True contradicts the "if not provided" wording
    # in the help text — the fallback path is unreachable; confirm intent.
    parser.add_argument(
        "--dumpdir",
        default=None,
        type=str,
        required=True,
        help="directory to save statistics. if not provided, "
        "stats will be saved in the above root directory. (default=None)",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger verbosity from --verbose (2+: DEBUG, 1: INFO, 0: WARN)
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # load config; CLI arguments override/extend the YAML entries
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments: exactly one of --feats-scp / --rootdir must be given
    if (args.feats_scp is not None and args.rootdir is not None) or (args.feats_scp is None and args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --feats-scp.")

    # check directory existence
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir)

    # get dataset: scan rootdir for dumped features, or read a feats.scp
    if args.feats_scp is None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            mel_query = "*-feats.npy"
            mel_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        dataset = MelDataset(args.rootdir, mel_query=mel_query, mel_load_fn=mel_load_fn)
    else:
        dataset = MelSCPDataset(args.feats_scp)
    logging.info(f"The number of files = {len(dataset)}.")

    # calculate statistics incrementally (streaming, one utterance at a time)
    scaler = StandardScaler()
    for mel in tqdm(dataset):
        scaler.partial_fit(mel)

    # dump mean/scale; npy packs them as rows 0 and 1 of a single array
    if config["format"] == "hdf5":
        write_hdf5(
            os.path.join(args.dumpdir, "stats.h5"),
            "mean",
            scaler.mean_.astype(np.float32),
        )
        write_hdf5(
            os.path.join(args.dumpdir, "stats.h5"),
            "scale",
            scaler.scale_.astype(np.float32),
        )
    else:
        stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
        np.save(
            os.path.join(args.dumpdir, "stats.npy"),
            stats.astype(np.float32),
            allow_pickle=False,
        )
# Pack paired mel/audio .npy files into per-job HDF5 dump directories for the
# train/dev/eval splits expected by the parallel_wavegan recipe layout.
dataname = 'man1028'
data_dir = '/workspace/ssd3/train_pwg_pindao/1'
root_dir = '/workspace/pwg/egs/csmsc/voc1/dump_man1028/'
datasets = ['train_nodev', 'dev', 'eval']
mels_dir = f"{data_dir}/mels"
wave_dir = f"{data_dir}/audio"
dev_num = 50
eval_num = 50
num_split = 16

mels = glob(f"{mels_dir}/*.npy")
# tail of the list feeds dev and eval; everything before it is training data
mels_dataset = [
    mels[:len(mels) - dev_num - eval_num],
    mels[-dev_num - eval_num:-eval_num],
    mels[-eval_num:],
]

for dataset, split_mels in zip(datasets, mels_dataset):
    for idx, melnpy in tqdm(enumerate(split_mels)):
        # round-robin shard assignment over num_split dump.N sub-directories
        job = (idx % num_split) + 1
        dump_dir = f'{root_dir}/{dataset}/norm/dump.{job}'
        os.makedirs(dump_dir, exist_ok=True)
        # utterance id: dataname + everything after the first '-' in the stem
        utt_id = dataname + '_'.join(melnpy.split('.')[0].split('-')[1:])
        wavenpy = melnpy.replace('mels', 'audio').replace('mel', 'audio')
        mel = np.load(melnpy)
        audio = np.load(wavenpy)
        # frames-to-samples consistency check (hop size fixed at 300 here)
        assert audio.shape[0] == mel.shape[0] * 300
        h5_path = os.path.join(dump_dir, f"{utt_id}.h5")
        write_hdf5(h5_path, "wave", audio.astype(np.float32))
        write_hdf5(h5_path, "feats", mel.astype(np.float32))
# Convert a dumped feature .npy file (or every .npy in a directory) into a
# sibling <stem>.h5 file holding the array under the "feats" key.
import sys
sys.path.insert(0, '../../../')
import os
from parallel_wavegan.utils import write_hdf5
import numpy as np

inputnpy = sys.argv[1]
if os.path.isdir(inputnpy):
    # directory mode: convert every .npy file inside it
    for filebase in os.listdir(inputnpy):
        if not filebase.endswith('.npy'):
            continue
        dirname = inputnpy
        filename = filebase.split('.')[0]
        print(os.path.join(dirname, filebase))
        mel = np.load(os.path.join(dirname, filebase))
        # FIX: output name was the literal "(unknown).h5" while the computed
        # `filename` was never used — use the file stem instead.
        write_hdf5(os.path.join(dirname, f"{filename}.h5"),
                   "feats", mel.astype(np.float32))
else:
    # single-file mode: convert just the given .npy file
    dirname = os.path.dirname(inputnpy)
    filename = os.path.basename(inputnpy).split('.')[0]
    mel = np.load(inputnpy)
    # FIX: same "(unknown).h5" placeholder replaced with the file stem
    write_hdf5(os.path.join(dirname, f"{filename}.h5"),
               "feats", mel.astype(np.float32))
def main():
    """Run preprocessing: load audio, extract log-mel features, and dump them."""
    parser = argparse.ArgumentParser(
        description="Preprocess audio and extract features (see detail in parallel_wavegan/bin/preprocess.py ")
    parser.add_argument("--wav-scp", "--scp", default=None, type=str,
                        help="kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument("--segments", default=None, type=str,
                        help="kaldi-style segments file. if use you must specify both scp and segments.")
    parser.add_argument("--rootdir", default=None, type=str,
                        help="directory including wav files. you need to specify either scp or rootdir.")
    parser.add_argument("--dumpdir", type=str, required=True,
                        help="directory to dump feature files.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level. higher is more logging.")
    args = parser.parse_args()

    # setting logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('skip DEBUG/INFO messages')

    # loading config
    with open(args.config) as f:
        # FIX: Loader was the yaml.load *function*; it must be the Loader class
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # checking arguments: exactly one of --wav-scp / --rootdir must be given
    if (args.wav_scp is not None and args.rootdir is not None) or \
            (args.wav_scp is None and args.rootdir is None):
        raise ValueError("Please specify either --wav_scp or --rootdir")

    # getting dataset
    if args.rootdir is not None:
        dataset = AudioDataset(
            args.rootdir, "*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )
    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )

    # check directory existence
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir, exist_ok=True)

    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):
        # checking
        assert len(audio.shape) == 1, f"{utt_id} is multichannel signal."
        assert np.abs(audio).max() <= 1.0, f"{utt_id} is different from 16 bit PCM."
        assert fs == config['sampling_rate'], f"{utt_id} has different sampling rate."

        # trim silence
        # FIX: key was misspelled as 'trim_silence]'
        if config['trim_silence']:
            audio, _ = librosa.effects.trim(audio,
                                            top_db=config['trim_threshold_in_db'],
                                            frame_length=config['trim_frame_size'],
                                            hop_length=config['trim_hop_size'])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config['sampling_rate']
            hop_size = config['hop_size']
        else:
            # here we can train model with different sampling rate for feature and audio
            x = librosa.resample(audio, fs, config['sampling_rate_for_feats'])
            sampling_rate = config['sampling_rate_for_feats']
            assert config['hop_size'] * config['sampling_rate_for_feats'] % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            # FIX: key was misspelled as 'samping_rate_for_feats'
            hop_size = config['hop_size'] * config['sampling_rate_for_feats'] // fs

        # extracting feature
        # FIX: fmin was passed as a duplicate fmax keyword (a SyntaxError)
        mel = logmelfilterbank(x,
                               sampling_rate=sampling_rate,
                               hop_size=hop_size,
                               fft_size=config['fft_size'],
                               win_length=config['win_length'],
                               window=config['window'],
                               num_mels=config['num_mels'],
                               fmin=config['fmin'],
                               fmax=config['fmax'])

        # making sure the audio length and feature length are matched
        audio = np.pad(audio, (0, config['fft_size']), mode="edge")
        audio = audio[:len(mel) * config['hop_size']]
        assert len(mel) * config['hop_size'] == len(audio)

        # apply global gain; skip the utterance if it would clip
        if config['global_gain_scale'] > 0.0:
            audio *= config['global_gain_scale']
        if np.abs(audio).max() >= 1.0:
            # FIX: logging.warn is a deprecated alias of logging.warning
            logging.warning(f"{utt_id} causes clipping. "
                            f"it is better to reconsider global gain scale.")
            continue

        # save
        if config['format'] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                       "wave", audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                       "feats", mel.astype(np.float32))
        elif config['format'] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32), allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32), allow_pickle=False)
        else:
            raise ValueError('support only hdf5 or npy format.')
def main():
    """Compute per-dimension mean/scale statistics of dumped raw features."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features "
                    "(See detail in parallel_wavegan/bin/compute_statistics.py).")
    parser.add_argument("--feats-scp", "--scp", default=None, type=str,
                        help="kaldi-style feats.scp file. "
                             "you need to specify either feats-scp or rootdir.")
    # FIX: --rootdir was required=True, which made the "either feats-scp or
    # rootdir" check below impossible to satisfy via --feats-scp alone.
    parser.add_argument("--rootdir", default=None, type=str,
                        help="directory including feature files. "
                             "you need to specify either feats-scp or rootdir.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--dumpdir", default=None, type=str,
                        help="directory to save statistics. if not provided, "
                             "stats will be saved in the above root directory. (default=None)")
    parser.add_argument("--ftype", default='mel', type=str,
                        help="feature type")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('Skip DEBUG/INFO messages')

    # load config and merge the CLI arguments into it
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments: exactly one of --feats-scp / --rootdir must be given
    if (args.feats_scp is not None and args.rootdir is not None) or \
            (args.feats_scp is None and args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --feats-scp.")

    # check directory existence; default dumpdir derives from rootdir
    if args.dumpdir is None:
        # FIX: guard against deriving the dump directory from a missing rootdir
        if args.rootdir is None:
            raise ValueError("Please specify --dumpdir when using --feats-scp.")
        args.dumpdir = os.path.dirname(args.rootdir)
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir)

    # get dataset and select which feature stream to fit
    if args.feats_scp is None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
            # FIX: spc_query/spc_load_fn were undefined in the hdf5 branch
            # (NameError when building the spc dataset).
            # NOTE(review): hdf5 dumps keep the features under "feats"
            # regardless of type, so reuse the mel query/loader — confirm.
            spc_query = "*.h5"
            spc_load_fn = mel_load_fn
        elif config["format"] == "npy":
            mel_query = "*.mel.npy"
            mel_load_fn = np.load
            spc_query = "*.spec.npy"
            spc_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        dataset1 = MelDataset(
            args.rootdir,
            mel_query=mel_query,
            mel_load_fn=mel_load_fn)
        dataset2 = SpcDataset(
            args.rootdir,
            spc_query=spc_query,
            spc_load_fn=spc_load_fn)
        logging.info(f"The number of files in mel dataset = {len(dataset1)}.")
        logging.info(f"The number of files in spc dataset = {len(dataset2)}.")
        # FIX: previously dataset1/dataset2 were logged and iterated
        # unconditionally, raising NameError in the --feats-scp path
        # (where the MelSCPDataset was built but never used).
        if args.ftype == 'mel':
            dataset = dataset1
        elif args.ftype == 'spec':
            dataset = dataset2
        else:
            raise ValueError(f"unsupported --ftype {args.ftype}.")
    else:
        dataset = MelSCPDataset(args.feats_scp)
        logging.info(f"The number of files = {len(dataset)}.")

    # calculate statistics incrementally (streaming, one utterance at a time)
    scaler = StandardScaler()
    for feat in tqdm(dataset):
        scaler.partial_fit(feat)

    # dump mean/scale as <ftype>_mean_std.{h5,npy}
    if config["format"] == "hdf5":
        write_hdf5(os.path.join(args.dumpdir, "{}_mean_std.h5".format(args.ftype)),
                   "mean", scaler.mean_.astype(np.float32))
        write_hdf5(os.path.join(args.dumpdir, "{}_mean_std.h5".format(args.ftype)),
                   "scale", scaler.scale_.astype(np.float32))
    else:
        stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
        np.save(os.path.join(args.dumpdir, "{}_mean_std.npy".format(args.ftype)),
                stats.astype(np.float32), allow_pickle=False)
def main():
    """Compute per-dimension mean/scale statistics of dumped raw features."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features.")
    # NOTE(review): "Direcotry" typos below are user-visible help strings;
    # left untouched here since they are runtime text, not comments.
    parser.add_argument("--rootdir", default=None, type=str, required=True,
                        help="Direcotry including feature files.")
    parser.add_argument("--dumpdir", default=None, type=str,
                        help="Direcotry to save statistics.")
    parser.add_argument("--config", default="hparam.yml", type=str, required=True,
                        help="Yaml format configuration file.")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level (higher is more logging)")
    args = parser.parse_args()

    # set logger verbosity from --verbose (2+: DEBUG, 1: INFO, 0: WARN)
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('skip DEBUG/INFO messages')

    # load config; CLI arguments override/extend the YAML entries
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check directory existence; default dumpdir is rootdir's parent
    if args.dumpdir is None:
        args.dumpdir = os.path.dirname(args.rootdir)
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir)

    # get dataset of dumped mel features (hdf5 "feats" key or *-feats.npy)
    if config["format"] == "hdf5":
        mel_query = "*.h5"
        mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
    elif config["format"] == "npy":
        mel_query = "*-feats.npy"
        mel_load_fn = np.load
    else:
        raise ValueError("support only hdf5 or npy format.")
    dataset = MelDataset(args.rootdir,
                         mel_query=mel_query,
                         mel_load_fn=mel_load_fn)
    logging.info(f"the number of files = {len(dataset)}.")

    # calculate statistics incrementally (streaming, one utterance at a time)
    scaler = StandardScaler()
    for mel in tqdm(dataset):
        scaler.partial_fit(mel)

    # dump mean/scale; npy packs them as rows 0 and 1 of a single array
    if config["format"] == "hdf5":
        write_hdf5(os.path.join(args.dumpdir, "stats.h5"), "mean",
                   scaler.mean_.astype(np.float32))
        write_hdf5(os.path.join(args.dumpdir, "stats.h5"), "scale",
                   scaler.scale_.astype(np.float32))
    else:
        stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
        np.save(os.path.join(args.dumpdir, "stats.npy"),
                stats.astype(np.float32), allow_pickle=False)