def main():
    """Collect per-speaker angle features from a capped subset of a dataset.

    Parses the command line, loads the YAML config (overlaid with the parsed
    arguments), then iterates over the audio dataset. For at most
    ``--num_spk`` speakers and at most ``--num_utt`` utterances per speaker,
    the (optionally silence-trimmed and resampled) waveform is passed to
    ``get_angle``; its result is averaged over axis 0 and appended to that
    speaker's list. The resulting ``{spk_id: [array, ...]}`` dict is written
    to ``<dumpdir>/angles.npy``.

    Raises:
        ValueError: If both or neither of ``--rootdir`` and ``--wav-scp`` are
            given.
        AssertionError: If an utterance is not one-dimensional, has samples
            outside [-1, 1], has a sampling rate different from
            ``config["sampling_rate"]``, or if the hop size does not stay
            integral after rescaling to ``sampling_rate_for_feats``.

    """
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features "
        "(See detail in parallel_wavegan/bin/preprocess.py).")
    parser.add_argument(
        "--wav-scp",
        "--scp",
        default=None,
        type=str,
        help="kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--segments",
        default=None,
        type=str,
        help="kaldi-style segments file. if use, you must to specify both scp and segments.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory including wav files. you need to specify either scp or rootdir.")
    parser.add_argument("--dumpdir",
                        type=str,
                        required=True,
                        help="directory to dump feature files.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    parser.add_argument("--num_spk",
                        type=int,
                        default=10,
                        help="number of speakers ")
    parser.add_argument("--num_utt",
                        type=int,
                        default=20,
                        help="maximum number of utterances to keep per speaker.")
    args = parser.parse_args()

    # set logger
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG, format=log_format)
    elif args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # point --config at trusted files (or switch to yaml.SafeLoader if the
    # configs never rely on Python-specific tags).
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    # command-line values override same-named keys from the YAML file
    config.update(vars(args))

    # check arguments: exactly one of --rootdir / --wav-scp must be given
    if (args.wav_scp is None) == (args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --wav-scp.")

    # get dataset
    if args.rootdir is not None:
        # sf.read returns (audio, sampling_rate), which the loop below unpacks
        dataset = AudioDataset(
            args.rootdir,
            "*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )
    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )

    # make sure the output directory exists
    os.makedirs(args.dumpdir, exist_ok=True)

    # spk_id -> list of axis-0-averaged get_angle outputs
    phase_map = {}

    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):
        # check
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be multi-channel signal."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        assert fs == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."

        spk_id = get_spk_id(utt_id)
        # stop entirely once a new speaker would exceed the speaker budget
        if spk_id not in phase_map and len(phase_map) >= args.num_spk:
            break
        # skip utterances of speakers that already reached the per-speaker cap
        if spk_id in phase_map and len(phase_map[spk_id]) >= args.num_utt:
            continue

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config["sampling_rate"]
            hop_size = config["hop_size"]
        else:
            # NOTE(kan-bayashi): this procedure enables to train the model with different
            #   sampling rate for feature and audio, e.g., training with mel extracted
            #   using 16 kHz audio and 24 kHz audio as a target waveform
            sampling_rate = config["sampling_rate_for_feats"]
            logging.debug("sampling_rate_for_feats: %d", sampling_rate)
            # keyword arguments work on both old and new librosa releases
            x = librosa.resample(audio, orig_sr=fs, target_sr=sampling_rate)
            assert config["hop_size"] * sampling_rate % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config["hop_size"] * sampling_rate // fs

        # extract feature (get_angle is defined elsewhere in the project)
        phase = get_angle(x,
                          sampling_rate=sampling_rate,
                          hop_size=hop_size,
                          fft_size=config["fft_size"],
                          win_length=config["win_length"],
                          window=config["window"],
                          num_mels=config["num_mels"],
                          fmin=config["fmin"],
                          fmax=config["fmax"])
        # collapse axis 0 -- presumably the frame axis; confirm against get_angle
        phase = np.mean(phase, axis=0)
        phase_map.setdefault(spk_id, []).append(phase)

    # np.save pickles the dict into a 0-d object array; read it back with
    # np.load(path, allow_pickle=True).item()
    np.save(os.path.join(args.dumpdir, "angles.npy"), phase_map)
def main():
    """Run preprocessing process.

    Parses the command line, loads the YAML config (overlaid with the parsed
    arguments) and, for every utterance in the dataset, optionally trims
    silence, optionally resamples the audio used for feature extraction,
    computes features with ``logmelfilterbank`` and trims the padded waveform
    so that ``len(audio) == len(mel) * hop_size``.

    Output per utterance, depending on ``config["format"]``:

    * ``"hdf5"``: ``<dumpdir>/<utt_id>.h5`` written via ``write_hdf5`` under
      the names ``"wave"`` and ``"feats"``.
    * ``"npy"``: ``<dumpdir>/<utt_id>-wave.npy`` and
      ``<dumpdir>/<utt_id>-feats.npy``.

    Utterances whose absolute peak reaches 1.0 (after the optional global
    gain) are skipped with a warning.

    Raises:
        ValueError: If both or neither of ``--rootdir`` and ``--wav-scp`` are
            given, or if ``config["format"]`` is neither ``hdf5`` nor ``npy``.
        AssertionError: If an utterance is not one-dimensional, has samples
            outside [-1, 1], has a sampling rate different from
            ``config["sampling_rate"]``, if the rescaled hop size is not
            integral, or if audio and feature lengths cannot be matched.

    """
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features "
        "(See detail in parallel_wavegan/bin/preprocess.py).")
    parser.add_argument(
        "--wav-scp",
        "--scp",
        default=None,
        type=str,
        help="kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--segments",
        default=None,
        type=str,
        help="kaldi-style segments file. if use, you must to specify both scp and segments.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory including wav files. you need to specify either scp or rootdir.")
    parser.add_argument("--dumpdir",
                        type=str,
                        required=True,
                        help="directory to dump feature files.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG, format=log_format)
    elif args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # point --config at trusted files (or switch to yaml.SafeLoader if the
    # configs never rely on Python-specific tags).
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    # command-line values override same-named keys from the YAML file
    config.update(vars(args))

    # check arguments: exactly one of --rootdir / --wav-scp must be given
    if (args.wav_scp is None) == (args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --wav-scp.")

    # get dataset
    if args.rootdir is not None:
        # sf.read returns (audio, sampling_rate), which the loop below unpacks
        dataset = AudioDataset(
            args.rootdir,
            "*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )
    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )

    # make sure the output directory exists
    os.makedirs(args.dumpdir, exist_ok=True)

    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):
        # check
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be multi-channel signal."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        assert fs == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config["sampling_rate"]
            hop_size = config["hop_size"]
        else:
            # NOTE(kan-bayashi): this procedure enables to train the model with different
            #   sampling rate for feature and audio, e.g., training with mel extracted
            #   using 16 kHz audio and 24 kHz audio as a target waveform
            sampling_rate = config["sampling_rate_for_feats"]
            # keyword arguments work on both old and new librosa releases
            x = librosa.resample(audio, orig_sr=fs, target_sr=sampling_rate)
            assert config["hop_size"] * sampling_rate % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config["hop_size"] * sampling_rate // fs

        # extract feature
        mel = logmelfilterbank(x,
                               sampling_rate=sampling_rate,
                               hop_size=hop_size,
                               fft_size=config["fft_size"],
                               win_length=config["win_length"],
                               window=config["window"],
                               num_mels=config["num_mels"],
                               fmin=config["fmin"],
                               fmax=config["fmax"])

        # make sure the audio length and feature length are matched:
        # pad the tail, then cut to exactly len(mel) hops of the target audio
        audio = np.pad(audio, (0, config["fft_size"]), mode="reflect")
        audio = audio[:len(mel) * config["hop_size"]]
        assert len(mel) * config["hop_size"] == len(audio)

        # apply global gain
        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]
        if np.abs(audio).max() >= 1.0:
            # skip utterances that would clip instead of saving them
            logging.warning(f"{utt_id} causes clipping. "
                            f"it is better to re-consider global gain scale.")
            continue

        # save
        if config["format"] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "wave",
                       audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "feats",
                       mel.astype(np.float32))
        elif config["format"] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32),
                    allow_pickle=False)
        else:
            raise ValueError("support only hdf5 or npy format.")
def main():
    """Run preprocessing process.

    Parses the command line, loads the YAML config (overlaid with the parsed
    arguments) and, for every utterance in the dataset, optionally trims
    silence, optionally resamples the audio used for feature extraction,
    computes features with ``logmelfilterbank`` and trims the padded waveform
    so that ``len(audio) == len(mel) * hop_size``.

    Output per utterance, depending on ``config['format']``:

    * ``"hdf5"``: ``<dumpdir>/<utt_id>.h5`` written via ``write_hdf5`` under
      the names ``"wave"`` and ``"feats"``.
    * ``"npy"``: ``<dumpdir>/<utt_id>-wave.npy`` and
      ``<dumpdir>/<utt_id>-feats.npy``.

    Utterances whose absolute peak reaches 1.0 (after the optional global
    gain) are skipped with a warning.

    Raises:
        ValueError: If both or neither of ``--rootdir`` and ``--wav-scp`` are
            given, or if ``config['format']`` is neither ``hdf5`` nor ``npy``.
        AssertionError: If an utterance is not one-dimensional, has samples
            outside [-1, 1], has a sampling rate different from
            ``config['sampling_rate']``, if the rescaled hop size is not
            integral, or if audio and feature lengths cannot be matched.

    """
    parser = argparse.ArgumentParser(
        description="Preprocess audio and extract features "
        "(see detail in parallel_wavegan/bin/preprocess.py).")
    parser.add_argument(
        "--wav-scp", "--scp", default=None, type=str,
        help="kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--segments", default=None, type=str,
        help="kaldi-style segments file. if use you must specify both scp and segments.")
    parser.add_argument(
        "--rootdir", default=None, type=str,
        help="directory including wav files. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--dumpdir", type=str, required=True,
        help="directory to dump feature files.")
    parser.add_argument(
        "--config", type=str, required=True,
        help="yaml format configuration file.")
    parser.add_argument(
        "--verbose", type=int, default=1,
        help="logging level. higher is more logging.")
    args = parser.parse_args()

    # setting logger
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG, format=log_format)
    elif args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning('skip DEBUG/INFO messages')

    # loading config
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # point --config at trusted files (or switch to yaml.SafeLoader if the
    # configs never rely on Python-specific tags).
    with open(args.config) as f:
        # Loader must be a loader class; the yaml.load function is not one
        config = yaml.load(f, Loader=yaml.Loader)
    # command-line values override same-named keys from the YAML file
    config.update(vars(args))

    # checking arguments: exactly one of --rootdir / --wav-scp must be given
    if (args.wav_scp is None) == (args.rootdir is None):
        raise ValueError("Please specify either --wav-scp or --rootdir")

    # getting dataset
    if args.rootdir is not None:
        # sf.read returns (audio, sampling_rate), which the loop below unpacks
        dataset = AudioDataset(
            args.rootdir,
            "*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )
    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )

    # make sure the output directory exists
    os.makedirs(args.dumpdir, exist_ok=True)

    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):
        # checking
        assert len(audio.shape) == 1, f"{utt_id} is multichannel signal."
        assert np.abs(audio).max() <= 1.0, f"{utt_id} is different from 16 bit PCM."
        assert fs == config['sampling_rate'], f"{utt_id} has different sampling rate."

        # trim silence
        if config['trim_silence']:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config['trim_threshold_in_db'],
                frame_length=config['trim_frame_size'],
                hop_length=config['trim_hop_size'])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config['sampling_rate']
            hop_size = config['hop_size']
        else:
            # features may be extracted at a different sampling rate than the
            # target waveform; the hop size is rescaled to keep frames aligned
            sampling_rate = config['sampling_rate_for_feats']
            # keyword arguments work on both old and new librosa releases
            x = librosa.resample(audio, orig_sr=fs, target_sr=sampling_rate)
            assert config['hop_size'] * sampling_rate % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config['hop_size'] * sampling_rate // fs

        # extracting feature
        mel = logmelfilterbank(x,
                               sampling_rate=sampling_rate,
                               hop_size=hop_size,
                               fft_size=config['fft_size'],
                               win_length=config['win_length'],
                               window=config['window'],
                               num_mels=config['num_mels'],
                               fmin=config['fmin'],
                               fmax=config['fmax'])

        # making sure the audio length and feature length are matched:
        # pad the tail, then cut to exactly len(mel) hops of the target audio
        # NOTE(review): pads with mode="edge"; confirm this is intended rather
        # than the "reflect" padding used by the upstream preprocessing script.
        audio = np.pad(audio, (0, config['fft_size']), mode="edge")
        audio = audio[:len(mel) * config['hop_size']]
        assert len(mel) * config['hop_size'] == len(audio)

        # apply global gain
        if config['global_gain_scale'] > 0.0:
            audio *= config['global_gain_scale']
        if np.abs(audio).max() >= 1.0:
            # skip utterances that would clip instead of saving them
            logging.warning(f"{utt_id} causes clipping. "
                            f"it is better to reconsider global gain scale.")
            continue

        # save
        if config['format'] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "wave",
                       audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "feats",
                       mel.astype(np.float32))
        elif config['format'] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32),
                    allow_pickle=False)
        else:
            raise ValueError('support only hdf5 or npy format.')