コード例 #1
0
def main():
    """Run preprocessing process.

    Parses the command line, loads the YAML configuration, iterates over the
    selected audio dataset and collects, per speaker, the axis-0 mean of the
    array returned by ``get_angle`` for a limited number of utterances.  The
    resulting ``{spk_id: [feature, ...]}`` mapping is written to
    ``<dumpdir>/angles.npy``.

    Raises:
        ValueError: If both or neither of ``--rootdir`` / ``--wav-scp`` are
            given.
        AssertionError: If an utterance is not 1-D, has a peak amplitude
            above 1.0, has a sampling rate different from the configured one,
            or if the rescaled hop size is not an integer.

    """
    parser = argparse.ArgumentParser(
        description=
        "Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)."
    )
    parser.add_argument(
        "--wav-scp",
        "--scp",
        default=None,
        type=str,
        help=
        "kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--segments",
        default=None,
        type=str,
        help=
        "kaldi-style segments file. if use, you must to specify both scp and segments."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help=
        "directory including wav files. you need to specify either scp or rootdir."
    )
    parser.add_argument("--dumpdir",
                        type=str,
                        required=True,
                        help="directory to dump feature files.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    parser.add_argument("--num_spk",
                        type=int,
                        default=10,
                        help="number of speakers ")
    parser.add_argument("--num_utt",
                        type=int,
                        default=20,
                        help="number of utterances per speaker ")
    args = parser.parse_args()

    # set logger: only the level depends on --verbose, the format is shared
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(level=log_level, format=log_format)
    if args.verbose <= 0:
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    # NOTE: yaml.Loader can construct arbitrary Python objects; only pass
    #   configuration files from a trusted source.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    # command line values override / extend the values read from the file
    config.update(vars(args))

    # check arguments: exactly one of --wav-scp / --rootdir must be given
    if (args.wav_scp is None) == (args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --wav-scp.")

    # get dataset
    if args.rootdir is not None:
        dataset = AudioDataset(
            args.rootdir,
            "*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )
    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )

    # make sure the output directory exists (no-op when it already does)
    os.makedirs(args.dumpdir, exist_ok=True)

    # speaker id -> list of per-utterance features
    phase_map = {}
    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):
        # check
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be multi-channel signal."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        assert fs == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."

        spk_id = get_spk_id(utt_id)
        if spk_id not in phase_map and len(phase_map) >= args.num_spk:
            # NOTE(review): processing stops at the first unseen speaker once
            #   the speaker quota is reached; this presumably relies on the
            #   dataset being grouped by speaker -- confirm.
            break
        if spk_id in phase_map and len(phase_map[spk_id]) >= args.num_utt:
            # this speaker already has --num_utt utterances
            continue

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config["sampling_rate"]
            hop_size = config["hop_size"]
        else:
            logging.info("sampling_rate_for_feats: %d",
                         config["sampling_rate_for_feats"])
            # NOTE(kan-bayashi): this procedure enables to train the model with different
            #   sampling rate for feature and audio, e.g., training with mel extracted
            #   using 16 kHz audio and 24 kHz audio as a target waveform
            # keyword arguments are required by recent librosa releases
            x = librosa.resample(audio,
                                 orig_sr=fs,
                                 target_sr=config["sampling_rate_for_feats"])
            sampling_rate = config["sampling_rate_for_feats"]
            assert config["hop_size"] * config["sampling_rate_for_feats"] % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config["hop_size"] * config[
                "sampling_rate_for_feats"] // fs

        # extract feature
        phase = get_angle(x,
                          sampling_rate=sampling_rate,
                          hop_size=hop_size,
                          fft_size=config["fft_size"],
                          win_length=config["win_length"],
                          window=config["window"],
                          num_mels=config["num_mels"],
                          fmin=config["fmin"],
                          fmax=config["fmax"])
        # collapse axis 0 so that every utterance contributes one vector
        phase = np.mean(phase, axis=0)
        phase_map.setdefault(spk_id, []).append(phase)

    # NOTE: a dict is stored by np.save as a pickled object array, so it has
    #   to be read back with np.load(..., allow_pickle=True).
    np.save(os.path.join(args.dumpdir, "angles.npy"), phase_map)
コード例 #2
0
def main():
    """Run preprocessing process.

    Parses the command line, loads the YAML configuration, and for every
    utterance of the selected dataset: validates the signal, optionally trims
    silence, extracts a feature matrix with ``logmelfilterbank``, aligns the
    waveform length to ``len(mel) * hop_size``, applies the global gain and
    writes waveform and features to ``--dumpdir`` in hdf5 or npy format.
    Utterances whose peak amplitude reaches 1.0 after the gain are skipped
    with a warning.

    Raises:
        ValueError: If both or neither of ``--rootdir`` / ``--wav-scp`` are
            given, or if ``config["format"]`` is neither "hdf5" nor "npy".
        AssertionError: If an utterance is not 1-D, has a peak amplitude
            above 1.0, has a sampling rate different from the configured one,
            or if the rescaled hop size is not an integer.

    """
    parser = argparse.ArgumentParser(
        description=
        "Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)."
    )
    parser.add_argument(
        "--wav-scp",
        "--scp",
        default=None,
        type=str,
        help=
        "kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--segments",
        default=None,
        type=str,
        help=
        "kaldi-style segments file. if use, you must to specify both scp and segments."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help=
        "directory including wav files. you need to specify either scp or rootdir."
    )
    parser.add_argument("--dumpdir",
                        type=str,
                        required=True,
                        help="directory to dump feature files.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger: only the level depends on --verbose, the format is shared
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(level=log_level, format=log_format)
    if args.verbose <= 0:
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    # NOTE: yaml.Loader can construct arbitrary Python objects; only pass
    #   configuration files from a trusted source.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    # command line values override / extend the values read from the file
    config.update(vars(args))

    # check arguments: exactly one of --wav-scp / --rootdir must be given
    if (args.wav_scp is None) == (args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --wav-scp.")

    # get dataset
    if args.rootdir is not None:
        dataset = AudioDataset(
            args.rootdir,
            "*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )
    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )

    # make sure the output directory exists (no-op when it already does)
    os.makedirs(args.dumpdir, exist_ok=True)

    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):
        # check
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be multi-channel signal."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        assert fs == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config["sampling_rate"]
            hop_size = config["hop_size"]
        else:
            # NOTE(kan-bayashi): this procedure enables to train the model with different
            #   sampling rate for feature and audio, e.g., training with mel extracted
            #   using 16 kHz audio and 24 kHz audio as a target waveform
            # keyword arguments are required by recent librosa releases
            x = librosa.resample(audio,
                                 orig_sr=fs,
                                 target_sr=config["sampling_rate_for_feats"])
            sampling_rate = config["sampling_rate_for_feats"]
            assert config["hop_size"] * config["sampling_rate_for_feats"] % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config["hop_size"] * config[
                "sampling_rate_for_feats"] // fs

        # extract feature
        mel = logmelfilterbank(x,
                               sampling_rate=sampling_rate,
                               hop_size=hop_size,
                               fft_size=config["fft_size"],
                               win_length=config["win_length"],
                               window=config["window"],
                               num_mels=config["num_mels"],
                               fmin=config["fmin"],
                               fmax=config["fmax"])

        # make sure the audio length and feature length are matched:
        # pad first so that the slice below can always reach the target length
        audio = np.pad(audio, (0, config["fft_size"]), mode="reflect")
        audio = audio[:len(mel) * config["hop_size"]]
        assert len(mel) * config["hop_size"] == len(audio)

        # apply global gain
        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]
        if np.abs(audio).max() >= 1.0:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning(f"{utt_id} causes clipping. "
                            f"it is better to re-consider global gain scale.")
            continue

        # save
        if config["format"] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "wave",
                       audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "feats",
                       mel.astype(np.float32))
        elif config["format"] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32),
                    allow_pickle=False)
        else:
            raise ValueError("support only hdf5 or npy format.")
コード例 #3
0
ファイル: preprocess.py プロジェクト: MohamedAazizfcb/CRA
def main():
    """Preprocess audio files and dump waveform / feature pairs.

    Parses the command line, loads the YAML configuration, and for every
    utterance of the selected dataset: validates the signal, optionally trims
    silence, extracts a feature matrix with ``logmelfilterbank``, aligns the
    waveform length to ``len(mel) * hop_size``, applies the global gain and
    writes waveform and features to ``--dumpdir`` in hdf5 or npy format.
    Utterances whose peak amplitude reaches 1.0 after the gain are skipped
    with a warning.

    Raises:
        ValueError: If both or neither of ``--rootdir`` / ``--wav-scp`` are
            given, or if ``config['format']`` is neither "hdf5" nor "npy".
        AssertionError: If an utterance is not 1-D, has a peak amplitude
            above 1.0, has a sampling rate different from the configured one,
            or if the rescaled hop size is not an integer.

    """
    parser = argparse.ArgumentParser(description="Preprocess audio and extract features (see detail in parallel_wavegan/bin/preprocess.py ")
    parser.add_argument("--wav-scp", "--scp", default=None, type=str,
                        help="kaldi-styke wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument("--segments", default=None, type=str,
                        help="kaldi-style segments file. if use you must specify both scp and segments.")
    parser.add_argument("--rootdir", default=None, type=str,
                        help="directory icluding wav files. you need to specify either scp or rootdir.")
    parser.add_argument("--dumpdir", type=str, required=True,
                        help="directory to dump feature files.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level. higher is more logging.")
    args = parser.parse_args()

    # setting logger: only the level depends on --verbose, the format is shared
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(level=log_level, format=log_format)
    if args.verbose <= 0:
        logging.warning('skip DEBUG/INFO messages')

    # loading config
    # NOTE: Loader must be a loader class (yaml.Loader), not the yaml.load
    #   function.  yaml.Loader can construct arbitrary Python objects; only
    #   pass configuration files from a trusted source.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    # command line values override / extend the values read from the file
    config.update(vars(args))

    # checking arguments: exactly one of --wav-scp / --rootdir must be given
    if (args.wav_scp is None) == (args.rootdir is None):
        raise ValueError("Please specify either --wav-scp or --rootdir")

    # getting dataset
    if args.rootdir is not None:
        dataset = AudioDataset(
            args.rootdir, "*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )
    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )

    # make sure the output directory exists (no-op when it already does)
    os.makedirs(args.dumpdir, exist_ok=True)

    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):

        # checking
        assert len(audio.shape) == 1, f"{utt_id} is multichannel signal."
        assert np.abs(audio).max() <= 1.0, f"{utt_id} is different from 16 bit PCM."
        assert fs == config['sampling_rate'], f"{utt_id} has different sampling rate."

        # trim silence
        if config['trim_silence']:
            audio, _ = librosa.effects.trim(audio,
                                            top_db=config['trim_threshold_in_db'],
                                            frame_length=config['trim_frame_size'],
                                            hop_length=config['trim_hop_size'])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config['sampling_rate']
            hop_size = config['hop_size']
        else:
            # here we can train model with different sampling rate for
            # feature and audio
            # keyword arguments are required by recent librosa releases
            x = librosa.resample(audio,
                                 orig_sr=fs,
                                 target_sr=config['sampling_rate_for_feats'])
            sampling_rate = config['sampling_rate_for_feats']
            assert config['hop_size'] * config['sampling_rate_for_feats'] % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config['hop_size'] * config['sampling_rate_for_feats'] // fs

        # extracting feature
        mel = logmelfilterbank(x,
                               sampling_rate=sampling_rate,
                               hop_size=hop_size,
                               fft_size=config['fft_size'],
                               win_length=config['win_length'],
                               window=config['window'],
                               num_mels=config['num_mels'],
                               fmin=config['fmin'],
                               fmax=config['fmax'])

        # making sure the audio length and feature length are matched:
        # pad first so that the slice below can always reach the target length
        audio = np.pad(audio, (0, config['fft_size']), mode="edge")
        audio = audio[:len(mel) * config['hop_size']]
        assert len(mel) * config['hop_size'] == len(audio)

        # apply global gain
        if config['global_gain_scale'] > 0.0:
            audio *= config['global_gain_scale']

        if np.abs(audio).max() >= 1.0:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning(f"{utt_id} causes clipping. "
                            f"it is better to reconsider global gain scale.")
            continue

        # save
        if config['format'] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "wave", audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "feats", mel.astype(np.float32))
        elif config['format'] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32), allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32), allow_pickle=False)
        else:
            raise ValueError('support only hdf5 or npy format.')