# Example no. 1
# 0
def melcepstrum_extract(wav_list, args):
    """Extract mel-cepstrum from each wav file and save it in HDF5 format.

    Args:
        wav_list (list): Paths of the wav files to process.
        args (Namespace): Settings. This function reads ``fs``,
            ``highpass_cutoff``, ``shiftms``, ``fftl``, ``mcep_dim``,
            ``mcep_alpha``, ``hdf5dir``, ``save_wav`` and ``wavdir``.

    The features are written as float32 to ``<hdf5dir>/<basename>.h5`` under
    the ``/mcep`` key. When ``highpass_cutoff`` is non-zero and ``save_wav``
    is set, the filtered waveform is additionally written to ``wavdir``.
    The process exits with status 1 as soon as a file whose sampling
    frequency differs from ``args.fs`` is found.

    """
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" %
                     (wav_name, i + 1, len(wav_list)))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")

        # check sampling frequency before spending time on filtering
        if fs != args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # apply low cut filter
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # extract features
        shiftl = int(args.shiftms * fs * 0.001)
        mcep = stft_mcep(x, args.fftl, shiftl, args.mcep_dim, args.mcep_alpha)

        # save to hdf5
        # NOTE: only the extension is swapped; str.replace would also rewrite
        # any ".wav" that appears in the middle of the file name
        basename = os.path.basename(wav_name)
        hdf5name = args.hdf5dir + "/" + os.path.splitext(basename)[0] + ".h5"
        write_hdf5(hdf5name, "/mcep", np.float32(mcep))

        # save the filtered wav file
        if args.highpass_cutoff != 0 and args.save_wav:
            # clip to the int16 range so that filter overshoot saturates
            # instead of wrapping around when cast to int16
            int16_info = np.iinfo(np.int16)
            wavfile.write(args.wavdir + "/" + basename, fs,
                          np.int16(np.clip(x, int16_info.min, int16_info.max)))
# Example no. 2
# 0
def world_feature_extract(wav_list, args):
    """Extract WORLD-based feature vectors and save them in HDF5 format.

    Args:
        wav_list (list): Paths of the wav files to process.
        args (Namespace): Settings. This function reads ``fs``, ``shiftms``,
            ``minf0``, ``maxf0``, ``fftl``, ``highpass_cutoff``,
            ``mcep_dim``, ``mcep_alpha``, ``hdf5dir``, ``save_wav`` and
            ``wavdir``.

    For each file the saved feature matrix is the concatenation, along
    axis 1, of ``[uv, low-pass filtered continuous F0, mcep, codeap]``. It is
    written to ``<hdf5dir>/<basename>.h5`` under the ``/world`` key. When
    ``highpass_cutoff`` is non-zero and ``save_wav`` is set, the filtered
    waveform is additionally written to ``wavdir``. The process exits with
    status 1 as soon as a file whose sampling frequency differs from
    ``args.fs`` is found.

    """
    # define feature extractor
    feature_extractor = FeatureExtractor(analyzer="world",
                                         fs=args.fs,
                                         shiftms=args.shiftms,
                                         minf0=args.minf0,
                                         maxf0=args.maxf0,
                                         fftl=args.fftl)

    # number of frames per second, used as the sampling rate of the F0 contour
    frame_rate = int(1.0 / (args.shiftms * 0.001))

    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" %
                     (wav_name, i + 1, len(wav_list)))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")

        # check sampling frequency before spending time on filtering
        if fs != args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # apply low cut filter
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # extract features
        f0, _, _ = feature_extractor.analyze(x)
        uv, cont_f0 = convert_continuos_f0(f0)
        cont_f0_lpf = low_pass_filter(cont_f0, frame_rate, cutoff=20)
        codeap = feature_extractor.codeap()
        mcep = feature_extractor.mcep(dim=args.mcep_dim, alpha=args.mcep_alpha)

        # concatenate (uv and F0 get a trailing axis to become column vectors)
        cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
        uv = np.expand_dims(uv, axis=-1)
        feats = np.concatenate([uv, cont_f0_lpf, mcep, codeap], axis=1)

        # save to hdf5
        # NOTE: only the extension is swapped; str.replace would also rewrite
        # any ".wav" that appears in the middle of the file name
        basename = os.path.basename(wav_name)
        hdf5name = args.hdf5dir + "/" + os.path.splitext(basename)[0] + ".h5"
        write_hdf5(hdf5name, "/world", feats)

        # save the filtered wav file
        if args.highpass_cutoff != 0 and args.save_wav:
            # clip to the int16 range so that filter overshoot saturates
            # instead of wrapping around when cast to int16
            int16_info = np.iinfo(np.int16)
            wavfile.write(args.wavdir + "/" + basename, fs,
                          np.int16(np.clip(x, int16_info.min, int16_info.max)))
def test_preprocessing(feature_type):
    """Run feature extraction, statistics and noise shaping on dummy data.

    Args:
        feature_type (str): Feature type handed to ``make_args``. "world" and
            "melspc" select their dedicated extractors; any other value uses
            the mel-cepstrum extractor. Noise shaping is skipped for
            "melspc".

    Five dummy wav files are created under ``tmp/wav``. The ``tmp`` directory
    is removed at the end, also when one of the steps fails, so that a failed
    run cannot leave stale files behind for the next one.

    """
    # make arguments
    args = make_args(feature_type=feature_type)

    try:
        # prepare dummy wav files
        wavdir = "tmp/wav"
        if not os.path.exists(wavdir):
            os.makedirs(wavdir)
        for i in range(5):
            make_dummy_wav(wavdir + "/%d.wav" % i, 8000, args.fs)

        # feature extract
        wav_list = find_files(wavdir, "*.wav")
        if not os.path.exists(args.wavdir):
            os.makedirs(args.wavdir)
        if args.feature_type == "world":
            world_feature_extract(wav_list, args)
        elif args.feature_type == "melspc":
            melspectrogram_extract(wav_list, args)
        else:
            melcepstrum_extract(wav_list, args)

        # calc_stats
        file_list = find_files(args.hdf5dir, "*.h5")
        calc_stats(file_list, args)

        # noise shaping
        if feature_type != "melspc":
            wav_list = find_files(args.wavdir, "*.wav")
            if not os.path.exists(args.outdir):
                os.makedirs(args.outdir)
            # compute the MLSA coefficients only if the stats file lacks them
            if not check_hdf5(args.stats, "/mlsa/coef"):
                avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
                if args.feature_type == "world":
                    # keep only the mel-cepstrum part of the world feature mean
                    avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
                mlsa_coef = convert_mcep_to_mlsa_coef(avg_mcep, args.mag,
                                                      args.mcep_alpha)
                write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
                write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)
            noise_shaping(wav_list, args)
    finally:
        # remove temporary files regardless of the outcome
        if os.path.exists("tmp"):
            shutil.rmtree("tmp")
def melspectrogram_extract(wav_list, args):
    """Extract log mel-spectrogram from each wav file and save it as HDF5.

    Args:
        wav_list (list): Paths of the wav files to process.
        args (Namespace): Settings. This function reads ``fs``,
            ``highpass_cutoff``, ``shiftms``, ``fftl``, ``mspc_dim``,
            ``fmin``, ``fmax``, ``hdf5dir``, ``save_wav`` and ``wavdir``.

    The waveform is scaled by 1 / 32768 before analysis. The magnitude
    (``power=1.0``) mel-spectrogram is floored at ``EPS``, converted with
    ``log10``, transposed, and written as float32 to
    ``<hdf5dir>/<basename>.h5`` under the ``/melspc`` key. When
    ``highpass_cutoff`` is non-zero and ``save_wav`` is set, the filtered
    waveform is additionally written to ``wavdir``. The process exits with
    status 1 as soon as a file whose sampling frequency differs from
    ``args.fs`` is found.

    """
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" %
                     (wav_name, i + 1, len(wav_list)))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")

        # check sampling frequency before spending time on filtering
        if fs != args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # apply low cut filter
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # extract features
        x_norm = x / (np.iinfo(np.int16).max + 1)
        shiftl = int(args.shiftms * fs * 0.001)
        # NOTE: the signal and sampling rate are passed by keyword because
        # recent librosa releases no longer accept them positionally
        mspc = librosa.feature.melspectrogram(
            y=x_norm,
            sr=fs,
            n_fft=args.fftl,
            hop_length=shiftl,
            n_mels=args.mspc_dim,
            fmin=args.fmin if args.fmin is not None else 0,
            fmax=args.fmax if args.fmax is not None else fs // 2,
            power=1.0)
        # floor at EPS to avoid log10(0)
        mspc = np.log10(np.maximum(EPS, mspc.T))

        # save to hdf5
        # NOTE: only the extension is swapped; str.replace would also rewrite
        # any ".wav" that appears in the middle of the file name
        basename = os.path.basename(wav_name)
        hdf5name = args.hdf5dir + "/" + os.path.splitext(basename)[0] + ".h5"
        write_hdf5(hdf5name, "/melspc", np.float32(mspc))

        # save the filtered wav file
        if args.highpass_cutoff != 0 and args.save_wav:
            # clip to the int16 range so that filter overshoot saturates
            # instead of wrapping around when cast to int16
            int16_info = np.iinfo(np.int16)
            wavfile.write(args.wavdir + "/" + basename, fs,
                          np.int16(np.clip(x, int16_info.min, int16_info.max)))
# Example no. 5
# 0
def calc_stats(file_list, args):
    """Compute feature mean/scale over all files and store them in HDF5.

    Args:
        file_list (list): Paths of the HDF5 feature files to accumulate.
        args (Namespace): Settings. This function reads ``feature_type``
            (used as the HDF5 key of the features) and ``stats`` (the file
            the statistics are written to).

    The statistics are written as float32 under ``/<feature_type>/mean`` and
    ``/<feature_type>/scale``.

    """
    feat_key = "/" + args.feature_type
    scaler = StandardScaler()

    # accumulate the statistics incrementally, one file at a time
    for idx, path in enumerate(file_list, start=1):
        logging.info("now processing %s (%d/%d)" %
                     (path, idx, len(file_list)))
        scaler.partial_fit(read_hdf5(path, feat_key))

    mean, scale = scaler.mean_, scaler.scale_

    # for world features the first dimension (uv term) is forced to
    # mean 0 / scale 1
    if args.feature_type == "world":
        mean[0], scale[0] = 0.0, 1.0

    # write to hdf5 (mean first, then scale)
    for stat_name, values in (("mean", mean), ("scale", scale)):
        write_hdf5(args.stats, feat_key + "/" + stat_name, np.float32(values))
def main():
    """Run noise shaping in parallel.

    Parses the command line options, makes sure the MLSA filter coefficients
    are present in the statistics file (computing them from the stored mean
    feature when they are missing), and applies ``noise_shaping`` to the
    input waveforms using up to ``--n_jobs`` worker processes.

    Raises:
        NotImplementedError: If ``--feature_type`` is "melspc". This is
            raised before anything is written to the statistics file or the
            output directory.

    """
    parser = argparse.ArgumentParser(
        description="making feature file argsurations.")

    parser.add_argument(
        "--waveforms", default=None,
        help="directory or list of filename of input wavfile")
    parser.add_argument(
        "--stats", default=None,
        help="filename of hdf5 format")
    parser.add_argument(
        "--outdir", default=None,
        help="directory to save preprocessed wav file")
    parser.add_argument(
        "--fs", default=16000,
        type=int, help="Sampling frequency")
    parser.add_argument(
        "--shiftms", default=5,
        type=float, help="Frame shift in msec")
    parser.add_argument(
        "--feature_type", default="world", choices=["world", "mcep", "melspc"],
        type=str, help="feature type")
    parser.add_argument(
        "--mcep_dim_start", default=2,
        type=int, help="Start index of mel cepstrum")
    parser.add_argument(
        "--mcep_dim_end", default=27,
        type=int, help="End index of mel cepstrum")
    parser.add_argument(
        "--mcep_alpha", default=0.41,
        type=float, help="Alpha of mel cepstrum")
    parser.add_argument(
        "--mag", default=0.5,
        type=float, help="magnification of noise shaping")
    parser.add_argument(
        "--verbose", default=1,
        type=int, help="log message level")
    parser.add_argument(
        '--n_jobs', default=10,
        type=int, help="number of parallel jobs")
    parser.add_argument(
        '--inv', default=False, type=strtobool,
        help="if True, inverse filtering will be performed")

    args = parser.parse_args()

    # these options default to None, which would only fail later with an
    # obscure TypeError; report a proper usage error instead
    for name in ("waveforms", "stats", "outdir"):
        if getattr(args, name) is None:
            parser.error("--%s is required." % name)

    # set log level
    if args.verbose == 1:
        log_level = logging.INFO
    elif args.verbose > 1:
        log_level = logging.DEBUG
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level,
                        format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S')
    if log_level == logging.WARNING:
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # reject unsupported feature types before touching any file, so that no
    # MLSA coefficients derived from mel-spectrogram means are written
    if args.feature_type == "melspc":
        # TODO(kan-bayashi): implement noise shaping using melspectrogram
        raise NotImplementedError("currently, support only world and mcep.")

    # read list
    if os.path.isdir(args.waveforms):
        file_list = sorted(find_files(args.waveforms, "*.wav"))
    else:
        file_list = read_txt(args.waveforms)
    logging.info("number of utterances = %d" % len(file_list))

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # divide list; empty chunks (n_jobs > number of files) are dropped so
    # that no idle worker process is spawned
    file_lists = np.array_split(file_list, args.n_jobs)
    file_lists = [f_list.tolist() for f_list in file_lists if len(f_list) > 0]

    # calculate MLSA coef and save it
    if not check_hdf5(args.stats, "/mlsa/coef"):
        avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
        if args.feature_type == "world":
            # keep only the mel-cepstrum part of the world feature mean
            avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
        mlsa_coef = convert_mcep_to_mlsa_coef(avg_mcep, args.mag, args.mcep_alpha)
        write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
        write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)

    # multi processing
    processes = []
    for f in file_lists:
        p = mp.Process(target=noise_shaping, args=(f, args,))
        p.start()
        processes.append(p)

    # wait for all process and report the ones that did not finish cleanly
    for p in processes:
        p.join()
        if p.exitcode != 0:
            logging.error("worker process %s exited with code %s." %
                          (p.name, p.exitcode))