Example #1
def compute_statistics():
    """Compute mean / std statistics of some features for later normalization."""
    config = parse_and_config()

    # find features files for the train split
    glob_fn = lambda x: glob.glob(
        os.path.join(config["rootdir"], "train", x, "*.npy"))
    glob_mel = glob_fn("raw-feats")
    glob_f0 = glob_fn("raw-f0")
    glob_energy = glob_fn("raw-energies")
    assert (
        len(glob_mel) == len(glob_f0) == len(glob_energy)
    ), "Features, f0 and energies have different files in training split."

    logging.info(f"Computing statistics for {len(glob_mel)} files.")
    # init scaler for multiple features
    scaler_mel = StandardScaler(copy=False)
    scaler_energy = StandardScaler(copy=False)
    scaler_f0 = StandardScaler(copy=False)

    for mel, f0, energy in tqdm(zip(glob_mel, glob_f0, glob_energy),
                                total=len(glob_mel)):
        # remove outliers
        energy = remove_outlier(np.load(energy))
        f0 = remove_outlier(np.load(f0))
        # partial fitting of scalers
        scaler_mel.partial_fit(np.load(mel))
        scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
        scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

    # save statistics to file
    logging.info("Saving computed statistics.")
    scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"),
                   (scaler_f0, "_f0")]
    save_statistics_to_file(scaler_list, config)
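
None of these examples define remove_outlier. A minimal sketch of a common IQR-based variant (an assumption; the repository's exact implementation may differ), which zeroes values outside 1.5x the interquartile range so that downstream code can filter on x != 0:

def remove_outlier(x):
    """Zero out values outside 1.5 * IQR (assumed convention)."""
    p25, p75 = np.percentile(x, [25, 75])
    lower = p25 - 1.5 * (p75 - p25)
    upper = p75 + 1.5 * (p75 - p25)
    out = np.copy(x)
    out[(x < lower) | (x > upper)] = 0.0
    return out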
Example #2
def gen_audio_features(item, config):
    """Generate audio features and transformations
    Args:
        item (Dict): dictionary containing the attributes to encode.
        config (Dict): configuration dictionary.
    Returns:
        (bool): keep this sample or not.
        mel (ndarray): mel matrix in np.float32.
        energy (ndarray): energy audio profile.
        f0 (ndarray): fundamental frequency.
        item (Dict): dictionary containing the updated attributes.
    """
    # get info from sample.
    audio = item["audio"]
    utt_id = item["utt_id"]
    rate = item["rate"]

    # check audio properties
    assert len(audio.shape) == 1, f"{utt_id} seems to be multi-channel signal."
    assert np.abs(audio).max() <= 1.0, \
        f"{utt_id} does not appear to be normalized 16-bit PCM (max abs > 1.0)."

    # check sample rate
    if rate != config["sampling_rate"]:
        logging.info(
            f"{utt_id} sampling rate is {rate}, not {config['sampling_rate']}; resampling."
        )
        audio = librosa.resample(audio, orig_sr=rate,
                                 target_sr=config["sampling_rate"])
        rate = config["sampling_rate"]  # keep rate in sync for the later feature resampling

    # trim silence
    if config["trim_silence"]:
        if "trim_mfa" in config and config["trim_mfa"]:
            _, item["text_ids"], audio = ph_based_trim(
                config,
                utt_id,
                item["text_ids"],
                item["raw_text"],
                audio,
                config["hop_size"],
            )
            if len(audio) < 1:
                # very short files can be trimmed away entirely if MFA did not
                # extract any tokens (for LibriTTS, maybe keep only longer files?)
                logging.warning(
                    f"{utt_id} contains only silence or MFA did not extract any tokens."
                )
                return False, None, None, None, item
        else:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"],
            )

    # resample audio if necessary
    if "sampling_rate_for_feats" in config:
        audio = librosa.resample(audio, rate,
                                 config["sampling_rate_for_feats"])
        sampling_rate = config["sampling_rate_for_feats"]
        assert (
            config["hop_size"] * config["sampling_rate_for_feats"] % rate == 0
        ), "'hop_size' must be 'int' value. Please check if 'sampling_rate_for_feats' is correct."
        hop_size = config["hop_size"] * config[
            "sampling_rate_for_feats"] // rate
    else:
        sampling_rate = config["sampling_rate"]
        hop_size = config["hop_size"]
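    # Worked example (assumed values): hop_size 300 at rate 24000 Hz with
    # sampling_rate_for_feats 16000 Hz gives 300 * 16000 // 24000 = 200, and
    # the modulo check passes since 300 * 16000 % 24000 == 0.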

    # get spectrogram
    D = librosa.stft(
        audio,
        n_fft=config["fft_size"],
        hop_length=hop_size,
        win_length=config["win_length"],
        window=config["window"],
        pad_mode="reflect",
    )
    S, _ = librosa.magphase(D)  # (#bins, #frames)

    # get mel basis
    fmin = 0 if config["fmin"] is None else config["fmin"]
    fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )
    mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T  # (#frames, #bins)
    mel_eos = np.zeros(shape=[1, np.shape(mel)[1]])  # (1, #bins), mel frame for the eos token
    mel = np.concatenate([mel, mel_eos], axis=0)  # (#frames + 1, #bins)

    # check audio and feature length
    audio_eos = np.zeros(shape=[hop_size])  # (hop_size,), audio for the eos token
    audio = np.concatenate([audio, audio_eos], axis=-1)
    audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
    audio = audio[:len(mel) * hop_size]
    assert len(mel) * hop_size == len(audio), f"{len(mel) * hop_size}, {len(audio)}"

    # extract raw pitch
    _f0, t = pw.dio(
        audio.astype(np.double),
        fs=sampling_rate,
        f0_ceil=fmax,
        frame_period=1000 * hop_size / sampling_rate,
    )
    f0 = pw.stonemask(audio.astype(np.double), _f0, t, sampling_rate)
    if len(f0) >= len(mel):
        f0 = f0[:len(mel)]
    else:
        f0 = np.pad(f0, (0, len(mel) - len(f0)))

    # extract energy
    energy = np.sqrt(np.sum(S**2, axis=0))
    energy = np.concatenate([energy, [0]], axis=-1)  # energy for the eos token
    assert len(mel) == len(f0) == len(
        energy), f"{len(mel)}, {len(f0)}, {len(energy)}"

    # apply global gain
    if config["global_gain_scale"] > 0.0:
        audio *= config["global_gain_scale"]
    if np.abs(audio).max() >= 1.0:
        logging.warning(
            f"{utt_id} causes clipping; consider lowering 'global_gain_scale'."
        )
    item["audio"] = audio
    item["mel"] = mel
    item["f0"] = remove_outlier(f0)
    item["energy"] = remove_outlier(energy)
    return True, mel, energy, f0, item
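
A minimal, hypothetical invocation; the config below contains only the keys gen_audio_features actually reads on this path, and the values are illustrative rather than project defaults:

example_config = {
    "sampling_rate": 22050,
    "trim_silence": False,
    "hop_size": 256,
    "fft_size": 1024,
    "win_length": 1024,
    "window": "hann",
    "num_mels": 80,
    "fmin": 80,
    "fmax": 7600,
    "global_gain_scale": 1.0,
}
item = {
    "utt_id": "demo",
    "rate": 22050,
    "audio": np.random.uniform(-0.5, 0.5, 22050).astype(np.float32),
}
keep, mel, energy, f0, item = gen_audio_features(item, example_config)
print(mel.shape, f0.shape, energy.shape)  # (#frames + 1, 80) and matching lengths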
Example #3
def preprocess():
    """Run preprocessing process and compute statistics for normalizing."""
    config = parse_and_config()

    dataset_processor = {
        "ljspeech": LJSpeechProcessor,
        "kss": KSSProcessor,
        "multispeaker": ExampleMultispeaker,
    }

    dataset_cleaner = {
        "ljspeech": "english_cleaners",
        "kss": "korean_cleaners",
        "multispeaker": None,
    }

    logging.info(f"Selected '{config['dataset']}' processor.")
    processor = dataset_processor[config["dataset"]](
        config["rootdir"], cleaner_names=dataset_cleaner[config["dataset"]])

    # check output directories
    build_dir = lambda x: [
        os.makedirs(os.path.join(config["outdir"], x, y), exist_ok=True)
        for y in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]
    ]
    build_dir("train")
    build_dir("valid")

    # build train test split
    if config["dataset"] == "multispeaker":
        train_split, valid_split, _, _ = train_test_split(
            processor.items,
            [i[-1] for i in processor.items],
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    else:
        train_split, valid_split = train_test_split(
            processor.items,
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    logging.info(f"Training items: {len(train_split)}")
    logging.info(f"Validation items: {len(valid_split)}")

    get_utt_id = lambda x: os.path.split(x[1])[-1].split(".")[0]
    train_utt_ids = [get_utt_id(x) for x in train_split]
    valid_utt_ids = [get_utt_id(x) for x in valid_split]

    # save train and valid utt_ids to track later
    np.save(os.path.join(config["outdir"], "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(config["outdir"], "valid_utt_ids.npy"), valid_utt_ids)

    # define map iterator
    def iterator_data(items_list):
        for item in items_list:
            yield processor.get_one_sample(item)

    train_iterator_data = iterator_data(train_split)
    valid_iterator_data = iterator_data(valid_split)

    p = Pool(config["n_cpus"])

    # preprocess train files and get statistics for normalizing
    partial_fn = partial(gen_audio_features, config=config)
    train_map = p.imap_unordered(
        partial_fn,
        tqdm(train_iterator_data,
             total=len(train_split),
             desc="[Preprocessing train]"),
        chunksize=10,
    )
    # init scaler for multiple features
    scaler_mel = StandardScaler(copy=False)
    scaler_energy = StandardScaler(copy=False)
    scaler_f0 = StandardScaler(copy=False)

    id_to_remove = []
    for result, mel, energy, f0, features in train_map:
        if not result:
            id_to_remove.append(features["utt_id"])
            continue
        save_features_to_file(features, "train", config)
        # remove outliers
        energy = remove_outlier(energy)
        f0 = remove_outlier(f0)
        # partial fitting of scalers
        if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
            id_to_remove.append(features["utt_id"])
            continue
        scaler_mel.partial_fit(mel)
        scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
        scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

    if len(id_to_remove) > 0:
        np.save(
            os.path.join(config["outdir"], "train_utt_ids.npy"),
            [i for i in train_utt_ids if i not in id_to_remove],
        )
        logging.info(
            f"Removed {len(id_to_remove)} files because of too many outliers or bad MFA extraction."
        )

    # save statistics to file
    logging.info("Saving computed statistics.")
    scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"),
                   (scaler_f0, "_f0")]
    save_statistics_to_file(scaler_list, config)

    # preprocess valid files
    partial_fn = partial(gen_audio_features, config=config)
    valid_map = p.imap_unordered(
        partial_fn,
        tqdm(valid_iterator_data,
             total=len(valid_split),
             desc="[Preprocessing valid]"),
        chunksize=10,
    )
    for *_, features in valid_map:
        save_features_to_file(features, "valid", config)
Example #4
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features "
                    "(See detail in tensorflow_tts/bin/compute_statistics.py).")
    parser.add_argument("--rootdir", type=str, required=True,
                        help="directory including feature files. ")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--outdir", default=None, type=str, required=True,
                        help="directory to save statistics.")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning("Skipping DEBUG/INFO messages.")

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check output directory existence (--outdir is required above)
    os.makedirs(args.outdir, exist_ok=True)

    # get dataset
    if config["format"] == "npy":
        mel_query = "*-raw-feats.npy"
        f0_query = "*-raw-f0.npy"
        energy_query = "*-raw-energy.npy"
        mel_load_fn = np.load
    else:
        raise ValueError("Only npy format is supported.")

    dataset = MelDataset(
        args.rootdir,
        mel_query=mel_query,
        mel_load_fn=mel_load_fn
    ).create(batch_size=1)

    # calculate statistics
    scaler = StandardScaler()
    for mel, mel_length in tqdm(dataset):
        mel = mel[0].numpy()
        scaler.partial_fit(mel)

    # save to file
    stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
    np.save(os.path.join(args.outdir, "stats.npy"), stats.astype(np.float32), allow_pickle=False)

    # calculate statistic of f0
    f0_dataset = AudioDataset(
        args.rootdir,
        audio_query=f0_query,
        audio_load_fn=np.load,
    ).create(batch_size=1)

    pitch_vecs = []
    for f0, f0_length in tqdm(f0_dataset):
        f0 = f0[0].numpy()  # [T]
        f0 = remove_outlier(f0)
        pitch_vecs.append(f0)
    nonzeros = np.concatenate([v[np.where(v != 0.0)[0]]
                               for v in pitch_vecs])
    mean, std = np.mean(nonzeros), np.std(nonzeros)

    # save to file
    stats = np.stack([mean, std], axis=0)
    np.save(os.path.join(args.outdir, "stats_f0.npy"), stats.astype(np.float32), allow_pickle=False)

    # calculate statistic of energy
    energy_dataset = AudioDataset(
        args.rootdir,
        audio_query=energy_query,
        audio_load_fn=np.load,
    ).create(batch_size=1)

    energy_vecs = []
    for e, e_length in tqdm(energy_dataset):
        e = e[0].numpy()
        e = remove_outlier(e)
        energy_vecs.append(e)
    nonzeros = np.concatenate([v[np.where(v != 0.0)[0]]
                               for v in energy_vecs])
    mean, std = np.mean(nonzeros), np.std(nonzeros)

    # save to file
    stats = np.stack([mean, std], axis=0)
    np.save(os.path.join(args.outdir, "stats_energy.npy"), stats.astype(np.float32), allow_pickle=False)
Example #5
def _norm_mean_std(self, x, mean, std):
    """Standardize x, keeping zero entries (unvoiced/outlier frames) at zero."""
    x = remove_outlier(x)
    zero_idxs = np.where(x == 0.0)[0]
    x = (x - mean) / std
    x[zero_idxs] = 0.0
    return x
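
A hypothetical call, pairing _norm_mean_std with the scalar f0 statistics written by Example #4 (dataset stands for the owning instance); zeros marking unvoiced frames or removed outliers survive normalization unchanged:

mean_f0, std_f0 = np.load(os.path.join(outdir, "stats_f0.npy"))
f0_norm = dataset._norm_mean_std(f0, mean_f0, std_f0)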
Example #6
def preprocess_multispeaker():
    """Run preprocessing process and compute statistics for normalizing."""
    config = parse_and_config()

    # DIFFERENCE
    dataset_processor = {"multispeaker": MultiSpeakerProcessor}

    logging.info(f"Selected '{config['dataset']}' processor.")
    processor = dataset_processor[config["dataset"]](
        config["rootdir"], cleaner_names="english_cleaners")

    # check output directories
    build_dir = lambda x: [
        os.makedirs(os.path.join(config["outdir"], x, y), exist_ok=True)
        for y in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]
    ]
    build_dir("train")
    build_dir("valid")

    # DIFFERENCE
    bul_items = processor.items[:671]
    synd_items = processor.items[671:]
    assert (
        len(bul_items) == 671 and len(synd_items) == 302
    ), f"SPLIT WAS UNSUCCESSFUL bul:{len(bul_items)} synd:{len(synd_items)}"

    train_split = []
    valid_split = []
    # build train test split
    bul_train_split, bul_valid_split = train_test_split(
        bul_items,
        test_size=config["test_size"],
        random_state=42,
        shuffle=True,
    )
    synd_train_split, synd_valid_split = train_test_split(
        synd_items,
        test_size=config["test_size"],
        random_state=42,
        shuffle=True,
    )
    train_split.extend(bul_train_split)
    train_split.extend(synd_train_split)
    valid_split.extend(bul_valid_split)
    valid_split.extend(synd_valid_split)
    assert (
        len(train_split) + len(valid_split) == 973
    ), f"SPLIT WAS UNSUCCESSFUL train:{len(train_split)} valid:{len(valid_split)}"

    logging.info(f"Training items: {len(train_split)}")
    logging.info(f"Validation items: {len(valid_split)}")

    get_utt_id = lambda x: os.path.split(x[1])[-1].split(".")[0]
    train_utt_ids = [get_utt_id(x) for x in train_split]
    valid_utt_ids = [get_utt_id(x) for x in valid_split]

    # save train and valid utt_ids to track later
    np.save(os.path.join(config["outdir"], "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(config["outdir"], "valid_utt_ids.npy"), valid_utt_ids)

    # define map iterator
    def iterator_data(items_list):
        for item in items_list:
            yield processor.get_one_sample(item)

    train_iterator_data = iterator_data(train_split)
    valid_iterator_data = iterator_data(valid_split)

    p = Pool(config["n_cpus"])

    # preprocess train files and get statistics for normalizing
    partial_fn = partial(gen_audio_features, config=config)
    train_map = p.imap_unordered(
        partial_fn,
        tqdm(train_iterator_data,
             total=len(train_split),
             desc="[Preprocessing train]"),
        chunksize=10,
    )
    # init scaler for multiple features
    scaler_mel = StandardScaler(copy=False)
    scaler_energy = StandardScaler(copy=False)
    scaler_f0 = StandardScaler(copy=False)

    for result, mel, energy, f0, features in train_map:
        if not result:
            continue  # gen_audio_features flags unusable samples with result == False
        save_features_to_file(features, "train", config)
        # remove outliers
        energy = remove_outlier(energy)
        f0 = remove_outlier(f0)
        # partial fitting of scalers
        scaler_mel.partial_fit(mel)
        scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
        scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

    # save statistics to file
    logging.info("Saving computed statistics.")
    scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"),
                   (scaler_f0, "_f0")]
    save_statistics_to_file(scaler_list, config)

    # preprocess valid files
    partial_fn = partial(gen_audio_features, config=config)
    valid_map = p.imap_unordered(
        partial_fn,
        tqdm(valid_iterator_data,
             total=len(valid_split),
             desc="[Preprocessing valid]"),
        chunksize=10,
    )
    for *_, features in valid_map:
        save_features_to_file(features, "valid", config)
Example #7
def generate(data):

    tid = data["tid"]
    audio = data["audio"]
    mels = data["mels"]

    # With fft_size 2048 at a 22050 Hz sample rate, each analysis window covers
    # about 93 ms; with fft_size 512, about 23 ms.

    # lpcnet spec
    fft_size = 320  # or 640; at 16000 Hz this is a 20 ms (or 40 ms) window
    hop_size = 160  # 10 ms at 16000 Hz
    samplerate = 16000

    # check audio properties
    assert len(audio.shape) == 1, f"{tid} seems to be multi-channel signal."
    assert np.abs(audio).max() <= 1.0, f"{tid} does not appear to be normalized 16-bit PCM (max abs > 1.0)."

    # get spectrogram
    D = librosa.stft(
        audio,
        n_fft=fft_size,
        hop_length=hop_size,  # default: win_length // 4
        win_length=None,  # default: win_length = n_fft
        window='hann',  # default window is 'hann'
        pad_mode="reflect")

    S, _ = librosa.magphase(D)  # (#bins, #frames)

    # check audio and feature length
    audio = np.pad(audio, (0, 3200), mode="edge")
    audio = audio[:mels * hop_size]
    assert mels * hop_size == len(audio)

    # extract raw pitch
    _f0, t = pw.dio(audio.astype(np.double),
                    fs=samplerate,
                    f0_ceil=7600,
                    frame_period=1000 * hop_size / samplerate)
    f0 = pw.stonemask(audio.astype(np.double), _f0, t, samplerate)
    if len(f0) >= mels:
        f0 = f0[:mels]
    else:
        f0 = np.pad(f0, (0, mels - len(f0)))

    # extract energy
    energy = np.sqrt(np.sum(S**2, axis=0))
    if len(energy) >= mels:
        energy = energy[:mels]
    else:
        energy = np.pad(energy, (0, mels - len(energy)))
    assert mels == len(f0) == len(energy)

    # remove outlier f0/energy
    f0 = remove_outlier(f0)
    energy = remove_outlier(energy)

    item = {}
    item["tid"] = tid
    item["f0"] = f0
    item["energy"] = energy
    return item
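
A hypothetical run of generate on one second of synthetic 16 kHz audio (remove_outlier as sketched after Example #1); mels is the expected frame count at hop_size 160:

sr = 16000
t = np.arange(sr) / sr
audio = (0.5 * np.sin(2 * np.pi * 220.0 * t)).astype(np.float64)
item = generate({"tid": "demo", "audio": audio, "mels": len(audio) // 160})
print(item["f0"].shape, item["energy"].shape)  # both (100,)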