Ejemplo n.º 1
0
def fix_mono_lab_before_align(lab):
    """Copy ``lab`` into a fresh ``hts.HTSLabelFile`` without changing labels.

    The first label is appended with the default ``append`` arguments; every
    following label is appended with ``strict=False``.

    Args:
        lab: Sized, indexable label container. ``lab[0]`` is read
            unconditionally, so it must hold at least one label.

    Returns:
        The newly built ``hts.HTSLabelFile``.
    """
    copied = hts.HTSLabelFile()
    copied.append(lab[0])
    position = 1
    total = len(lab)
    while position < total:
        # No per-label fix is applied at this stage.
        copied.append(lab[position], strict=False)
        position += 1
    return copied
Ejemplo n.º 2
0
def test_hts_append():
    """Exercise ``HTSLabelFile.append`` on a real label file and on edge cases."""
    reference = hts.load(join(DATA_DIR, "BASIC5000_0001.lab"))
    print("\n{}".format(reference))

    # Re-appending every label one by one must give the same string form.
    rebuilt = hts.HTSLabelFile()
    assert str(rebuilt) == ""
    for entry in reference:
        rebuilt.append(entry)
    assert str(reference) == str(rebuilt)

    @raises(ValueError)
    def test_invalid_start_time():
        # The start time (100000) is later than the end time (0).
        lab_file = hts.HTSLabelFile()
        lab_file.append((100000, 0, "NG"))

    def test_succeeding_times():
        # The second label starts exactly where the first one ends.
        lab_file = hts.HTSLabelFile()
        lab_file.append((0, 1000000, "OK"))
        lab_file.append((1000000, 2000000, "OK"))

    @raises(ValueError)
    def test_non_succeeding_times():
        # The second label starts after the end of the first one.
        lab_file = hts.HTSLabelFile()
        lab_file.append((0, 1000000, "OK"))
        lab_file.append((1500000, 2000000, "NG"))

    test_invalid_start_time()
    test_succeeding_times()
    test_non_succeeding_times()
Ejemplo n.º 3
0
def fix_mono_lab_after_align(lab):
    """Rebuild ``lab`` so that every label starts where the previous one ends.

    Each label after the first is checked, in this order, against the last
    label already written to the output:

    1. Both are ``pau``/``sil``: the previous label is cut to half of its
       duration and the current one starts at that new end time.
    2. Both have the same context, start time and end time: the previous
       label is cut to half of the current label's duration and the current
       one starts at that new end time.
    3. The previous end time differs from the current start time: the current
       label's start time is moved to the previous end time.
    4. Otherwise the label is copied as is with ``strict=False``.

    A message is printed for cases 1-3.

    Args:
        lab: Label container exposing ``contexts``, ``start_times`` and
            ``end_times`` sequences, ``len()`` and integer indexing. It must
            hold at least one label.

    Returns:
        The fixed labels as a new ``hts.HTSLabelFile``.
    """
    pauses = ("pau", "sil")
    fixed = hts.HTSLabelFile()
    fixed.append(lab[0])
    for idx in range(1, len(lab)):
        if fixed.contexts[-1] in pauses and lab.contexts[idx] in pauses:
            # fix consecutive pau/sil
            print("Consecutive pau/sil-s are detected.")
            half = round((fixed.end_times[-1] - fixed.start_times[-1]) / 2)
            fixed.end_times[-1] = fixed.start_times[-1] + half
        elif (fixed.contexts[-1] == lab.contexts[idx]
              and fixed.start_times[-1] == lab.start_times[idx]
              and fixed.end_times[-1] == lab.end_times[idx]):
            # duplicated vowel before "cl"?
            message = "{} and {} have the same start_time {} and end_time {}. There seems to be a missing phoneme in mono_dtw."
            print(message.format(fixed.contexts[-1], lab.contexts[idx],
                                 fixed.start_times[-1], fixed.end_times[-1]))
            print()
            half = round((lab.end_times[idx] - lab.start_times[idx]) / 2)
            fixed.end_times[-1] = fixed.start_times[-1] + half
        elif fixed.end_times[-1] != lab.start_times[idx]:
            # Gap (or overlap) between the previous end time and this start
            # time: the start time is replaced by the previous end time below.
            message = "end_time {} of the phoneme {} and start_time {} of the phoneme {} is not the same. There seems to be a missing phoneme in sinsy_mono_round."
            print(message.format(fixed.end_times[-1], fixed.contexts[-1],
                                 lab.start_times[idx], lab.contexts[idx]))
        else:
            # Already contiguous: copy unchanged.
            fixed.append(lab[idx], strict=False)
            continue
        # Shared tail of the three fix cases: start at the previous end time.
        fixed.append(
            (fixed.end_times[-1], lab.end_times[idx], lab.contexts[idx]))
    return fixed
Ejemplo n.º 4
0
def postprocess_duration(labels, pred_durations, lag):
    """Shift note onsets by the time-lag and rescale phoneme durations.

    The labels are processed note by note, using the boundaries returned by
    ``get_note_indices(labels)``. For every note the start times are shifted
    by the corresponding ``lag`` entry and clamped, the predicted durations
    are rescaled so that they sum to the note duration reported by
    ``fe.duration_features``, and the result is appended to the output.

    Args:
        labels: Label container supporting ``len()`` and slicing; slices must
            expose ``start_times``, ``end_times`` and ``set_durations``.
        pred_durations: Sliceable array of predicted durations (one entry per
            label); the slices must support ``.sum()``.
        lag: Per-note time-lag; each entry must support ``.reshape(-1)``.

    Returns:
        hts.HTSLabelFile: The labels with adjusted times.
    """
    boundaries = get_note_indices(labels)
    # Add a final boundary so the last note also has an end index.
    boundaries.append(len(labels))

    adjusted = hts.HTSLabelFile()

    for note_no in range(1, len(boundaries)):
        begin, end = boundaries[note_no - 1], boundaries[note_no]
        note = labels[begin:end]

        # Apply the time lag, but never start later than
        # ``end_time - 50000 * len(note)``.
        # NOTE(review): 50000 is presumably one frame in HTS time units - confirm.
        shifted = np.asarray(note.start_times) + lag[note_no - 1].reshape(-1)
        latest = np.asarray(note.end_times) - 50000 * len(note)
        note.start_times = np.minimum(shifted, latest)
        note.start_times = np.maximum(note.start_times, 0)
        if len(adjusted) > 0:
            # Stay at least 50000 after the start of the last emitted label.
            note.start_times = np.maximum(
                note.start_times, adjusted.start_times[-1] + 50000)

        # Rescale the predictions so that they sum to the note duration.
        features = fe.duration_features(note)
        predicted = pred_durations[begin:end]
        scaled = features[0] * predicted / predicted.sum()
        scaled = np.round(scaled)
        scaled[scaled <= 0] = 1

        # TODO: better way to adjust?
        # Rounding error is pushed onto the last phoneme of the note.
        if scaled.sum() != features[0]:
            scaled[-1] += features[0] - scaled.sum()
        note.set_durations(scaled)

        if len(adjusted) > 0:
            # Close the previous label at the start of this note.
            adjusted.end_times[-1] = note.start_times[0]
        for entry in note:
            adjusted.append(entry)

    return adjusted
Ejemplo n.º 5
0
Archivo: util.py Proyecto: r9y9/nnsvs
def _fix_mono_lab_after_align_default(lab):
    """Rebuild ``lab`` so that every label starts where the previous one ends.

    For each label after the first:

    * if it and the last written label are both ``pau``/``sil``, the previous
      label is cut to half of its duration and the current one starts there;
    * else if the previous end time differs from the current start time, the
      current start time is replaced by the previous end time;
    * otherwise the label is copied unchanged with ``strict=False``.

    A message is printed for the first two cases.

    Args:
        lab: Label container exposing ``contexts``, ``start_times`` and
            ``end_times`` sequences, ``len()`` and integer indexing. It must
            hold at least one label.

    Returns:
        The fixed labels as a new ``hts.HTSLabelFile``.
    """
    pauses = ("pau", "sil")
    fixed = hts.HTSLabelFile()
    fixed.append(lab[0])
    for idx in range(1, len(lab)):
        if fixed.contexts[-1] in pauses and lab.contexts[idx] in pauses:
            # fix consecutive pau/sil
            print("Consecutive pau/sil-s are detected.")
            half = round((fixed.end_times[-1] - fixed.start_times[-1]) / 2)
            fixed.end_times[-1] = fixed.start_times[-1] + half
        elif fixed.end_times[-1] != lab.start_times[idx]:
            # The previous end time and this start time do not match; the
            # start time is replaced by the previous end time below.
            message = "end_time {} of the phoneme {} and start_time {} of the phoneme {} is not the same."  # noqa
            print(message.format(fixed.end_times[-1], fixed.contexts[-1],
                                 lab.start_times[idx], lab.contexts[idx]))
            print(
                "There seems to be a missing phoneme in generated_mono_round.")
        else:
            # Already contiguous: copy unchanged.
            fixed.append(lab[idx], strict=False)
            continue
        # Shared tail of both fix cases: start at the previous end time.
        fixed.append(
            (fixed.end_times[-1], lab.end_times[idx], lab.contexts[idx]))
    return fixed
Ejemplo n.º 6
0
def remove_sil_and_pau(lab):
    """Return a copy of ``lab`` without ``-sil`` / ``-pau`` labels.

    A label is dropped when its last element (the context string) contains
    the substring ``"-sil"`` or ``"-pau"``. Kept labels are appended with
    ``strict=False``, so their times are not checked for contiguity.

    Args:
        lab: Iterable of labels whose last element is the context string.

    Returns:
        hts.HTSLabelFile: The remaining labels, in their original order.
    """
    kept = hts.HTSLabelFile()
    for entry in lab:
        context = entry[-1]
        if "-sil" in context or "-pau" in context:
            continue
        kept.append(entry, strict=False)

    return kept
Ejemplo n.º 7
0
def merge_sil(lab):
    """Collapse runs of consecutive ``sil`` labels into a single label.

    The label format is detected from the first label: when its context
    contains ``"@"`` the labels are treated as full-context and silence is
    matched by the substring ``"-sil"``; otherwise the context must be exactly
    ``"sil"``. When both the last written label and the current label are
    silence, the end time of the written label is extended instead of
    appending a new one.

    Args:
        lab: Sized, indexable label container whose items are
            ``(start, end, context)``-like sequences. It must hold at least
            one label.

    Returns:
        hts.HTSLabelFile: The labels with merged silences.
    """
    merged = hts.HTSLabelFile()
    merged.append(lab[0], strict=False)
    is_full_context = "@" in lab[0][-1]
    for idx in range(1, len(lab)):
        previous_context = merged[-1][-1]
        current_context = lab[idx][-1]
        if is_full_context:
            both_sil = "-sil" in previous_context and "-sil" in current_context
        else:
            both_sil = previous_context == "sil" and current_context == "sil"

        if both_sil:
            # Extend the silence already written up to this label's end time.
            merged.end_times[-1] = lab[idx][1]
        else:
            merged.append(lab[idx], strict=False)
    return merged
Ejemplo n.º 8
0
def _fix_mono_lab_after_align_default(lab):
    """Split duplicated ``pau`` labels that occupy the same time span.

    When a ``pau`` label has the same start and end time as the last written
    ``pau`` label, the written one is cut to the first half of the span and
    the current one is appended starting at that midpoint. Every other label
    is copied unchanged with ``strict=False``.

    Args:
        lab: Label container exposing ``contexts``, ``start_times`` and
            ``end_times`` sequences, ``len()`` and integer indexing. It must
            hold at least one label.

    Returns:
        The fixed labels as a new ``hts.HTSLabelFile``.
    """
    fixed = hts.HTSLabelFile()
    fixed.append(lab[0])
    for idx in range(1, len(lab)):
        duplicated_pau = (
            fixed.contexts[-1] == "pau" and lab.contexts[idx] == "pau"
            and fixed.start_times[-1] == lab.start_times[idx]
            and fixed.end_times[-1] == lab.end_times[idx])
        if not duplicated_pau:
            fixed.append(lab[idx], strict=False)
            continue

        # Fix contiguous pau: give each of the two labels half of the span.
        half = round((lab.end_times[idx] - lab.start_times[idx]) / 2)
        fixed.end_times[-1] = fixed.start_times[-1] + half
        fixed.append(
            (fixed.end_times[-1], lab.end_times[idx], lab.contexts[idx]))
    return fixed
Ejemplo n.º 9
0
    os.makedirs(d, exist_ok=True)

sinsy = pysinsy.sinsy.Sinsy()
# The call is kept out of ``assert``: ``python -O`` strips assert statements,
# which would silently skip the language setup altogether.
if not sinsy.setLanguages("j", pysinsy.get_default_dic_dir()):
    raise RuntimeError("Failed to set sinsy languages.")

# NOTE(review): without ``recursive=True`` the ``**`` pattern only matches a
# single directory level - confirm that this is the intended layout.
mono_lab_files = sorted(glob(join(args.pjs_root, "**/*.lab")))
muxicxml_files = sorted(glob(join(args.pjs_root, "**/*.musicxml")))
# The two sorted lists are paired with ``zip`` afterwards, which would
# silently truncate on a length mismatch, so fail loudly here instead.
if len(mono_lab_files) != len(muxicxml_files):
    raise RuntimeError(
        "Number of label files ({}) and musicxml files ({}) differ.".format(
            len(mono_lab_files), len(muxicxml_files)))
for mono_path, xml_path in zip(mono_lab_files, muxicxml_files):
    align_mono_lab = hts.load(mono_path)
    name = basename(mono_path)

    assert sinsy.loadScoreFromMusicXML(xml_path)
    # check if sinsy's phoneme output is same as the provided alignment format
    sinsy_labels = sinsy.createLabelData(True, 1, 1).getData()
    sinsy_mono_lab = hts.HTSLabelFile()
    for label in sinsy_labels:
        sinsy_mono_lab.append(label.split(), strict=False)

    assert len(align_mono_lab) == len(sinsy_mono_lab)
    assert (np.asarray(align_mono_lab.contexts) == np.asarray(
        sinsy_mono_lab.contexts)).all()

    # rounding
    has_too_short_ph = False
    for idx in range(len(align_mono_lab)):
        b, e = align_mono_lab.start_times[idx], align_mono_lab.end_times[idx]
        bb, ee = round(b / 50000) * 50000, round(e / 50000) * 50000
        # TODO: better way
        if bb == ee:
            # ensure minimum frame length 1
Ejemplo n.º 10
0
 def test_non_succeeding_times_wo_strict():
     """Append two non-contiguous labels with ``strict=False``."""
     lab_file = hts.HTSLabelFile()
     first = (0, 1000000, "OK")
     # Starts at 1500000 although the previous label ends at 1000000.
     second = (1500000, 2000000, "OK")
     lab_file.append(first, strict=False)
     lab_file.append(second, strict=False)
Ejemplo n.º 11
0
 def test_non_succeeding_times():
     """Append two non-contiguous labels with the default ``append`` arguments."""
     lab_file = hts.HTSLabelFile()
     first = (0, 1000000, "OK")
     # Starts at 1500000 although the previous label ends at 1000000.
     second = (1500000, 2000000, "NG")
     lab_file.append(first)
     lab_file.append(second)
Ejemplo n.º 12
0
 def test_invalid_start_time():
     """Append a label whose start time is later than its end time."""
     lab_file = hts.HTSLabelFile()
     reversed_label = (100000, 0, "NG")
     lab_file.append(reversed_label)
Ejemplo n.º 13
0
Archivo: gen.py Proyecto: r9y9/nnsvs
def postprocess_duration(labels, pred_durations, lag):
    """Post-process durations based on predicted time-lag

    The labels are processed note by note (boundaries come from
    ``get_note_indices``). For each note the target duration is corrected by
    the time-lag, the note onset is shifted, and the predicted phoneme
    durations are normalized so that they match the corrected note duration.

    Ref : https://arxiv.org/abs/2108.02776

    Args:
        labels (HTSLabelFile): HTS labels
        pred_durations (array or tuple): predicted durations for non-MDN,
            mean and variance for MDN (a tuple of exactly two elements is
            treated as the MDN case)
        lag (array): predicted time-lag, indexed per note; it is divided by
            50000 to convert it to frames

    Returns:
        HTSLabelFile: labels with adjusted durations
    """
    note_indices = get_note_indices(labels)
    # append the end of note
    note_indices.append(len(labels))

    # A 2-tuple is interpreted as (mean, variance) of an MDN prediction.
    is_mdn = isinstance(pred_durations, tuple) and len(pred_durations) == 2

    output_labels = hts.HTSLabelFile()

    for i in range(1, len(note_indices)):
        # Labels that belong to the (i-1)-th note.
        p = labels[note_indices[i - 1] : note_indices[i]]

        # Compute note duration with time-lag
        # eq (11)
        L = int(fe.duration_features(p)[0])
        if i < len(note_indices) - 1:
            # Both this note's lag and the next note's lag change the length.
            L_hat = L - (lag[i - 1] - lag[i]) / 50000
        else:
            # Last note: there is no following lag to take into account.
            L_hat = L - (lag[i - 1]) / 50000

        # Prevent negative duration
        L_hat = max(L_hat, 1)

        # adjust the start time of the note
        # (never later than ``end_time - 50000 * len(p)``)
        p.start_times = np.minimum(
            np.asarray(p.start_times) + lag[i - 1].reshape(-1),
            np.asarray(p.end_times) - 50000 * len(p),
        )
        p.start_times = np.maximum(p.start_times, 0)
        if len(output_labels) > 0:
            # Stay at least 50000 after the start of the last emitted label.
            p.start_times = np.maximum(
                p.start_times, output_labels.start_times[-1] + 50000
            )

        # Compute normalized phoneme durations
        if is_mdn:
            mu = pred_durations[0][note_indices[i - 1] : note_indices[i]]
            sigma_sq = pred_durations[1][note_indices[i - 1] : note_indices[i]]
            # eq (17)
            # NOTE(review): divides by sigma_sq.sum(); a zero total variance
            # is not guarded against here.
            rho = (L_hat - mu.sum()) / sigma_sq.sum()
            # eq (16)
            d_norm = mu + rho * sigma_sq

            if np.any(d_norm <= 0):
                # eq (12) (using mu as d_hat)
                # The messages below report durations in seconds using
                # 0.005 sec per frame.
                print(
                    f"Negative phoneme durations are predicted at {i}-th note. "
                    "The note duration: ",
                    f"{round(float(L)*0.005,3)} sec -> {round(float(L_hat)*0.005,3)} sec",
                )
                print(
                    "It's likely that the model couldn't predict correct durations "
                    "for short notes."
                )
                print(
                    f"Variance scaling based durations (in frame):\n{(mu + rho * sigma_sq)}"
                )
                print(
                    f"Fallback to uniform scaling (in frame):\n{(L_hat * mu / mu.sum())}"
                )
                d_norm = L_hat * mu / mu.sum()
        else:
            # eq (12)
            d_hat = pred_durations[note_indices[i - 1] : note_indices[i]]
            d_norm = L_hat * d_hat / d_hat.sum()

        # Durations are integral and at least 1 after rounding.
        d_norm = np.round(d_norm)
        d_norm[d_norm <= 0] = 1

        p.set_durations(d_norm)

        if len(output_labels) > 0:
            # Close the previous label at the start of this note.
            output_labels.end_times[-1] = p.start_times[0]
        for n in p:
            output_labels.append(n)

    return output_labels
Ejemplo n.º 14
0
def _process_utterance(out_dir, index, speaker_id, wav_path, lab_path,
                       binary_dict, continuous_dict, text):
    """Convert one utterance into saved audio and linguistic-context arrays.

    The waveform is loaded, optionally rescaled and mu-law encoded according
    to ``hparams``, trimmed to a whole number of hops and saved as
    ``audio-<uttID>.npy``. Frame-level linguistic features are computed from
    the label file and saved as ``context-<uttID>.npy``.

    Args:
        out_dir: Directory that receives the two ``.npy`` files.
        index: Not used by this function.
        speaker_id: Passed through to the returned tuple.
        wav_path: Path of the waveform; its base name without extension is
            used as the utterance ID.
        lab_path: Path of the label file loaded into ``hts.HTSLabelFile``.
        binary_dict: Forwarded to ``fe.linguistic_features``.
        continuous_dict: Forwarded to ``fe.linguistic_features``.
        text: Passed through to the returned tuple.

    Returns:
        tuple: ``(audio_filename, context_filename, timesteps, text,
        speaker_id)`` where ``timesteps`` is the trimmed sample count.
    """
    # Load the audio to a numpy array. Resampled if needed
    samples = audio.load_wav(wav_path)

    # The utterance ID is the wav base name without its extension.
    utt_id = os.path.splitext(os.path.basename(wav_path))[0]

    if hparams.rescaling:
        samples = samples / np.abs(samples).max() * hparams.rescaling_max

    # NOTE(review): ``constant_values`` is computed in every branch but never
    # read in this function.
    if is_mulaw_quantize(hparams.input_type):
        # Mu-law quantize: [0, quantize_channels)
        signal = P.mulaw_quantize(samples, hparams.quantize_channels)
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Mu-law without quantization: [-1, 1]
        signal = P.mulaw(samples, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # Raw waveform: [-1, 1]
        signal = samples
        constant_values = 0.0
        out_dtype = np.float32

    # Frame shift for the time-aligned context, derived from the hop size
    # unless ``hparams.frame_shift_ms`` is given explicitly.
    if hparams.frame_shift_ms is not None:
        frame_shift_in_micro_sec = hparams.frame_shift_ms * 10000
    else:
        frame_shift_in_micro_sec = (hparams.hop_size *
                                    10000000) // hparams.sample_rate
    labels = hts.HTSLabelFile(frame_shift_in_micro_sec)
    labels.load(lab_path)
    context = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        frame_shift_in_micro_sec=frame_shift_in_micro_sec)

    # Drop trailing samples so the length is a multiple of the hop size.
    n_hops = len(signal) // audio.get_hop_size()
    signal = signal[:n_hops * audio.get_hop_size()]

    timesteps = len(signal)

    # Write both arrays to disk.
    audio_filename = 'audio-' + utt_id + '.npy'
    context_filename = 'context-' + utt_id + '.npy'
    audio_path = os.path.join(out_dir, audio_filename)
    np.save(audio_path, signal.astype(out_dtype), allow_pickle=False)
    context_path = os.path.join(out_dir, context_filename)
    np.save(context_path, context.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, context_filename, timesteps, text, speaker_id)
Ejemplo n.º 15
0
# Abort with a non-zero exit status when no configuration is available.
if config is None:
    print(f"Cannot read config file: {sys.argv[1]}.")
    sys.exit(-1)

print("Copy original label files.")
# Collect every ``.lab`` file below ``db_root`` (recursive glob, ``~``
# expanded) in sorted order.
files = sorted(
    glob(join(expanduser(config["db_root"]), "**/*.lab"), recursive=True))
# Output directory for the label files; created if it does not exist yet.
dst_dir = join(config["out_dir"], "mono_label")
os.makedirs(dst_dir, exist_ok=True)
for m in tqdm(files):
    if config["spk"] == "natsumeyuuri":
        # natsume_singing
        name = splitext(basename(m))[0]
        if name in config["exclude_songs"]:
            continue
        h = hts.HTSLabelFile()
        with open(m) as f:
            for label in f:
                s, e, lab = label.strip().split()
                if config["label_time_unit"] == "sec":
                    s, e = int(float(s) * 1e7), int(float(e) * 1e7)
                h.append((s, e, lab))
            with open(join(dst_dir, basename(m)), "w") as of:
                of.write(str(fix_mono_lab_before_align(h, config["spk"])))
    else:
        # ofuton_p_utagoe_db, oniku_kurumi_utagoe_db
        name = splitext(basename(m))[0]
        if name in config["exclude_songs"]:
            continue
        f = hts.load(m)
        with open(join(dst_dir, basename(m)), "w") as of:
Ejemplo n.º 16
0
def segment_labels(lab,
                   strict=True,
                   threshold=1.0,
                   min_duration=5.0,
                   force_split_threshold=10.0):
    """Segment labels based on sil/pau

    Example:

    [a b c sil d e f pau g h i sil j k l]
    ->
    [a b c] [d e f] [g h i] [j k l]

    Label durations are computed as ``(end - start) * 1e-7`` and compared
    with the thresholds below (i.e. the thresholds are in seconds when label
    times are in 100 ns units).

    Args:
        lab: Labels to segment; iterating must yield ``(start, end, context)``
            and the object must support ``len()`` and slicing.
        strict: Forwarded to ``HTSLabelFile.append`` while the working
            segment is accumulated.
        threshold: A silence longer than this may split the labels, provided
            the current segment is already longer than ``min_duration``.
        min_duration: Minimum non-silent duration of a segment. A shorter
            trailing segment is merged into the previous one when there is a
            previous one and the last split was not forced.
        force_split_threshold: Any label longer than this forces a split,
            regardless of ``min_duration``.

    Returns:
        tuple: ``(segments, start_indices, end_indices)`` with the trimmed
        segments and their first/last label indices in ``lab``.
    """
    seg = hts.HTSLabelFile()
    start_indices = []
    end_indices = []
    si = 0
    large_silence_detected = False

    for idx, (s, e, context) in enumerate(lab):
        d = (e - s) * 1e-7
        is_silence = _is_silence(context)

        if len(seg) > 0:
            # Compute duration except for long silences
            seg_d = compute_nosil_duration(seg)
        else:
            seg_d = 0

        # let's try to split
        # if we find large silence, force split regardless min_duration
        if (d > force_split_threshold) or (is_silence and d > threshold
                                           and seg_d > min_duration):
            # The splitting label itself is never added to a segment. A split
            # is only recorded when it is not the very last label and there
            # is something accumulated to close.
            if idx != len(lab) - 1 and len(seg) > 0:
                large_silence_detected = d > force_split_threshold
                start_indices.append(si)
                si = 0
                end_indices.append(idx - 1)
                seg = hts.HTSLabelFile()
            continue

        if len(seg) == 0:
            si = idx
        seg.append((s, e, context), strict)

    if len(seg) > 0:
        seg_d = compute_nosil_duration(seg)
        # If the last segment is short, combine with the previous segment.
        # Without a previous segment (``end_indices`` is empty) there is
        # nothing to combine with, so the short segment is kept on its own
        # instead of failing on ``end_indices[-1]``.
        if (seg_d < min_duration and not large_silence_detected
                and end_indices):
            end_indices[-1] = si + len(seg) - 1
        else:
            start_indices.append(si)
            end_indices.append(si + len(seg) - 1)

    #  Trim large sil for each segment
    segments2 = []
    start_indices_new, end_indices_new = [], []
    for start, end in zip(start_indices, end_indices):
        seg = lab[start:end + 1]

        # ignore "sil" or "pau" only segment
        if len(seg) == 1 and _is_silence(seg.contexts[0]):
            continue
        seg2, forward, backward = trim_long_sil_and_pau(seg,
                                                        return_indices=True)

        # Both offsets returned by the trimming are relative to ``start``.
        start_indices_new.append(start + forward)
        end_indices_new.append(start + backward)

        segments2.append(seg2)

    return segments2, start_indices_new, end_indices_new
Ejemplo n.º 17
0
# The call is kept out of ``assert``: ``python -O`` strips assert statements,
# which would silently skip the language setup altogether.
if not sinsy.setLanguages("j", config["sinsy_dic"]):
    raise RuntimeError(
        f"Failed to set sinsy languages with dictionary: {config['sinsy_dic']}")

# generate full/mono labels by sinsy
print("Convert musicxml to label files.")
files = sorted(glob(join(expanduser(config["db_root"]), "**/*.*xml"), recursive=True))
for path in tqdm(files):
    name = splitext(basename(path))[0]
    if name in config["exclude_songs"]:
        continue

    # Same reasoning as above: loading the score must happen even when
    # assertions are disabled.
    if not sinsy.loadScoreFromMusicXML(path):
        raise RuntimeError(f"Failed to load score from MusicXML: {path}")
    for is_mono in [True, False]:
        n = "generated_mono" if is_mono else "generated_full"
        labels = sinsy.createLabelData(is_mono, 1, 1).getData()
        lab = hts.HTSLabelFile()
        for label in labels:
            # Each label is a whitespace-separated line; ``strict=False``
            # skips the time checks done by ``append``.
            lab.append(label.split(), strict=False)
        lab = merge_sil(lab)
        dst_dir = join(config["out_dir"], f"{n}")
        os.makedirs(dst_dir, exist_ok=True)
        with open(join(dst_dir, name + ".lab"), "w") as f:
            f.write(str(lab))
    # Reset the loaded score before moving on to the next file.
    sinsy.clearScore()

print("Copy original label files.")
files = sorted(glob(join(expanduser(config["db_root"]), "**/*.lab"), recursive=True))
dst_dir = join(config["out_dir"], "mono_label")
os.makedirs(dst_dir, exist_ok=True)
for m in tqdm(files):
    if config["spk"] == "natsumeyuuri":
Ejemplo n.º 18
0
from tqdm import tqdm
global DATA_ROOT
from  sklearn.preprocessing import StandardScaler,MinMaxScaler

# Audio analysis settings.
sample_rate=22050
hop_size=256
frame_period =1000*hop_size/sample_rate   # hop duration in ms (about 11.61 with the values above)
# Inverse relation, kept for reference: hop_size=int(frame_period*sample_rate/1000)
# ms * 10000 yields 100 ns units, despite the "micro_sec" in the name.
# NOTE(review): confirm the unit HTSLabelFile expects for its frame shift.
frame_shift_in_micro_sec=int(frame_period*10000)

# Spectrogram settings.
# NOTE(review): fmin/fmax are presumably frequencies in Hz - confirm at the call sites.
fft_len=1024
mel_dim=80
window='hann'
fmin=50
fmax=7600
# Module-level label file object configured with the frame shift above.
_hts=hts.HTSLabelFile(frame_shift_in_micro_sec=frame_shift_in_micro_sec)
def is_outlier(x, p25, p75):
    """Tell whether ``x`` lies on or outside the 1.5 * IQR fences.

    Args:
        x: Value to test.
        p25: Lower quartile.
        p75: Upper quartile.

    Returns:
        Truthy when ``x <= p25 - 1.5 * (p75 - p25)`` or
        ``x >= p75 + 1.5 * (p75 - p25)``; the fences themselves count as
        outliers.
    """
    fence_width = 1.5 * (p75 - p25)
    lower_fence = p25 - fence_width
    upper_fence = p75 + fence_width
    return x <= lower_fence or x >= upper_fence


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Remove outlier from x."""
    p_bottom = np.percentile(x, p_bottom)
    p_top = np.percentile(x, p_top)

    indices_of_outliers = []
    for ind, value in enumerate(x):
        if is_outlier(value, p_bottom, p_top):