Example #1
0
def clean_f0(f0, f_min=None):
    """
    Zero out isolated single-frame f0 detections, e.g. [0, 0, 123, 0, 0] -> all zeros.
    The f0-extractor is sensitive and sometimes classifies noise as f0.

    Args:
        f0:     1D or 2D (channels, frames) f0 tensor.
        f_min:  if given, frames exactly equal to this floor value are zeroed first.

    Returns:
        the same tensor, modified in place.
    """

    # The extractor's minimum f0 value fires spuriously (even at 10-60Hz),
    # mostly on noise: such frames look out-of-distribution compared to real
    # f0 peaks/valleys and stay constant for several frames in a row.
    if f_min is not None:
        f0[f0 == f_min] = 0

    def _single_voiced_frames(track):
        # Indices of voiced islands that last exactly one frame.
        voiced = (track > 0).float()
        starts, lengths, labels = find_island_idx_len(voiced)
        voiced_starts = starts[labels == 1]
        voiced_lengths = lengths[labels == 1]
        return voiced_starts[voiced_lengths == 1]

    if f0.ndim == 1:
        f0[_single_voiced_frames(f0)] = 0
    elif f0.ndim == 2:
        for channel, track in enumerate(f0):
            f0[channel, _single_voiced_frames(track)] = 0
    return f0
Example #2
0
def extract_pauses(vad):
    """
    Collect durations of pauses: silences (dialog state 1) where the same
    single speaker (state 0 or 3) is active both before and after.

    Returns:
        pauses:     list of int frame durations
    """
    pauses = []
    dialog_states = vad_to_dialog_vad_states(vad)
    _, durations, labels = find_island_idx_len(dialog_states)
    # First/last segments lack a neighbor on one side, so skip them.
    for i in range(1, len(labels) - 1):
        if labels[i] != 1:  # only silence segments qualify
            continue
        before, after = labels[i - 1], labels[i + 1]
        # Same single speaker on both sides -> pause (not a turn shift).
        if before in [0, 3] and before == after:
            pauses.append(durations[i].item())
    return pauses
Example #3
0
def interpolate_forward(f0, voiced):
    """
    Forward-fill unvoiced regions of each channel with the last voiced value.

    Args:
        f0:      (channels, frames) tensor of f0 values.
        voiced:  (channels, frames) boolean/float voiced mask.

    Returns:
        a filled clone of `f0` (input is not modified).
    """
    filled = f0.clone()
    for channel, mask in enumerate(voiced):
        starts, lengths, labels = find_island_idx_len(mask.float())
        unvoiced_starts = starts[labels == 0]
        unvoiced_lengths = lengths[labels == 0]
        for start, length in zip(unvoiced_starts, unvoiced_lengths):
            # An unvoiced run at the very beginning has no prior voiced
            # frame; fall back to frame 0 of the channel.
            source = start - 1 if start - 1 >= 0 else 0
            filled[channel, start:start + length] = filled[channel, source]
    return filled
Example #4
0
def ipu_to_turns(ipu):
    """
    Condense IPUs separated by mutual silence into turns.

    Within each channel, a silence gap is filled with speech iff the dialog
    state during the whole gap is 'mutual silence' (state == 1), i.e. the
    other channel never speaks during it.
    """
    turns = ipu.clone()
    dialog_states = vad_to_dialog_vad_states(ipu)
    for channel in range(2):
        starts, lengths, labels = find_island_idx_len(ipu[channel])
        silence_starts = starts[labels == 0]
        silence_lengths = lengths[labels == 0]
        for start, length in zip(silence_starts, silence_lengths):
            end = start + length
            # Bridge the gap only when nobody else spoke during it.
            if (dialog_states[start:end] == 1).all():
                turns[channel, start:end] = 1
    return turns
Example #5
0
def extract_state_duration(vad, state="silence"):
    """
    Extract the durations of a VAD-dialog `state`: 'speaker', 'silence' or 'overlap'.

    Returns:
        duration:       Torch.Tensor of segment durations (in frames)
    """
    valid_states = {"speaker": 0, "silence": 1, "overlap": 2}
    assert state in valid_states

    dialog_states = vad_to_dialog_vad_states(vad)
    _, durations, labels = find_island_idx_len(dialog_states)
    if state == "speaker":
        # Either participant holding the floor alone: states 0 and 3.
        return torch.cat((durations[labels == 0], durations[labels == 3]))
    return durations[labels == valid_states[state]]
Example #6
0
def extract_turn_floor_offset(vad, ipu_thresh=-1):
    """
    Extract turn-floor offsets based on VAD-dialog-states.

    :Caution: Use with smaller frame sizes for more accurate assessment

    1. Optionally condenses vad into IPUs (if `ipu_thresh` > 0)
    2. Converts vad to dialog states and iterates over the segments
        - Silences: ONLY SPEAKER - SILENCE - OTHER-SPEAKER sequences count
          (gaps); pauses (same speaker on both sides) are omitted.
        - Overlaps: ONLY SPEAKER - OVERLAP - OTHER-SPEAKER sequences count.

    Excluded patterns:
        SPEAKER - OVERLAP - SILENCE - OTHER-SPEAKER   (silences)
        SPEAKER - SILENCE - OVERLAP - OTHER-SPEAKER   (overlaps)
        which would be moments of competition for the turn-floor.

    Gap durations are positive, overlap durations negative.

    Return:
        turn_floor_offsets:     List
    """
    turn_floor_offsets = []
    if ipu_thresh > 0:
        vad = vad_to_ipu(vad, ipu_thresh)
    dialog_states = vad_to_dialog_vad_states(vad)
    _, durations, labels = find_island_idx_len(dialog_states)
    # Skip the first and last segments: no neighbor on one side.
    for i in range(1, len(labels) - 1):
        before, current, after = labels[i - 1], labels[i], labels[i + 1]
        # Must be preceded by a single speaker and followed by the OTHER
        # speaker (a real turn shift, not a pause/backchannel return).
        if before not in [0, 3] or before == after:
            continue
        if current == 1:  # silence between speakers -> positive gap
            turn_floor_offsets.append(durations[i].item())
        elif current == 2:  # overlap at the shift -> negative offset
            turn_floor_offsets.append(-durations[i].item())
    return turn_floor_offsets
Example #7
0
def vad_to_ipu(vad, ipu_frame_thresh):
    """
    Fill short within-channel silences to form inter-pausal units (IPUs).

    Every silence in a single channel lasting <= `ipu_frame_thresh` frames
    is replaced with speech.
    """
    ipu = vad.clone()
    for channel in range(2):
        starts, lengths, labels = find_island_idx_len(vad[channel])
        silence_starts = starts[labels == 0]
        silence_lengths = lengths[labels == 0]

        # Boolean mask over the silence segments that are short enough.
        short = silence_lengths <= ipu_frame_thresh
        for start, length in zip(silence_starts[short], silence_lengths[short]):
            ipu[channel, start:start + length] = 1
    return ipu
Example #8
0
def omit_inside_overlap(turn):
    """
    Remove overlap speech fully contained inside the other speaker's turn.

    Dialog-state patterns (0 = only channel 0, 2 = overlap, 3 = only channel 1):
        0 - 2 - 0  ->  channel 1's activity in the overlap is omitted
        3 - 2 - 3  ->  channel 0's activity in the overlap is omitted
    """
    cleaned = turn.clone()
    dialog_states = vad_to_dialog_vad_states(turn)
    starts, lengths, labels = find_island_idx_len(dialog_states)
    # Interior segments only: need both a preceding and a following segment.
    for i in range(1, len(labels) - 1):
        if labels[i] != 2:  # only overlap segments qualify
            continue
        before, after = labels[i - 1], labels[i + 1]
        # The same single speaker must hold the floor on both sides.
        if before != after or before not in [0, 3]:
            continue
        start = starts[i]
        end = start + lengths[i]
        # The interrupting channel is the opposite of the surrounding speaker.
        interrupter = 1 if before == 0 else 0
        cleaned[interrupter, start:end] = 0
    return cleaned
Example #9
0
def add_explicit_turn_shift_token(x, EOT_token_id=None):
    """
    Insert an explicit turn-shift token at the start of every speaker turn.

    For each contiguous run of `speaker_ids`, prepend `EOT_token_id` (if
    given) or else the current speaker's id to `input_ids`, and duplicate the
    first element of each aligned sequence (`speaker_ids`, `word_ids`,
    `starts`, `ends`) so all sequences stay the same length.

    Args:
        x:             dict with "input_ids" and "speaker_ids"; optionally
                       "word_ids", "starts", "ends" (word-level dialogs).
        EOT_token_id:  explicit turn-shift token id; defaults to the
                       speaker-id token when None.

    Returns:
        dict with expanded "input_ids" and "speaker_ids"; "word_ids",
        "starts" and "ends" are included when present in `x`.
    """
    input_ids = x["input_ids"]
    speaker_ids = x["speaker_ids"]
    word_ids = x.get("word_ids")
    starts = x.get("starts")  # only available for word_level dialogs
    ends = x.get("ends")  # only available for word_level dialogs

    # Sanity check: aligned sequences must have equal length.
    if starts is None:
        assert len(input_ids) == len(speaker_ids)
    else:
        assert (len(input_ids) == len(speaker_ids) == len(starts) == len(ends)
                == len(word_ids))

    expl_input_ids = []
    expl_speaker_ids = []
    expl_word_ids = []
    expl_starts = []
    expl_ends = []

    # Segment the dialog into contiguous same-speaker runs. Any failure here
    # is a real error and should propagate (previously a bare `except:`
    # printed shapes and blocked on input(), then crashed on an undefined
    # name).
    speaker_starts, dur, _ = find_island_idx_len(torch.tensor(speaker_ids))

    for s, d in zip(speaker_starts, dur):
        e = s + d

        # Add a single EOT token at the turn-shift, or use the speaker token.
        current_speaker = speaker_ids[s]
        shift_token = EOT_token_id if EOT_token_id is not None else current_speaker
        expl_input_ids += [shift_token] + input_ids[s:e]
        expl_speaker_ids += [current_speaker] + speaker_ids[s:e]

        # Optional word-level alignment data (guarded: .get() may return None).
        if word_ids is not None:
            expl_word_ids += [word_ids[s]] + word_ids[s:e]
        if starts is not None:
            expl_starts += [starts[s]] + starts[s:e]
        if ends is not None:
            expl_ends += [ends[s]] + ends[s:e]

    # sanity checks
    assert len(expl_input_ids) == len(expl_speaker_ids)
    ret = {
        "input_ids": expl_input_ids,
        "speaker_ids": expl_speaker_ids,
    }
    if word_ids is not None:
        assert len(expl_word_ids) == len(expl_input_ids)
        ret["word_ids"] = expl_word_ids

    if starts is not None:
        assert len(expl_starts) == len(expl_input_ids)
        ret["starts"] = expl_starts

    if ends is not None:
        assert len(expl_ends) == len(expl_input_ids)
        ret["ends"] = expl_ends

    return ret