def extract_channel_text(self, wav_path):
        name = basename(wav_path).replace(".sph", "")

        anno_channel0 = [f for f in self.all_anno_paths if name + "A" in f][0]
        anno_channel1 = [f for f in self.all_anno_paths if name + "B" in f][0]

        duration = get_duration_sox(wav_path)

        vad, words, timed_words, silence, noise = self.extract_text(
            anno_channel0, duration
        )
        vad1, words1, timed_words1, silence1, noise1 = self.extract_text(
            anno_channel1, duration
        )

        self.save_lists(
            name,
            duration,
            [vad, vad1],
            [words, words1],
            [timed_words, timed_words1],
            [silence, silence1],
            [noise, noise1],
        )
def get_all_vads_wavs(root_path, vad_path, audio_path, save=False):
    makedirs(vad_path, exist_ok=True)
    makedirs(audio_path, exist_ok=True)

    prefixes = {
        f[:5]
        for f in listdir(root_path)
        if isfile(join(root_path, f)) and not f.startswith("log")
    }
    for i in tqdm(prefixes):
        prefix = str(i).zfill(3)
        user_xml = join(root_path, prefix + "_user.xml")
        user_wav = join(root_path, prefix + "_user.wav")
        system_xml = join(root_path, prefix + "_system.xml")
        system_wav = join(root_path, prefix + "_system.wav")

        dur_user = get_duration_sox(user_wav)
        dur_system = get_duration_sox(system_wav)

        dur = max(dur_user, dur_system)

        vad_user_list, dur_user = get_vad(user_xml, user_wav, duration=dur)
        vad_sys_list, dur_system = get_vad(system_xml, system_wav, duration=dur)
        vad = [vad_user_list, vad_sys_list]

        if save:
            vpath = join(vad_path, prefix)
            makedirs(vpath, exist_ok=True)
            np.save(join(vpath, "vad.npy"), vad, allow_pickle=True)
            to_path = join(audio_path, f"{prefix}.wav")
            system(f"sox -M {user_wav} {system_wav} {to_path}")
        else:
            n_frames = round(dur / 0.05)
            vad_oh = list_percentage_to_onehot(vad, n_frames)
            plt.close()
            visualize_vad(vad_oh)
            plt.xlim([0, vad_oh.shape[1]])
            plt.tight_layout()
            plt.pause(0.01)
            input()
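
A minimal call sketch; the directory names are placeholders for wherever the prefix-named xml/wav pairs live:

get_all_vads_wavs(
    root_path="data/robot_raw",     # assumed layout: 001_user.wav, 001_user.xml, 001_system.wav, ...
    vad_path="data/robot_vad",
    audio_path="data/robot_audio",
    save=True,                      # merge the two channels with `sox -M` and store vad.npy per prefix
)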
Example 3
def get_vads_holds_shifts_events_train(xml_path, wav_path):
    """
    Gets VAD from annotation of starts and ends. The events are manually labeled as what should have happened.
    The event times are at the end of the user utterances.

    In turntaking repo events are automatically annotated from what actually happened in the audio.
    events are stored as list of tuples and transformed to numpy with the following structure:

        event = [(time, next_speaker, prev_speaker), ..., (time, next_speaker, prev_speaker)]

    shifts and holds are then calculated from those values.
    """
    data = minidom.parse(xml_path)
    total_duration = get_duration_sox(wav_path)

    tracks = data.getElementsByTagName("track")
    tracks = [t.attributes["id"].value for t in tracks]
    ch0, ch1 = [], []
    holds, shifts = [], []
    events = []
    prev_speaker = False
    for seg in data.getElementsByTagName("segment"):
        start = float(seg.attributes["start"].value)
        end = float(seg.attributes["end"].value)
        features = seg.getElementsByTagName("features")
        feedback = None
        if len(features) == 1:
            for f in features[0].getElementsByTagName("feature"):
                name = f.getAttribute("name")
                if "feedback" == name:
                    feedback = f.firstChild.data

        if seg.attributes["track"].value == tracks[0]:
            ch0.append((start, end))
            if feedback is not None:
                if feedback == "hold":
                    holds.append(end)
                    next_speaker = prev_speaker
                elif feedback == "respond":
                    shifts.append(end)
                    next_speaker = not prev_speaker
                elif feedback == "optional":
                    next_speaker = 2.0
                events.append((end, float(next_speaker), float(prev_speaker)))
        else:
            ch1.append((start, end))

    events = np.array(events, dtype=np.float32)
    events[:, 0] /= total_duration
    shifts = np.array(shifts, dtype=np.float32) / total_duration
    holds = np.array(holds, dtype=np.float32) / total_duration
    vad0 = np.array(ch0, dtype=np.float32) / total_duration
    vad1 = np.array(ch1, dtype=np.float32) / total_duration
    return (vad0, vad1), events, {"shifts": shifts, "holds": holds}
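
All returned times are normalized by the recording's total duration, so downstream code maps them back to frame indices before use. A minimal sketch of that conversion, assuming the 50 ms frame step used by the other examples in this file; events and named stand for the second and third return values above:

import numpy as np

def events_to_frames(events, named, total_duration, time_step=0.05):
    # Map normalized event/shift/hold times back to frame indices (illustrative sketch).
    n_frames = int(total_duration / time_step)
    event_frames = (events[:, 0] * n_frames).round().astype(int)   # column 0 holds the normalized time
    shift_frames = (named["shifts"] * n_frames).round().astype(int)
    hold_frames = (named["holds"] * n_frames).round().astype(int)
    return event_frames, shift_frames, hold_frames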
def get_annotation(fname, anno_path, audio_path):
    """
    Extracts vad-timings, words, silences and noise

    fname:          str, e.g. 'q1ec1.wav'
    anno_path:      path, path to annotations
    audio_path:     path, path to wavs (for duration)
    """

    # File info
    wav_path = join(audio_path, fname)
    duration = get_duration_sox(wav_path)

    name = fname.strip(".wav")
    tu_path = join(anno_path, "Data/timed-units")
    pos_path = join(anno_path, "Data/pos")
    token_path = join(anno_path, "Data/tokens")

    pos_xml_path = join(pos_path, name)
    tu_xml_path = join(tu_path, name)
    token_xml_path = join(token_path, name)

    tu_path_ch0 = tu_xml_path + ".g.timed-units.xml"
    tu_path_ch1 = tu_xml_path + ".f.timed-units.xml"
    pos_path_ch0 = pos_xml_path + ".g.pos.xml"
    pos_path_ch1 = pos_xml_path + ".f.pos.xml"
    token_path_ch0 = token_xml_path + ".g.tokens.xml"
    token_path_ch1 = token_xml_path + ".f.tokens.xml"

    # Timed-unit xmls: contain the words with timings
    tu_ch0 = get_timed_units(tu_path_ch0, duration)
    tu_ch1 = get_timed_units(tu_path_ch1, duration)

    # POS xmls: part-of-speech tags
    pos_ch0 = get_pos(pos_path_ch0, tu_path_ch0, token_path_ch0, duration)
    pos_ch1 = get_pos(pos_path_ch1, tu_path_ch1, token_path_ch1, duration)

    vad = [tu_ch0["vad"], tu_ch1["vad"]]
    words = [tu_ch0["words"], tu_ch1["words"]]
    timed_words = [tu_ch0["timed_words"], tu_ch1["timed_words"]]
    pos = [pos_ch0, pos_ch1]
    silence = [tu_ch0["silence"], tu_ch1["silence"]]
    noise = [tu_ch0["noise"], tu_ch1["noise"]]
    return {
        "vad": vad,
        "words": words,
        "timed_words": timed_words,
        "pos": pos,
        "silence": silence,
        "noise": noise,
        "duration": duration,
    }
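
A minimal usage sketch; the annotation and audio roots are placeholders for a Map Task-style layout containing the Data/timed-units, Data/pos and Data/tokens folders used above:

anno = get_annotation("q1ec1.wav",
                      anno_path="data/maptask_annotations",  # placeholder
                      audio_path="data/maptask_audio")       # placeholder
vad_g, vad_f = anno["vad"]   # channel 0 comes from the ".g." files, channel 1 from the ".f." files
print(anno["duration"], len(anno["words"][0]), len(anno["words"][1]))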
def save_switchboard_vad_treebank(audio_path, anno_path=None, save_path=None):
    if save_path is None:
        save_path = "data/vad_treebank"

    if anno_path is None:
        anno_path = "data/annotation_treebank/data/alignments"

    makedirs(save_path, exist_ok=True)

    # Get all annotation files
    anno_paths = glob(join(anno_path, "**/*.text"), recursive=True)
    anno_paths.sort()

    skip = []
    wav_files = [f for f in listdir(audio_path) if f.endswith(".wav")]
    for wav in tqdm(wav_files):
        name = wav.strip(".wav").replace("sw0", "sw")
        anno = [f for f in anno_paths if name in f]
        anno.sort()

        # Missing annotations ?
        if len(anno) == 0:
            skip.append(wav)
            continue

        # Paths
        session_path = join(save_path, wav.replace(".wav", ""))
        makedirs(session_path, exist_ok=True)
        word_path = join(session_path, "words.npy")
        vad_path = join(session_path, "vad.npy")
        noise_path = join(session_path, "noise.npy")
        silence_path = join(session_path, "silence.npy")

        # check if all features already exists
        if (exists(word_path) and exists(vad_path) and exists(silence_path)
                and exists(noise_path)):
            continue

        wpath = join(audio_path, wav)
        duration = get_duration_sox(wpath)

        vad0, words0, silence0, noise0 = extract_treebank_data(
            anno[0], duration)
        vad1, words1, silence1, noise1 = extract_treebank_data(
            anno[1], duration)

        np.save(word_path, (words0, words1), allow_pickle=True)
        np.save(vad_path, (vad0, vad1), allow_pickle=True)
        np.save(noise_path, (noise0, noise1), allow_pickle=True)
        np.save(silence_path, (silence0, silence1), allow_pickle=True)

    print(f"Skipped {len(skip)}/{len(wav_files)} files")
Example 6
def get_vad(xml_path, wav_path):
    total_duration = get_duration_sox(wav_path)
    data = minidom.parse(xml_path)
    tracks = data.getElementsByTagName("track")
    tracks = [t.attributes["id"].value for t in tracks]
    ch0, ch1 = [], []
    for seg in data.getElementsByTagName("segment"):
        start = float(seg.attributes["start"].value)
        end = float(seg.attributes["end"].value)
        if seg.attributes["track"].value == tracks[0]:
            ch0.append((start, end))
        else:
            ch1.append((start, end))
    vad0 = np.array(ch0, dtype=np.float32) / total_duration
    vad1 = np.array(ch1, dtype=np.float32) / total_duration
    return (vad0, vad1)
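
Since get_vad returns (start, end) pairs normalized to [0, 1], a common next step is expanding them into a frame-level binary matrix, as the training-set snippet at the end of this file does. A compact sketch of that conversion (the 50 ms frame step is an assumption):

import numpy as np

def vad_to_onehot(vad, duration, time_step=0.05):
    # Expand normalized (start, end) segments into a (2, n_frames) binary activity matrix.
    n_frames = round(duration / time_step)
    out = np.zeros((2, n_frames), dtype=np.float32)
    for ch, segments in enumerate(vad):
        for start, end in (segments * n_frames).round().astype(int):
            out[ch, start:end] = 1.0
    return out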
Example 7
def load_labels(wname, time_step=0.05):
    root = join(expanduser("~"), "SpeechCorpus/Robot/data/training_set")
    wav_path = join(root, "audio", wname + ".wav")
    v_path = join(root, "vad", wname, "vad.npy")
    e_path = join(root, "vad", wname, "events.npy")
    en_path = join(root, "vad", wname, "events_named.npy")

    vad = np.load(v_path, allow_pickle=True)
    event = np.load(e_path, allow_pickle=True)
    event_named = np.load(en_path, allow_pickle=True).item()

    dur = get_duration_sox(wav_path)
    n_frames = int(dur / time_step)

    vad_oh = get_onehot_vad(vad, n_frames)
    shifts = (event_named["shifts"] * n_frames).astype(np.int)
    holds = (event_named["holds"] * n_frames).astype(np.int)

    return vad_oh, shifts, holds
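
Loading the labels for one session is then a single call; the session name is the one used by the script below, and the (2, n_frames) shape of vad_oh is assumed from how it is consumed elsewhere in this file:

vad_oh, shifts, holds = load_labels("1_session_001", time_step=0.05)
print(vad_oh.shape)             # assumed (2, n_frames)
print(shifts[:5], holds[:5])    # frame indices of labeled shifts / holds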
Example 8
    files = read_txt(hparams["data"]["train_files"])
    dset = ExtractFeaturesDataset(hparams, files, cache=False)

    d = dset["1_session_001"]
    f = dset.files[0]
    d = dset[f]

    wav_path = "/home/erik/SpeechCorpus/Robot/data/training_set/audio/1_session_001.wav"
    event_named_path = "/home/erik/SpeechCorpus/Robot/data/training_set/vad/1_session_001/events_named.npy"
    named = load_numpy(event_named_path).item()
    vad_path = (
        "/home/erik/SpeechCorpus/Robot/data/training_set/vad/1_session_001/vad.npy"
    )
    vad_idx = list(load_numpy(vad_path))

    dur = get_duration_sox(wav_path)

    n_frames = round(dur / hparams["features"]["time_step"])

    vad = np.zeros((2, n_frames))
    vad_idx[0] = (vad_idx[0] * n_frames).round()
    vad_idx[1] = (vad_idx[1] * n_frames).round()

    for i in range(2):
        for s, e in vad_idx[i]:
            vad[i, int(s) : int(e)] = 1

    events = {}
    for k, v in named.items():
        events[k] = np.zeros(n_frames)
        for i in (v * n_frames).round():