def extract_channel_text(self, wav_path):
        name = basename(wav_path).replace(".sph", "")

        anno_channel0 = [f for f in self.all_anno_paths if name + "A" in f][0]
        anno_channel1 = [f for f in self.all_anno_paths if name + "B" in f][0]

        duration = get_duration_sox(wav_path)

        vad, words, timed_words, silence, noise = self.extract_text(
            anno_channel0, duration
        )
        vad1, words1, timed_words1, silence1, noise1 = self.extract_text(
            anno_channel1, duration
        )

        self.save_lists(
            name,
            duration,
            [vad, vad1],
            [words, words1],
            [timed_words, timed_words1],
            [silence, silence1],
            [noise, noise1],
        )
def get_all_vads_wavs(root_path, vad_path, audio_path, save=False):
    makedirs(vad_path, exist_ok=True)
    makedirs(audio_path, exist_ok=True)

    prefixes = {
        f[:5]
        for f in listdir(root_path)
        if isfile(join(root_path, f)) and not f.startswith("log")
    }
    for i in tqdm(prefixes):
        prefix = str(i).zfill(3)
        user_xml = join(root_path, prefix + "_user.xml")
        user_wav = join(root_path, prefix + "_user.wav")
        system_xml = join(root_path, prefix + "_system.xml")
        system_wav = join(root_path, prefix + "_system.wav")

        dur_user = get_duration_sox(user_wav)
        dur_system = get_duration_sox(system_wav)

        dur = max(dur_user, dur_system)

        vad_user_list, dur_user = get_vad(user_xml, user_wav, duration=dur)
        vad_sys_list, dur_system = get_vad(system_xml, system_wav, duration=dur)
        vad = [vad_user_list, vad_sys_list]

        if save:
            vpath = join(vad_path, prefix)
            makedirs(vpath, exist_ok=True)
            np.save(join(vpath, "vad.npy"), vad, allow_pickle=True)
            to_path = join(audio_path, f"{prefix}.wav")
            system(f"sox -M {user_wav} {system_wav} {to_path}")
        else:
            n_frames = round(dur / 0.05)
            vad_oh = list_percentage_to_onehot(vad, n_frames)
            plt.close()
            visualize_vad(vad_oh)
            plt.xlim([0, vad_oh.shape[1]])
            plt.tight_layout()
            plt.pause(0.01)
            input()
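
A minimal call sketch; the directory names are placeholders for wherever the prefix-named xml/wav pairs live:

get_all_vads_wavs(
    root_path="data/robot_raw",     # assumed layout: 001_user.wav, 001_user.xml, 001_system.wav, ...
    vad_path="data/robot_vad",
    audio_path="data/robot_audio",
    save=True,                      # merge the two channels with `sox -M` and store vad.npy per prefix
)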
Example 3
def get_vads_holds_shifts_events_train(xml_path, wav_path):
    """
    Gets VAD from annotation of starts and ends. The events are manually labeled as what should have happened.
    The event times are at the end of the user utterances.

    In turntaking repo events are automatically annotated from what actually happened in the audio.
    events are stored as list of tuples and transformed to numpy with the following structure:

        event = [(time, next_speaker, prev_speaker), ..., (time, next_speaker, prev_speaker)]

    shifts and holds are then calculated from those values.
    """
    data = minidom.parse(xml_path)
    total_duration = get_duration_sox(wav_path)

    tracks = data.getElementsByTagName("track")
    tracks = [t.attributes["id"].value for t in tracks]
    ch0, ch1 = [], []
    holds, shifts = [], []
    events = []
    prev_speaker = False
    for seg in data.getElementsByTagName("segment"):
        start = float(seg.attributes["start"].value)
        end = float(seg.attributes["end"].value)
        features = seg.getElementsByTagName("features")
        feedback = None
        if len(features) == 1:
            for f in features[0].getElementsByTagName("feature"):
                name = f.getAttribute("name")
                if "feedback" == name:
                    feedback = f.firstChild.data

        if seg.attributes["track"].value == tracks[0]:
            ch0.append((start, end))
            if feedback is not None:
                if feedback == "hold":
                    holds.append(end)
                    next_speaker = prev_speaker
                elif feedback == "respond":
                    shifts.append(end)
                    next_speaker = not prev_speaker
                elif feedback == "optional":
                    next_speaker = 2.0
                events.append((end, float(next_speaker), float(prev_speaker)))
        else:
            ch1.append((start, end))

    events = np.array(events, dtype=np.float32)
    events[:, 0] /= total_duration
    shifts = np.array(shifts, dtype=np.float32) / total_duration
    holds = np.array(holds, dtype=np.float32) / total_duration
    vad0 = np.array(ch0, dtype=np.float32) / total_duration
    vad1 = np.array(ch1, dtype=np.float32) / total_duration
    return (vad0, vad1), events, {"shifts": shifts, "holds": holds}
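
All returned times are normalized by the recording's total duration, so downstream code maps them back to frame indices before use. A minimal sketch of that conversion, assuming the 50 ms frame step used by the other examples in this file; events and named stand for the second and third return values above:

import numpy as np

def events_to_frames(events, named, total_duration, time_step=0.05):
    # Map normalized event/shift/hold times back to frame indices (illustrative sketch).
    n_frames = int(total_duration / time_step)
    event_frames = (events[:, 0] * n_frames).round().astype(int)   # column 0 holds the normalized time
    shift_frames = (named["shifts"] * n_frames).round().astype(int)
    hold_frames = (named["holds"] * n_frames).round().astype(int)
    return event_frames, shift_frames, hold_frames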
def get_annotation(fname, anno_path, audio_path):
    """
    Extracts vad-timings, words, silences and noise

    fname:          str, e.g. 'q1ec1.wav'
    anno_path:      path, path to annotations
    audio_path:     path, path to wavs (for duration)
    """

    # File info
    wav_path = join(audio_path, fname)
    duration = get_duration_sox(wav_path)

    name = fname.strip(".wav")
    tu_path = join(anno_path, "Data/timed-units")
    pos_path = join(anno_path, "Data/pos")
    token_path = join(anno_path, "Data/tokens")

    pos_xml_path = join(pos_path, name)
    tu_xml_path = join(tu_path, name)
    token_xml_path = join(token_path, name)

    tu_path_ch0 = tu_xml_path + ".g.timed-units.xml"
    tu_path_ch1 = tu_xml_path + ".f.timed-units.xml"
    pos_path_ch0 = pos_xml_path + ".g.pos.xml"
    pos_path_ch1 = pos_xml_path + ".f.pos.xml"
    token_path_ch0 = token_xml_path + ".g.tokens.xml"
    token_path_ch1 = token_xml_path + ".f.tokens.xml"

    # Timed-unit xmls: contain the words with timings
    tu_ch0 = get_timed_units(tu_path_ch0, duration)
    tu_ch1 = get_timed_units(tu_path_ch1, duration)

    # POS xmls: part-of-speech tags
    pos_ch0 = get_pos(pos_path_ch0, tu_path_ch0, token_path_ch0, duration)
    pos_ch1 = get_pos(pos_path_ch1, tu_path_ch1, token_path_ch1, duration)

    vad = [tu_ch0["vad"], tu_ch1["vad"]]
    words = [tu_ch0["words"], tu_ch1["words"]]
    timed_words = [tu_ch0["timed_words"], tu_ch1["timed_words"]]
    pos = [pos_ch0, pos_ch1]
    silence = [tu_ch0["silence"], tu_ch1["silence"]]
    noise = [tu_ch0["noise"], tu_ch1["noise"]]
    return {
        "vad": vad,
        "words": words,
        "timed_words": timed_words,
        "pos": pos,
        "silence": silence,
        "noise": noise,
        "duration": duration,
    }
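
A minimal usage sketch; the annotation and audio roots are placeholders for a Map Task-style layout containing the Data/timed-units, Data/pos and Data/tokens folders used above:

anno = get_annotation("q1ec1.wav",
                      anno_path="data/maptask_annotations",  # placeholder
                      audio_path="data/maptask_audio")       # placeholder
vad_g, vad_f = anno["vad"]   # channel 0 comes from the ".g." files, channel 1 from the ".f." files
print(anno["duration"], len(anno["words"][0]), len(anno["words"][1]))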
def save_switchboard_vad_treebank(audio_path, anno_path=None, save_path=None):
    if save_path is None:
        save_path = "data/vad_treebank"

    if anno_path is None:
        anno_path = "data/annotation_treebank/data/alignments"

    makedirs(save_path, exist_ok=True)

    # Get all annotation files
    anno_paths = glob(join(anno_path, "**/*.text"), recursive=True)
    anno_paths.sort()

    skip = []
    wav_files = [f for f in listdir(audio_path) if f.endswith(".wav")]
    for wav in tqdm(wav_files):
        name = wav.strip(".wav").replace("sw0", "sw")
        anno = [f for f in anno_paths if name in f]
        anno.sort()

        # Missing annotations ?
        if len(anno) == 0:
            skip.append(wav)
            continue

        # Paths
        session_path = join(save_path, wav.replace(".wav", ""))
        makedirs(session_path, exist_ok=True)
        word_path = join(session_path, "words.npy")
        vad_path = join(session_path, "vad.npy")
        noise_path = join(session_path, "noise.npy")
        silence_path = join(session_path, "silence.npy")

        # check if all features already exists
        if (exists(word_path) and exists(vad_path) and exists(silence_path)
                and exists(noise_path)):
            continue

        wpath = join(audio_path, wav)
        duration = get_duration_sox(wpath)

        vad0, words0, silence0, noise0 = extract_treebank_data(
            anno[0], duration)
        vad1, words1, silence1, noise1 = extract_treebank_data(
            anno[1], duration)

        np.save(word_path, (words0, words1), allow_pickle=True)
        np.save(vad_path, (vad0, vad1), allow_pickle=True)
        np.save(noise_path, (noise0, noise1), allow_pickle=True)
        np.save(silence_path, (silence0, silence1), allow_pickle=True)

    print(f"Skipped {len(skip)}/{len(wav_files)} files")
Example 6
def get_vad(xml_path, wav_path):
    total_duration = get_duration_sox(wav_path)
    data = minidom.parse(xml_path)
    tracks = data.getElementsByTagName("track")
    tracks = [t.attributes["id"].value for t in tracks]
    ch0, ch1 = [], []
    for seg in data.getElementsByTagName("segment"):
        start = float(seg.attributes["start"].value)
        end = float(seg.attributes["end"].value)
        if seg.attributes["track"].value == tracks[0]:
            ch0.append((start, end))
        else:
            ch1.append((start, end))
    vad0 = np.array(ch0, dtype=np.float32) / total_duration
    vad1 = np.array(ch1, dtype=np.float32) / total_duration
    return (vad0, vad1)
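
Since get_vad returns (start, end) pairs normalized to [0, 1], a common next step is expanding them into a frame-level binary matrix, as the training-set snippet at the end of this file does. A compact sketch of that conversion (the 50 ms frame step is an assumption):

import numpy as np

def vad_to_onehot(vad, duration, time_step=0.05):
    # Expand normalized (start, end) segments into a (2, n_frames) binary activity matrix.
    n_frames = round(duration / time_step)
    out = np.zeros((2, n_frames), dtype=np.float32)
    for ch, segments in enumerate(vad):
        for start, end in (segments * n_frames).round().astype(int):
            out[ch, start:end] = 1.0
    return out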
Example 7
def load_labels(wname, time_step=0.05):
    root = join(expanduser("~"), "SpeechCorpus/Robot/data/training_set")
    wav_path = join(root, "audio", wname + ".wav")
    v_path = join(root, "vad", wname, "vad.npy")
    e_path = join(root, "vad", wname, "events.npy")
    en_path = join(root, "vad", wname, "events_named.npy")

    vad = np.load(v_path, allow_pickle=True)
    event = np.load(e_path, allow_pickle=True)
    event_named = np.load(en_path, allow_pickle=True).item()

    dur = get_duration_sox(wav_path)
    n_frames = int(dur / time_step)

    vad_oh = get_onehot_vad(vad, n_frames)
    shifts = (event_named["shifts"] * n_frames).astype(np.int)
    holds = (event_named["holds"] * n_frames).astype(np.int)

    return vad_oh, shifts, holds
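
Loading the labels for one session is then a single call; the session name is the one used by the script below, and the (2, n_frames) shape of vad_oh is assumed from how it is consumed elsewhere in this file:

vad_oh, shifts, holds = load_labels("1_session_001", time_step=0.05)
print(vad_oh.shape)             # assumed (2, n_frames)
print(shifts[:5], holds[:5])    # frame indices of labeled shifts / holds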
Example 8
    files = read_txt(hparams["data"]["train_files"])
    dset = ExtractFeaturesDataset(hparams, files, cache=False)

    d = dset["1_session_001"]
    f = dset.files[0]
    d = dset[f]

    wav_path = "/home/erik/SpeechCorpus/Robot/data/training_set/audio/1_session_001.wav"
    event_named_path = "/home/erik/SpeechCorpus/Robot/data/training_set/vad/1_session_001/events_named.npy"
    named = load_numpy(event_named_path).item()
    vad_path = (
        "/home/erik/SpeechCorpus/Robot/data/training_set/vad/1_session_001/vad.npy"
    )
    vad_idx = list(load_numpy(vad_path))

    dur = get_duration_sox(wav_path)

    n_frames = round(dur / hparams["features"]["time_step"])

    vad = np.zeros((2, n_frames))
    vad_idx[0] = (vad_idx[0] * n_frames).round()
    vad_idx[1] = (vad_idx[1] * n_frames).round()

    for i in range(2):
        for s, e in vad_idx[i]:
            vad[i, int(s) : int(e)] = 1

    events = {}
    for k, v in named.items():
        events[k] = np.zeros(n_frames)
        for i in (v * n_frames).round():