Beispiel #1
0
 def __init__(self, profile, bounds):
     """Copy the speaker profile fields and create empty phone/word tiers.

     ``profile`` is indexed with the keys "xml:id", "ageGroup", "role",
     "sex", "soc" and "dialect".  ``bounds[0]`` and ``bounds[1]`` are passed
     as the second and third positional arguments of both interval tiers.
     """
     speaker_id = profile["xml:id"]
     self.speak_id = speaker_id
     self.ageGroup = profile["ageGroup"]
     self.role = profile["role"]
     self.sex = profile["sex"]
     self.soc = profile["soc"]
     self.dialect = profile["dialect"]

     # Both tiers share the same bounds and are labelled with the speaker id.
     lower, upper = bounds[0], bounds[1]
     self.phoneInterval = textgrid.IntervalTier(
         f"{speaker_id} - phones", lower, upper)
     self.wordInterval = textgrid.IntervalTier(
         f"{speaker_id} - words", lower, upper)
Beispiel #2
0
def convert(word_file_a, phone_file_a, word_file_b, phone_file_b,
            textgrid_file):
    """
    Build a four-tier TextGrid (phones/words for speakers A and B) in memory
    and report how many entries could not be added.

    Parameters
    ----------
    word_file_a : str
        path to .A.phonwords file
    phone_file_a : str
        path to .A.phones file
    word_file_b : str
        path to B.phonwords file
    phone_file_b : str
        path to B.phones file
    textgrid_file :  str
        path to desired resulting textgrid (accepted but not used here: this
        function does not write the grid to disk)

    Returns
    -------
    tuple
        ``(error_count, int(phone_length_b) + int(phone_length_a))`` where
        ``error_count`` is the number of entries rejected with ``ValueError``
        and the phone lengths are element 4 of each ``get_lists`` result.
    """
    lists_a = get_lists(phone_file_a, word_file_a)
    lists_b = get_lists(phone_file_b, word_file_b)

    # Tier name paired with the entries that populate it, in output order.
    tier_plan = (
        ("phones_A", lists_a[0]),
        ("words_A", lists_a[1]),
        ("phones_B", lists_b[0]),
        ("words_B", lists_b[1]),
    )

    grid = tg.TextGrid()
    rejected = 0
    for tier_name, entries in tier_plan:
        tier = tg.IntervalTier(name=tier_name)
        for entry in entries:
            try:
                tier.add(float(entry.start), float(entry.end), entry.label)
            except ValueError:
                # Count the entry as an error and keep going.
                rejected += 1
        grid.append(tier)

    return (rejected, int(lists_b[4]) + int(lists_a[4]))
Beispiel #3
0
def chunked_vad(grid, rate, frames, wav, get_extra=True, return_qual=False):
    '''Run VAD over each utterance of ``grid`` and collect the ratios.

    The "word" and "phone" tiers are looked up by name in ``grid``.  For each
    utterance returned by ``chunk_utterances`` the result of
    ``get_section_vad`` is stored together with the utterance's time span.

    Parameters:
        grid: TextGrid containing tiers named "word" and "phone".
        rate: sample rate forwarded to ``second_to_frame`` and
            ``get_section_vad``.
        frames: audio data forwarded to ``get_section_vad``.
        wav: forwarded to ``get_custom_features`` when ``get_extra`` is set.
        get_extra: when truthy, append one "swipe_features_<i>" tier per
            feature index to ``grid``.
        return_qual: when truthy, return the "quality" tier instead of the
            ratio list (nothing is written to disk in that case).

    Returns:
        The quality tier if ``return_qual`` is truthy, otherwise a list of
        ``((start, end), ratio)`` tuples, one per utterance.
    '''
    utterances = chunk_utterances(grid)
    tier_names = grid.getNames()
    word = grid[tier_names.index("word")]
    phones = grid[tier_names.index("phone")]
    ratio_list = []

    # The quality tier is consumed by the get_extra and return_qual paths as
    # well as by the OUTPUT_UTTERANCES export, so it has to exist whenever any
    # of the three is requested (previously it was only created for
    # OUTPUT_UTTERANCES, and the other two paths raised NameError).
    quality = None
    if OUTPUT_UTTERANCES or get_extra or return_qual:
        quality = textgrid.IntervalTier(name="quality",
                                        minTime=word.minTime,
                                        maxTime=word.maxTime)

    for u in utterances:
        u_words = word[u]
        u_start = u_words[0].minTime
        u_end = u_words[-1].maxTime
        u_duration = u_end - u_start
        dur = second_to_frame(u_duration, rate)
        # Offset of the utterance relative to the start of the word tier.
        offset = second_to_frame(u_start - word.minTime, rate)
        ratio = get_section_vad(frames,
                                word,
                                dur,
                                rate,
                                offset=offset,
                                utterance=u_duration)
        ratio_list.append(((u_start, u_end), ratio))
        if quality is not None:
            quality.add(u_start, u_end, "{:.2f}".format(ratio[0]))

    if get_extra:
        swipe_feat = get_custom_features(wav, quality, phones)
        # One output tier per feature index; each phone takes the value of
        # the first matching entry in swipe_feat.
        for i in range(len(swipe_feat[0][1])):
            swipe_features = textgrid.IntervalTier(
                name="swipe_features_{}".format(i),
                minTime=grid.minTime,
                maxTime=grid.maxTime)
            for phone in phones:
                for (feat_phone, feature) in swipe_feat:
                    if phone == feat_phone:
                        swipe_features.add(phone.minTime, phone.maxTime,
                                           str(feature[i]))
                        break
            grid.append(swipe_features)

    if return_qual:
        return quality

    if OUTPUT_UTTERANCES:
        grid.append(quality)
        if OUTPUT_FINE:
            grid.append(get_fine_detail(frames, word, rate))
        grid.write(
            f"{OUTPUT_DIR}/{grid.name.split('.TextGrid')[0]}_chunked.TextGrid")
    return ratio_list
Beispiel #4
0
def export_textgrid(config, path, wav_path=None):
    """Export every discourse of the corpus as a TextGrid plus a ``.top`` file.

    For each discourse name ``d`` the phones are queried in order of
    ``begin``, written to a single interval tier (times divided by
    ``SAM_RATE``), and saved under ``path/<speaker>/<file>.TextGrid`` where
    ``<speaker>`` and ``<file>`` are the first two ``"_"``-separated parts of
    ``d``.  The same query result is passed to ``export_tops``.

    Parameters:
        config: passed to ``CorpusContext``.
        path: output root directory; per-speaker directories are created
            below it as needed.
        wav_path: optional mapping from ``<file>`` to a wav path.  When given
            and the key is present, ``mfcc`` is run on that wav file with the
            per-speaker directory and ``"mfcc_16khz.conf"``.  Discourses
            without an entry are skipped silently.
    """
    with CorpusContext(config) as c:
        for d in c.discourses:
            q = c.query_graph(c.phone)
            q = q.filter(c.phone.discourse.name == d)
            q = q.order_by(c.phone.begin)
            res = q.all()

            tier = tg.IntervalTier()
            for phone in res:
                tier.add(phone.begin / SAM_RATE, phone.end / SAM_RATE,
                         phone.label)
            grid = tg.TextGrid()
            grid.append(tier)

            name_parts = d.split("_")
            speaker, just_file = name_parts[0], name_parts[1]
            per_speaker_path = os.path.join(path, speaker)
            # exist_ok avoids the check-then-create race and also creates any
            # missing parent directories.
            os.makedirs(per_speaker_path, exist_ok=True)
            grid.write(os.path.join(per_speaker_path, just_file + ".TextGrid"))
            export_tops(res, os.path.join(per_speaker_path, just_file + ".top"))

            if wav_path is None:
                continue
            print("running ", just_file)
            # Only the lookup is guarded: a KeyError raised inside mfcc()
            # itself is a real failure and must not be swallowed.
            try:
                path_to_wav = wav_path[just_file]
            except KeyError:
                continue
            mfcc(path_to_wav, per_speaker_path, "mfcc_16khz.conf")
Beispiel #5
0
    def save_textgrid(self, output_file: str):
        """Write ``self.words`` and ``self.base_units`` to a TextGrid file.

        Start/end positions are divided by 16000 to obtain the tier times.
        If the last word ends before ``self.duration``, an empty interval is
        appended to the word tier to cover the remainder; with no words at
        all the empty interval spans ``0`` to ``self.duration``.

        Parameters:
            output_file: path of the TextGrid file to create (UTF-8).
        """
        sample_rate = 16000

        grid = textgrid.TextGrid()
        word_tier = textgrid.IntervalTier('words')
        phone_tier = textgrid.IntervalTier('phones')

        # Track the end of the last word explicitly instead of relying on the
        # loop variable, which is unbound when self.words is empty.
        last_end = 0
        for word in self.words:
            word_tier.add(word.start / sample_rate, word.end / sample_rate,
                          word.label)
            last_end = word.end

        if last_end < self.duration:
            word_tier.add(last_end / sample_rate,
                          self.duration / sample_rate, '')

        for phone in self.base_units:
            phone_tier.add(phone.start / sample_rate, phone.end / sample_rate,
                           phone.label)

        grid.append(word_tier)
        grid.append(phone_tier)

        # Explicit encoding so non-ASCII labels do not depend on the
        # platform's default codec.
        with open(output_file, 'w', encoding='utf-8') as f:
            grid.write(f)
Beispiel #6
0
    def save_textgrid(self, filename):
        """Write this alignment to ``filename`` as a two-tier TextGrid.

        The grid is named after the stem of ``filename`` and holds a 'phone'
        tier followed by a 'word' tier, all spanning ``self.start()`` to
        ``self.end()``.  Phonemes equal to ``pypar.SILENCE`` are marked 'sil'.
        """
        begin = self.start()
        finish = self.end()

        # Phoneme tier: silence gets the fixed mark 'sil'.
        phone_tier = textgrid.IntervalTier('phone', begin, finish)
        for phoneme in self.phonemes():
            label = str(phoneme)
            if label == pypar.SILENCE:
                label = 'sil'
            phone_tier.add(phoneme.start(), phoneme.end(), label)

        # Word tier: iterating the alignment yields its words.
        word_tier = textgrid.IntervalTier('word', begin, finish)
        for word in self:
            word_tier.add(word.start(), word.end(), str(word))

        # Assemble and write.
        grid = textgrid.TextGrid(Path(filename).stem, begin, finish)
        grid.extend([phon for phon in (phone_tier, word_tier)])
        grid.write(filename)
Beispiel #7
0
def examine_phonemes(grid):
    '''Append a "bad_phones" tier flagging implausibly long phones.

    The new tier starts as a single "good" interval spanning the "phone"
    tier.  Every phone whose mark is not one of "{LG}", "sil", "ns", "sp" and
    whose duration exceeds 2 is overwritten with "bad", widened on both sides
    by BAD_PHONE_BUFFER.  The same ``grid`` object is returned.
    '''
    exempt_marks = ("{LG}", "sil", "ns", "sp")

    phone_tier = grid[grid.getNames().index("phone")]
    flags = textgrid.IntervalTier(name="bad_phones",
                                  minTime=phone_tier.minTime,
                                  maxTime=phone_tier.maxTime)
    flags.add(phone_tier.minTime, phone_tier.maxTime, "good")

    for phone in phone_tier:
        # Exempt marks and phones of plausible length are left as "good".
        if phone.mark in exempt_marks or not phone.duration() > 2:
            continue
        overwrite_interval(flags,
                           phone.minTime - BAD_PHONE_BUFFER,
                           phone.maxTime + BAD_PHONE_BUFFER,
                           "bad")

    grid.append(flags)
    return grid
def extract_from_textgrid(label_path):
    """Validate the first tier of a TextGrid and count phonemes/syllables.

    Each interval mark (stripped) is checked against ``get_all_phon()``;
    known marks are copied to a 'phoneme' tier and counted, unknown ones are
    printed and counted as errors.  Marks found in ``get_shengmu()`` are
    buffered; the next mark that is not in ``get_shengmu()`` closes the
    buffer, and the buffered list is looked up (as ``str(list)``) in
    ``pp_dict_reverse``: a hit counts as a syllable, a miss as an error.

    Returns:
        ``(duration, phoneme_num, syllable_num, error_num)`` where
        ``duration`` is ``maxTime - minTime`` of the first tier.
    """
    errors = 0
    phonemes = 0
    syllables = 0

    kept_tier = textgrid.IntervalTier('phoneme')

    source_tier = textgrid.TextGrid.fromFile(label_path).tiers[0]
    duration = source_tier.maxTime - source_tier.minTime

    assert source_tier != None

    pending = []
    for interval in source_tier:
        phn = interval.mark.strip()

        # NOTE(review): raw_path is not defined in this function; presumably
        # a module-level global naming the file being processed — confirm.
        if phn in get_all_phon():
            kept_tier.addInterval(interval)
            phonemes += 1
        else:
            print(str(interval) + ' in file: ' + raw_path)
            errors += 1

        # Marks in get_shengmu() open (or extend) a pending syllable.
        if phn in get_shengmu():
            pending.append(phn)
            continue

        # Any other mark closes the pending syllable, if there is one.
        if pending:
            pending.append(phn)
            if str(pending) in pp_dict_reverse:
                syllables += 1
            else:
                print(str(interval) + ' in file: ' + raw_path)
                errors += 1
            pending.clear()

    return duration, phonemes, syllables, errors
def processAli(ali_ls):
    """Turn the lines of an alignment listing into a one-tier TextGrid.

    The first line is skipped.  Every other line has its last character
    dropped and is split on tabs; field 4 is the start, field 5 the duration
    and field 6 the label (only the part before the first "_" is kept).
    Zero-length entries are left out, and labels are mapped through
    ``phon_dict``.  Tier and grid both run from 0 to the end of the last
    parsed entry.
    """
    spans = []
    for row_no, line in enumerate(ali_ls):
        if row_no == 0:
            continue
        fields = line[:-1].split("\t")
        onset = decimal.Decimal(fields[4])
        offset = onset + decimal.Decimal(fields[5])
        # "SIL" has no underscore, so the split leaves it unchanged.
        label = fields[6].split("_")[0]
        spans.append((onset, offset, label))

    final_end = spans[-1][1]

    tier = textgrid.IntervalTier(name="PHONEMES", minTime=0, maxTime=final_end)
    for onset, offset, label in spans:
        if onset != offset:
            tier.add(onset, offset, phon_dict[label])

    grid = textgrid.TextGrid(name=None, minTime=0, maxTime=final_end)
    grid.append(tier)
    return grid
Beispiel #10
0
def get_fine_detail(frames, word, rate):
    '''Build a "fine_detail" tier marking where VAD[2] detected speech.

    ``frames`` is scanned in steps of one VAD window.  Positions that fall
    outside every interval of ``word`` are skipped.  Whenever the VAD verdict
    differs from the mark currently being tracked, the span since the last
    change is added to the tier with the old verdict as its label.
    '''
    frame_width = second_to_frame(SAMPLE_SIZE / 1000, rate)
    tier_start = float(word.minTime)
    tier_span = float(word.maxTime - word.minTime)

    fine_detail = textgrid.IntervalTier(name="fine_detail",
                                        minTime=word.minTime,
                                        maxTime=word.maxTime)

    detector = VAD[2]
    current_mark = detector.is_speech(frames[0:frame_width], rate)
    segment_start = word.minTime
    total = second_to_frame(tier_span, rate)

    for t in range(0, total, frame_width):
        # NOTE(review): the "/ 2" presumably converts a byte offset into a
        # sample index (2 bytes per sample) — confirm.
        now = t / rate / 2 + tier_start
        if word.indexContaining(now) is None:
            continue
        speech = detector.is_speech(frames[t:t + frame_width], rate)
        if current_mark != speech and segment_start != now:
            fine_detail.add(segment_start, min(now, word.maxTime),
                            str(current_mark))
            segment_start = now
            current_mark = speech
    return fine_detail
Beispiel #11
0
def write_textgrid(phone_sequence, sample_rate, filename):
    """
    Given a sequence of tuples of the form (label, start_sample, end_sample),
    output a .TextGrid file showing the phone labels and boundaries. Requires
    Kyle Gorman's textgrid library for Python, available at
    https://github.com/kylebgorman/textgrid.
    (Do pip install git+http://github.com/kylebgorman/textgrid.git for easy installation)

    Parameters:
        phone_sequence: non-empty sequence of (label, start_sample,
            end_sample) tuples; the end sample of the last tuple defines the
            end time of the grid.
        sample_rate: samples per second, used to convert samples to seconds.
        filename: path of the TextGrid to write; also used as the grid name.

    Raises:
        IndexError: if ``phone_sequence`` is empty.
    """
    sample_dur = 1 / sample_rate
    end_time = phone_sequence[-1][2] * sample_dur
    print("end_time=", end_time)

    # The grid is named after the output file (the original passed the
    # literal string 'filename' by mistake).
    my_textgrid = tg.TextGrid(name=filename, maxTime=end_time)
    my_tier = tg.IntervalTier(name='phones', maxTime=end_time)
    for label, start_sample, end_sample in phone_sequence:
        my_tier.add(minTime=start_sample * sample_dur,
                    maxTime=end_sample * sample_dur,
                    mark=str(label))
    my_textgrid.append(my_tier)

    my_textgrid.write(filename)
Beispiel #12
0
def read_textgrid_tier(textgrid_filename, vot_tier):
    """Return the non-blank intervals of tier ``vot_tier`` as a new tier.

    The TextGrid at ``textgrid_filename`` is read and a fresh interval tier
    with the grid's time bounds is created.  If a tier named ``vot_tier``
    exists, every interval whose mark contains a non-whitespace character is
    added to the new tier; otherwise two errors are logged and the new tier
    is returned empty.
    """
    grid = tg.TextGrid()
    grid.read(textgrid_filename)

    vots = tg.IntervalTier(minTime=grid.minTime, maxTime=grid.maxTime)

    # Missing tier: report it and hand back the empty tier.
    if vot_tier not in grid.getNames():
        logging.error("Tier %s not found in TextGrid %s" %
                      (vot_tier, textgrid_filename))
        logging.error(
            "(If you think the tier is there, perhaps there's extra whitespace in the tier name?)"
        )
        return vots

    # Keep only intervals whose mark is not blank.
    for interval in grid.getFirst(vot_tier):
        if re.search(r'\S', interval.mark):
            vots.addInterval(interval)

    return vots
def makeData(from_file, to_file, core):
    """Build labelled per-frame training rows for a slice of ``wavpaths``.

    For every wav file in ``wavpaths[from_file:to_file + 1]`` this function:

    1. computes MFCCs and two successive ``delta`` passes and joins them
       column-wise;
    2. reads the TextGrid named after the wav file from
       ``af_path + chunk_folder``;
    3. labels every frame once per entry of ``features`` via
       ``getFeatureLabel``;
    4. appends stacked rows (the acoustic columns of ``2 * frame_window + 1``
       consecutive frames, the ``corpora[corpus]`` value, and the labels of
       the frame in the middle of that window) to the CSV file
       ``af_path + "AF_en" + str(int(core) + running_cores) + ".csv"``;
    5. when ``produce_output_tg`` is truthy, writes a TextGrid with one tier
       per feature to ``af_path + "en_training_labels_tgs/"``.

    Parameters:
        from_file: index of the first entry of ``wavpaths`` to process.
        to_file: index of the last entry of ``wavpaths`` to process
            (inclusive).
        core: identifier printed with the progress line and, converted with
            ``int`` and offset by ``running_cores``, used in the CSV name.

    Returns nothing; all results are written to files.
    """
    for counter, wp in enumerate(wavpaths[from_file:to_file + 1], 1):
        rate, sig = scipy.io.wavfile.read(wp)
        np_mfcc = python_speech_features.mfcc(sig,
                                              rate,
                                              winlen=window,
                                              winstep=step)
        np_mfcc_d = python_speech_features.delta(np_mfcc, 2)
        np_mfcc_dd = python_speech_features.delta(np_mfcc_d, 2)
        # Static, delta and delta-delta columns side by side.
        np_mfcc_all = np.append(np.append(np_mfcc, np_mfcc_d, axis=1),
                                np_mfcc_dd,
                                axis=1)
        #        print(np_mfcc_all.shape)
        wn = wp.split("/")[-1]
        # get corpus info (decided by the first character of the file name)
        if wn[0] in ["F", "M"]:
            corpus = "ifa"
        elif wn[0] == "D":
            corpus = "ifadv"
        elif wn[0] == "p":
            corpus = "ecsd"
        else:
            corpus = "cgn-" + wn[0]
        # file name without its last extension
        sent_id = ".".join(wn.split(".")[:-1])
        print(core, counter, "/", to_file - from_file, sent_id)
        tg_path = af_path + chunk_folder + sent_id + ".TextGrid"
        tg = textgrid.TextGrid()
        with makeTempFile(tg_path) as tempf:
            tg.read(tempf.name)
        # Tier 2 supplies the intervals that frames are aligned against.
        intervals = tg.tiers[2].intervals
        end_time = round(intervals[-1].maxTime, 3)
        start_time = round(intervals[0].minTime, 3)
        # get number of syllables for each word
        word_intervals = tg.tiers[0].intervals
        word_syls = []
        word_segs = []
        label_dictionary = {key: [] for key in features}
        for w in word_intervals:
            n_syls = 0
            word_start = round(w.minTime, 3)
            word_end = round(w.maxTime, 3)
            # Locate the first/last tier-2 interval inside the word; the
            # asserts require the interval before the word and the last one
            # in the word to have a mark ending in "#".
            first_seg = tg.tiers[2].indexContaining(word_start + 0.001)
            assert first_seg == 0 or intervals[first_seg - 1].mark[-1] == "#"
            last_seg = tg.tiers[2].indexContaining(word_end - 0.001)
            assert intervals[last_seg].mark[-1] == "#"
            n_segs = last_seg + 1 - first_seg
            word_segs.append(n_segs)
            # One syllable is counted per interval whose mark is in `vowels`.
            for seg in intervals[first_seg:last_seg + 1]:
                if seg.mark.strip("#") in vowels:
                    n_syls += 1
            word_syls.append(n_syls)
        assert len(word_syls) == len(word_intervals)
        comments = tg.tiers[1].intervals
        #        print(start_time, end_time)
        classes = np.zeros((0, num_cols_per_frame))
        # int_i: index of the tier-2 interval the current frame starts in.
        int_i = 0
        num_frames = np_mfcc_all.shape[0]
        useable_frame_indices = []
        for frame in range(1, num_frames + 1):
            frame_s = round(start_time + (frame - 1) * step, 3)
            frame_e = frame_s + window
            if frame_e > end_time:  # because '0' samples can be appended to sig so it can be divided by an integer of frames
                frame_e = end_time
            intvl = intervals[int_i]
            if frame_s < round(intvl.minTime, 3):
                print(frame_s, round(intvl.minTime, 3))
            assert frame_s >= round(intvl.minTime, 3)
            if frame_e <= round(intvl.maxTime, 3):
                # The frame lies entirely inside the current interval.
                # calculate the proportion of the frame that is within the useable centre of the interval
                # this needs to be done separately for the nasalization feature, which should be aligned with end of intervals
                # for feat in ["@", "n", "~"]
                # pass feat to getFeatureLabel
                # which returns labels for each feat
                # construct label_list after for loop completes
                label_list = []
                for feat in features:
                    label = getFeatureLabel(frame_s, frame_e, feat, int_i,
                                            tg.tiers[2], tg.tiers[0],
                                            word_syls, word_segs, comments)
                    label_list.append(label)
                    label_dictionary[feat].append(label)
                # NOTE(review): the -3 threshold presumably relies on three
                # features each labelled -1 when unusable — confirm.
                if sum(label_list) > -3:
                    useable_frame_indices.append(frame - 1)
                row = np.array(
                    [np.append(np_mfcc_all[frame - 1, ], label_list)])
                classes = np.append(classes, row, axis=0)
            else:
                # The frame crosses one or more interval boundaries: collect
                # the overlap with every interval it touches and label it
                # using the interval with the largest overlap.
                assert frame_e > round(intvl.maxTime, 3)
                proportions = [(round(intvl.maxTime, 3) - frame_s, int_i)]
                new_int = intvl
                new_int_i = int_i
                next_int_i = int_i
                while frame_e > round(new_int.maxTime, 3):
                    new_int_i += 1
                    new_int = intervals[new_int_i]
                    overlap = (frame_e - round(new_int.minTime, 3)
                               ) if frame_e <= round(new_int.maxTime, 3) else (
                                   round(new_int.maxTime, 3) -
                                   round(new_int.minTime, 3))
                    proportions.append((overlap, new_int_i))
                    # Remember the last interval that has already begun by
                    # the time the next frame starts.
                    if (frame_s + step) >= round(new_int.minTime, 3):
                        next_int_i = new_int_i
                best_int_i = max(proportions)[1]
                # calculate the proportion of the frame that is within the useable centre of the interval
                label_list = []
                for feat in features:
                    label = getFeatureLabel(frame_s, frame_e, feat, best_int_i,
                                            tg.tiers[2], tg.tiers[0],
                                            word_syls, word_segs, comments)
                    label_list.append(label)
                    label_dictionary[feat].append(label)
                if sum(label_list) > -3:
                    useable_frame_indices.append(frame - 1)
                row = np.array(
                    [np.append(np_mfcc_all[frame - 1, ], label_list)])
                classes = np.append(classes, row, axis=0)
                int_i = next_int_i


#        print(useable_frame_indices)
        # Emit one CSV row per usable centre frame: the acoustic columns of
        # the surrounding 2 * frame_window + 1 frames, the corpus value, and
        # the centre frame's labels.
        for old_row in range(classes.shape[0]):
            if (old_row >= 2 * frame_window) and ((old_row - frame_window)
                                                  in useable_frame_indices):
                new_labels = classes[old_row - frame_window,
                                     num_cols_per_frame - len(features):]
                new_feat = classes[old_row - (2 * frame_window):old_row +
                                   1, :num_cols_per_frame -
                                   len(features)].flatten()
                new_row = np.array([
                    np.append(np.append(new_feat, corpora[corpus]), new_labels)
                ])
                # samples = np.append(samples, new_row, axis=0)
                with open(
                        af_path + "AF_en" + str(int(core) + running_cores) +
                        ".csv", "a") as f:
                    np.savetxt(f, new_row, fmt='%.5e', delimiter=",")
        if produce_output_tg:
            # Optional TextGrid with one tier per feature; consecutive frames
            # with the same label are merged into a single interval.
            tg_out = textgrid.TextGrid(minTime=float(start_time))
            for af in features:
                tier = textgrid.IntervalTier(name=af,
                                             minTime=float(start_time))
                prev_class = -1
                min_time = float(start_time)
                max_time = min_time + (
                    window - step
                ) / 2 + step  # in reality we can't draw clear boundaries between labels; we would need a 0, 1, and -1 tier for each feature to show overlapping frames
                for frame_n, lab_cl in enumerate(label_dictionary[af], 1):
                    if prev_class != lab_cl:
                        # Label changed: close the interval of the previous
                        # label (nothing to close on the first frame).
                        if frame_n != 1:
                            tier.add(min_time, max_time, str(prev_class))
                            min_time = max_time
                    max_time = float(start_time) + (window -
                                                    step) / 2 + frame_n * step
                    if len(label_dictionary[af]) == frame_n:
                        # Last frame: extend to the end of its window and
                        # close the final interval.
                        max_time += (window - step) / 2
                        tier.add(min_time, max_time, str(lab_cl))
                        break
                    prev_class = lab_cl
                tg_out.append(tier)
            with open(
                    af_path + "en_training_labels_tgs/" + sent_id +
                    ".TextGrid", "w") as f:
                tg_out.write(f)
Beispiel #14
0
        # for the purposes of testing,
        # just match the one file
        if name.endswith(".TextGrid"):
            print("Processing {}".format(name))

            tg = textgrid.TextGrid()
            tg.read(os.path.join(root, name))

            # also make a new TG to write to
            ntg = textgrid.TextGrid()

            ## start iterating through the tiers
            for tier in tg:

                # make new interval tier with speaker name
                ntier = textgrid.IntervalTier(name=tier.name)

                # loop through the intervals
                wordList = []
                intList = []
                for interval in tier.intervals:

                    # add non-silence to a word list
                    if interval.mark != args.separator:
                        wordList.append(interval.mark)
                        intList.append(interval)

                    else:
                        # check it's not just the start
                        if len(wordList) > 0:
                            # merge the list together,
Beispiel #15
0
def convert(word_file_a, phone_file_a, word_file_b, phone_file_b, textgrid_file):
    """
    Merge the word and phone files of speakers A and B into one TextGrid file.

    Each speaker contributes a phone tier and a word tier.  Entries that
    raise ``ValueError`` or ``IndexError`` while being added are skipped.
    Every phone/word tier pair is passed through ``extend_last`` before being
    appended to the grid.

    Parameters
    ----------
    word_file_a : str
        path to .A.phonwords file
    phone_file_a : str
        path to .A.phones file
    word_file_b : str
        path to B.phonwords file
    phone_file_b : str
        path to B.phones file
    textgrid_file :  str
        path to desired resulting textgrid

    """
    lists_a = get_lists(phone_file_a, word_file_a)
    lists_b = get_lists(phone_file_b, word_file_b)

    # Tier name paired with its entries, in output order (phone, word per
    # speaker).
    tier_plan = (
        ("A - phone", lists_a[0]),
        ("A - word", lists_a[1]),
        ("B - phone", lists_b[0]),
        ("B - word", lists_b[1]),
    )

    filled = []
    for tier_name, entries in tier_plan:
        tier = tg.IntervalTier(name=tier_name)
        for entry in entries:
            try:
                tier.add(float(entry.start), float(entry.end), entry.label)
            except (ValueError, IndexError):
                continue
        filled.append(tier)

    grid = tg.TextGrid()
    # Walk the tiers as (phone, word) pairs.
    for phone_tier, word_tier in zip(filled[0::2], filled[1::2]):
        fixed = extend_last(phone_tier, word_tier)
        grid.append(fixed[0])
        grid.append(fixed[1])

    grid.write(textgrid_file)
    filter(lambda x: x.endswith(file_ending), os.listdir(TEMP_DIR)))
# Convert every subtitle file in TEMP_DIR into a one-tier TextGrid in
# TEXTGRID_DIR and move the matching audio file next to it.
for subtitle in subtitle_files:
    # Skip subtitles that have no matching .wav file in TEMP_DIR.
    audio = subtitle.replace(file_ending, ".wav")
    if not os.path.isfile(os.path.join(TEMP_DIR, audio)):
        print("Audio file {} is missing".format(audio))
        continue

    # The tier is named after the part of the file name before the first ".".
    speaker = subtitle.split(".")[0]
    # Make sure the subtitles don't overlap, as there are overlapping
    # subtitles in YouTube's auto-generated subs.
    captions = webvtt.read(os.path.join(TEMP_DIR, subtitle)).captions

    # Even-indexed captions carry the time of the utterance (as well as the
    # time of the individual words); odd-indexed captions carry the actual
    # utterance string, so the two are paired up.
    tier = textgrid.IntervalTier(speaker)
    for cap_time, cap_string in zip(captions[::2], captions[1::2]):
        tier.add(cap_time.start_in_seconds, cap_time.end_in_seconds, \
                cap_string.text.strip())

    tg = textgrid.TextGrid()
    tg.append(tier)
    tg.write(
        os.path.join(TEXTGRID_DIR, subtitle.replace(file_ending, ".TextGrid")))
    shutil.move(os.path.join(TEMP_DIR, audio), \
                os.path.join(TEXTGRID_DIR, audio))

# Run the aligner on the generated TextGrids unless explicitly skipped.
if not args.skip_mfa:
    subprocess.run([os.path.join(MFA_BIN, "mfa_align"), TEXTGRID_DIR, \
            args.mfa_dict, args.mfa_model, ALIGNED_DIR, "--verbose"])
Beispiel #17
0
def gentle_to_grid(gentle_file, out_file=None):
    """Convert *.json file from Gentle to Praat TextGrid

    Parameters:
        gentle_file: path of the Gentle JSON file.  If it contains ``*`` it
            is treated as a glob pattern and every match is converted.
        out_file: destination path (its suffix is forced to ``.TextGrid``).
            Defaults to ``gentle_file`` with the suffix ``.TextGrid``.  Must
            be ``None`` for batch conversion.

    Words that Gentle could not align receive times interpolated between
    their aligned neighbours (an equal share of the gap each); trailing
    unaligned words get 100 ms each.  Times are rounded to milliseconds and
    overlaps are removed.  Words with an issue (case other than 'success',
    aligned to '<unk>', or shorter than 15 ms) are printed as a table and
    saved next to the output with the suffix ``.log``.

    Raises:
        TypeError: if ``out_file`` is given together with a glob pattern.
    """
    if '*' in gentle_file:
        if out_file is not None:
            raise TypeError("out can not be set during batch-conversion")
        for filename in glob(gentle_file):
            gentle_to_grid(filename)
        return

    gentle_file = Path(gentle_file)
    if out_file is None:
        out_file = gentle_file.with_suffix('.TextGrid')
    else:
        out_file = Path(out_file)
        if out_file.suffix.lower() != '.textgrid':
            out_file = out_file.with_suffix('.TextGrid')

    with gentle_file.open() as fid:
        g = json.load(fid)

    # find valid words
    words = g['words']
    n_issues = 0
    for word in words:
        if word['case'] == 'success':
            if word['alignedWord'] == '<unk>':
                n_issues += 1
                word['issue'] = 'OOV'
            else:
                word['issue'] = None
        else:
            n_issues += 1
            word['issue'] = word['case']

    # add missing times
    last_end = 0
    not_in_audio_words = []  # buffer of words without timing
    for word in words:
        if 'start' in word:
            if not_in_audio_words:
                # Split the gap before this word evenly among the buffered
                # words, so they tile it instead of each spanning all of it.
                duration = (word['start'] - last_end) / len(not_in_audio_words)
                for j, word_ in enumerate(not_in_audio_words):
                    word_['start'] = last_end + j * duration
                    word_['end'] = last_end + (j + 1) * duration
                not_in_audio_words = []
            last_end = word['end']
        else:
            not_in_audio_words.append(word)
    # Unaligned words after the last aligned one get 100 ms each.
    for word in not_in_audio_words:
        word['start'] = last_end
        word['end'] = last_end = last_end + 0.100

    # round times
    for word in words:
        word['start'] = round(word['start'], 3)
        word['end'] = round(word['end'], 3)

    # avoid overlapping words (walk backwards so each word is clipped to the
    # start of its successor)
    last_start = words[-1]['end'] + 1
    for word in reversed(words):
        if word['end'] > last_start:
            word['end'] = last_start
        if word['start'] >= word['end']:
            word['start'] = word['end'] - .001
        last_start = word['start']
        # gentle seems to work at 10 ms resolution
        # Every word already has an 'issue' key at this point, so test its
        # value: only words without another issue are flagged as short.
        if word['end'] - word['start'] < 0.015 and not word['issue']:
            word['issue'] = 'short'
            n_issues += 1

    # log issues
    if n_issues:
        log = fmtxt.Table('rrrll')
        log.cell('Time')
        log.cell('Duration', width=2)
        log.cells('Word', 'Issue')
        log.midrule()
        for word in words:
            if word['issue']:
                duration = word['end'] - word['start']
                d_marker = '*' if duration < 0.015 else ''
                log.cells(f"{word['start']:.3f}", d_marker, f"{duration:.3f}",
                          word['word'], word['issue'])
        print(log)
        log.save_tsv(out_file.with_suffix('.log'))

    # build textgrid
    phone_tier = textgrid.IntervalTier('phones')
    word_tier = textgrid.IntervalTier('words')
    for word in words:
        t = word['start']
        word_tstop = word['end']
        # add word
        word_tier.add(t, word_tstop, word['word'])
        # make sure we have at least one phone
        phones = word.get('phones', ())
        if not phones:
            phones = ({'phone': '', 'duration': word['end'] - word['start']}, )
        # add phones, clipped to the (possibly shortened) word boundary
        for phone in phones:
            tstop = min(round(t + phone['duration'], 3), word_tstop)
            if t >= tstop:
                continue
            mark = phone['phone'].split('_')[0].upper()
            if mark == 'OOV':
                continue
            phone_tier.add(t, tstop, mark)
            t = tstop
    grid = textgrid.TextGrid()
    grid.extend((phone_tier, word_tier))
    grid.write(out_file)
Beispiel #18
0
                     float_format="%.2f")
 pd_samples = pd.DataFrame(data=unseen_samples, columns=_CSV_COLUMNS)
 predict_test_input_fn = create_predict_input_fn(pd_samples,
                                                 batch_size=batch_size)
 tg = textgrid.TextGrid(minTime=float(start_time))
 for af in ["s"]:
     classifier = classifiers[af]
     classifier_output = list(
         classifier.predict(input_fn=predict_test_input_fn))
     predictions = np.array(
         [item['class_ids'][0] for item in classifier_output])
     print(af, predictions)
     probabilities = np.array(
         [item['probabilities'] for item in classifier_output])
     # construct textgrids
     tier = textgrid.IntervalTier(name=af, minTime=float(start_time))
     prev_class = 99
     min_time = float(start_time)
     max_time = min_time + frame_window * step
     tier.add(
         min_time, max_time, ""
     )  # add the empty interval for the initial buffer (due to windowing over preceding 5 frames)
     min_time = max_time
     # create 'probability' tier
     if af in expected_features:
         prob_i = expected_features[af]
         af_probs = list(probabilities[:, prob_i])
         #            print(af_probs)
         with open(
                 tens_path + "pred_textgrids/sep_mod/" + fragment_id + "_" +
                 af + ".IntensityTier", "w") as f:
Beispiel #19
0
def predict_files(core_num="", start_line=1, end_line=num_index_lines):
    """Classify every frame of a slice of wav fragments and save the results.

    For each path in ``file_paths[start_line - 1:end_line]`` the function
    computes MFCCs plus their first and second deltas, turns every run of
    ``2 * frame_window + 1`` consecutive frames into one sample (with the
    ``corpora`` entry of the fragment's corpus appended), runs each
    classifier in ``classifiers`` over the samples and writes:

    * ``<tens_path>pred_mfcc/<corpus>/<fragment_id>_sum.csv`` -- the
      ``describe()`` summary of the plain MFCCs,
    * ``<tens_path>pred_textgrids/<corpus>/<fragment_id>_<af>.IntensityTier``
      -- the per-frame probability of the class index given by
      ``expected_features[af]`` (only for features listed there),
    * ``<tens_path>pred_textgrids/<corpus>/<fragment_id>_<af>.TextGrid``
      -- the predictions, merged into one labelled interval per run of
      identical classes.

    The file name (without extension) must end in ``_<chan>_<start>_<end>``;
    ``start`` and ``end`` are parsed as floats and used as time bounds.

    Parameters
    ----------
    core_num : str
        label printed in front of each fragment id (progress output only)
    start_line : int
        1-based position of the first entry of ``file_paths`` to process
    end_line : int
        1-based position of the last entry of ``file_paths`` to process
    """
    for fp in file_paths[start_line - 1:end_line]:
        fragment_id = ".".join(fp.split(".")[:-1]).split("/")[-1]
        # The first character of the fragment id selects the corpus.
        if fragment_id[0] in ["F", "M"]:
            corpus = "ifa"
        elif fragment_id[0] == "D":
            corpus = "ifadv"
        elif fragment_id[0] == "p":
            corpus = "ecsd"
        else:
            corpus = "cgn-" + frag_fol
        print(core_num, fragment_id)
        # The last three "_"-separated fields are channel, start time and
        # end time; the channel is not needed here.
        _, start_time, end_time = fragment_id.split("_")[-3:]

        rate, sig = scipy.io.wavfile.read(fp)
        np_mfcc = python_speech_features.mfcc(sig, rate, winlen=window, winstep=step)
        np_mfcc_d = python_speech_features.delta(np_mfcc, 2)
        np_mfcc_dd = python_speech_features.delta(np_mfcc_d, 2)
        np_mfcc_all = np.concatenate((np_mfcc, np_mfcc_d, np_mfcc_dd), axis=1)

        # One sample per frame that has 2 * frame_window predecessors: the
        # flattened window of frames followed by the corpus entry.
        sample_rows = [
            np.append(
                np_mfcc_all[row - (2 * frame_window):row + 1, :num_feat_per_frame].flatten(),
                corpora[corpus])
            for row in range(np_mfcc_all.shape[0])
            if row >= 2 * frame_window
        ]
        # Stack once instead of re-allocating the matrix for every row; the
        # empty (0, num_feat) seed keeps the shape (and the width check)
        # when there is no complete window.
        unseen_samples = np.vstack([np.zeros((0, num_feat))] + sample_rows)

        # save a summary of the MFCCs for inspection
        summary_text = pd.DataFrame(np_mfcc).describe()
        summary_text.to_csv(tens_path + "pred_mfcc/" + corpus + "/" + fragment_id + "_sum.csv", float_format="%.2f")

        pd_samples = pd.DataFrame(data=unseen_samples, columns=_CSV_COLUMNS)
        predict_test_input_fn = create_predict_input_fn(pd_samples, batch_size=batch_size)
        start_t = float(start_time)
        tg = textgrid.TextGrid(minTime=start_t)
        for af in ["s"]:
            classifier = classifiers[af]
            classifier_output = list(classifier.predict(input_fn=predict_test_input_fn))
            predictions = np.array([item['class_ids'][0] for item in classifier_output])
            print(af, predictions)
            probabilities = np.array([item['probabilities'] for item in classifier_output])

            # construct the interval tier
            tier = textgrid.IntervalTier(name=af, minTime=start_t)
            prev_class = 99  # placeholder; never used as a label because frame 1 is not flushed
            min_time = start_t
            max_time = min_time + frame_window * step
            # empty interval for the initial buffer (the first frame_window
            # frames have no prediction because of the windowing)
            tier.add(min_time, max_time, "")
            min_time = max_time

            # write the per-frame probabilities as a Praat IntensityTier
            if af in expected_features:
                prob_i = expected_features[af]
                af_probs = list(probabilities[:, prob_i])
                with open(tens_path + "pred_textgrids/" + corpus + "/" + fragment_id + "_" + af + ".IntensityTier", "w") as f:
                    f.write('File type = "ooTextFile"\nObject class = "IntensityTier"\n\n{}\n{}\n{}\n'.format(start_time, end_time, len(af_probs)))
                    for frame_i, prob in enumerate(af_probs, 0):
                        f_time = min_time + frame_i * step
                        f.write('{}\n{}\n'.format(f_time, prob))

            # merge runs of identical predictions into intervals
            n_predictions = len(predictions)
            for frame_n, pred_cl in enumerate(predictions, 1):
                if prev_class != pred_cl and frame_n != 1:
                    # the class changed: close the interval of the previous class
                    tier.add(min_time, max_time, features[af][prev_class])
                    min_time = max_time
                max_time = start_t + (frame_window * step) + frame_n * step
                if n_predictions == frame_n:
                    # last frame: extend to the end of its analysis window
                    max_time += (window - step)
                    tier.add(min_time, max_time, features[af][pred_cl])
                    break
                prev_class = pred_cl

            # empty interval for the final buffer (frames at the end have no
            # prediction because of the windowing)
            end_t = float(end_time)
            if max_time < end_t:
                tier.add(max_time, end_t, "")
            tg.append(tier)
        # NOTE: `af` is the last feature of the loop above, so one TextGrid
        # holding all tiers is written under that feature's name.
        with open(tens_path + "pred_textgrids/" + corpus + "/" + fragment_id + "_" + af + ".TextGrid", "w") as f:
            tg.write(f)
Beispiel #20
0
            tg = textgrid.TextGrid()
            tg.read(os.path.join(root, name))

            for tier in tg.getNames():
                # ignore non-transcript tiers
                if re.search('(transcript.*)', tier) is None:
                    continue

                # make list of 'transcript' tiers & iterate through them
                transcriptTiers = re.findall('(transcript.*)', tier)
                for item in transcriptTiers:
                    pos = tg.getNames().index(item)

                    # create a new interval tier that will hold
                    # the cleaned intervals
                    wt = textgrid.IntervalTier(name=item)
                    tr = tg.getList(item)[0]

                    # remove original tier from TextGrid tier list
                    tg.tiers.pop(pos)

                    for interval in tr:
                        try:
                            wt.add(interval.minTime, interval.maxTime,
                                   strip_punct(interval.mark))
                        except Exception as e:
                            print(name, e)

                    # add new tier to list
                    tg.append(wt)
            # save textgrid
Beispiel #21
0
        new_tiers = [[] for x in tiers]
        for utterance in arg_grid[arg_grid.getNames().index("classification")]:
            if utterance.mark == "good":
                for tier, new_tier in zip(tiers, new_tiers):
                    bounds = (tier.indexContaining(float(utterance.minTime) + 0.0001), \
                              tier.indexContaining(float(utterance.maxTime) - 0.0001) + 1)
                    for j in range(bounds[0], bounds[1]):
                        interval = tier.intervals[j]
                        if interval.minTime >= utterance.minTime and interval.maxTime <= utterance.maxTime:
                            new_tier.append(interval)

        out_grid = textgrid.TextGrid(name=arg.split("/")[-1],
                                     minTime=arg_grid.minTime,
                                     maxTime=arg_grid.maxTime)
        for tier, new_tier in zip(tiers, new_tiers):
            out_tier = textgrid.IntervalTier(tier.name, tier.minTime,
                                             tier.maxTime)
            out_tier.intervals = new_tier
            out_grid.append(out_tier)
        out_grid.write(
            f"{OUTPUT_DIR}/{arg_grid.name.split('.TextGrid')[0]}_cleaned.TextGrid"
        )
        print(f"{i+1}/{len(sys.argv[1:])}")
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        print(f"problematic file, {arg}")
        problematic_files += [arg]

# Append every problematic file name, one per line, to the running log.
with open("bad_files.txt", "a+") as f:
    f.writelines(f"{x}\n" for x in problematic_files)
Beispiel #22
0
def get_utterances_p2_4(file, textgrid_file):
    """
    read utterances from a .trn file and write them to textgrid format
    textgrid tiers are based on speakers (each speaker |-> 1 tier, named
    "<speaker> - utterance")

    Each non-blank line holds tab-separated fields: start, end, an optional
    speaker field ending in ":", and the label. A line without a speaker
    field continues the speaker of the previous line. The per-speaker lists
    are passed through ``clean`` before the tiers are built (presumably
    where utterances separated by <.15s pauses are collapsed -- confirm
    against ``clean``).

    Intervals that cannot be placed on a tier (unparseable times, or an
    overlap that cannot be resolved by moving the start to the end of the
    tier's last interval) are dropped and counted; the count is printed.

    Parameters
    ----------
    file : str
        path to .trn file
    textgrid_file : str
        path to textgrid file

    Raises
    ------
    IndexError
        if a non-blank line has fewer than two fields, or the first line
        carries no speaker field
    """
    ordered_tups = []
    speakers = defaultdict(list)
    skipped = 0

    with open(file) as f1:
        lines = f1.readlines()

    for line in lines:
        if not line.strip():
            # blank line: carries no interval at all
            continue
        if '\x00' in line:
            line = "c".join(line.split('\x00'))
        splitline = re.split("\t+", line)

        start = splitline[0]
        end = splitline[1]
        # A line may stop after the times, or after the speaker; missing
        # fields are treated as empty instead of raising.
        third = splitline[2] if len(splitline) > 2 else ""
        if third.strip().endswith(":"):
            speaker = third
            label = splitline[3] if len(splitline) > 3 else ""
        else:
            # continuation line: same speaker as the previous record
            speaker = ordered_tups[-1][2]
            label = third

        label = remove_regex.sub("", label)
        label = sub_regex.sub("", label)
        label = breath_regex.sub("BREATH_WORD_SB", label)
        label = laugh_regex.sub("LAUGH_WORD_SB", label)
        # wrap a letter followed by a hyphen in square brackets
        label = re.sub("([a-zA-Z]-)", r"[\1]", label)

        speaker = re.sub(r"[:\s]", "", speaker)
        speaker = re.sub(">env", "env", speaker.lower())
        ordered_tups.append((start, end, speaker, label))
        speakers[speaker.lower()].append((start, end, label))

    speakers = clean(speakers)

    grid = tg.TextGrid()
    for speaker in speakers.keys():
        tier = tg.IntervalTier(name="{} - utterance".format(speaker.strip()))
        for tup in speakers[speaker]:
            try:
                start = float(tup[0])
                end = float(tup[1])
            except ValueError:
                # unparseable timestamp: the interval cannot be placed
                skipped += 1
                continue
            if start == end:
                # zero-length interval: silently ignored
                continue
            text = tup[2].strip()
            try:
                tier.add(start, end, text)
                continue
            except ValueError:
                # rejected by the tier (e.g. overlap); try to repair below
                pass

            try:
                previous = tier[-1]
            except IndexError:
                skipped += 1
                continue
            # Move the start forward by the amount it reaches into the
            # tier's last interval.
            difference = previous.maxTime - start
            shifted_start = start + difference
            if difference < 0 or shifted_start >= end:
                # not an overlap with the last interval, or nothing would
                # be left of the interval after the shift
                skipped += 1
                continue
            try:
                tier.add(shifted_start, end, text)
            except ValueError:
                # still conflicts with an existing interval
                skipped += 1
        if len(tier.intervals) > 0:
            grid.append(tier)

    grid.write(textgrid_file)

    print("skipped: {}".format(skipped))
Beispiel #23
0
        if name.endswith(".TextGrid"):
            print("Processing {}".format(name), end="...")

            ## read textgrid
            tg = textgrid.TextGrid()
            tg.read(os.path.join(root, name))

            # search textgrid for a tier with a specific name
            # if no name specified, look at the first tier
            for Tier in tg:
                if args.kana_tier is None:
                    args.kana_tier = tg.getNames()[0]

                if Tier.name == args.kana_tier:

                    romajiTier = textgrid.IntervalTier()
                    if args.speaker_as_dir:
                        path = os.path.basename(
                            os.path.dirname(os.path.join(root, name)))
                        romajiTier.name = path
                    else:
                        romajiTier.name = os.path.splitext(name)[0]

                    # convert kanji to romaji
                    for interval in Tier.intervals:

                        # skip intervals with dummy silence
                        if interval.mark == "#":
                            continue

                        # take interval information from original
    elif args.outformat == "textgrid":

        txg = textgrid.TextGrid()
        txg.minTime = 0
        transcr_wrd_starts = endpoints[0]
        transcr_wrd_ends = endpoints[1]
        transcr_wrd_txt = endpoints[2]
        transcr_ph_starts = endpoints[3]
        transcr_ph_ends = endpoints[4]
        transcr_ph_txt = endpoints[5]

        offsets = decoded_results["output"][0]["offsets"]

        txg.endTime = offsets[len(offsets) - 1]

        words_tier = textgrid.IntervalTier("words", 0,
                                           offsets[len(offsets) - 1])
        phones_tier = textgrid.IntervalTier("phones", 0,
                                            offsets[len(offsets) - 1])

        for jx in range(0, len(transcr_wrd_txt) - 1):
            #print(transcr_wrd_txt[jx]+" "+str(transcr_wrd_starts[jx])+" "+str(transcr_wrd_ends[jx]))
            #if transcr_wrd_starts[jx] == transcr_wrd_ends[jx]: transcr_wrd_ends[jx] += 0.01
            words_tier.addInterval(
                textgrid.Interval(transcr_wrd_starts[jx], transcr_wrd_ends[jx],
                                  transcr_wrd_txt[jx]))

        for px in range(0, len(transcr_ph_txt) - 1):
            #print(transcr_ph_txt[px]+" "+str(transcr_ph_starts[px])+" "+str(transcr_ph_ends[px]))
            #if transcr_ph_starts[jx] == transcr_ph_ends[jx]: transcr_ph_ends[jx] += 0.01
            #if transcr_ph_txt[px] == " ": transcr_ph_txt[px] = "SIL"
            phones_tier.addInterval(
Beispiel #25
0
import textgrid
import numpy as np

def linear_classifier(X):
    """Apply the fixed threshold rules to one utterance's feature mapping.

    ``X`` is indexed by "word-number", "duration", "mfa_found", "hnr" and
    "alignment-diff", in that order; evaluation stops at the first rule
    that fails, so later keys are not read in that case.

    Returns the outcome of the last rule evaluated: truthy only when
    word-number > 1, duration > 1, mfa_found == 1, hnr > 5.4 and
    alignment-diff < 0.03 all hold.
    """
    verdict = X["word-number"] > 1
    if verdict:
        verdict = X["duration"] > 1
    if verdict:
        verdict = X["mfa_found"] == 1
    if verdict:
        verdict = X["hnr"] > 5.4
    if verdict:
        verdict = X["alignment-diff"] < 0.03
    return verdict

# Name of the output directory; where it is used lies outside this excerpt.
OUTPUT_DIR = "out_with_labels"
# Counters starting at zero -- presumably the number of utterances seen and
# the number classified as good; confirm where they are incremented.
uts = 0
good_uts = 0

for i, arg in enumerate(sys.argv[1:]):
    if not os.path.isfile(arg):
        print("{} is not a valid file".format(arg))
    arg_grid = textgrid.TextGrid(name=arg.split("/")[-1])
    arg_grid.read(arg)
    classification = textgrid.IntervalTier(name="classification", minTime=arg_grid.minTime, maxTime=arg_grid.maxTime)
    word = arg_grid[arg_grid.getNames().index("word")]
    hnr_quality = arg_grid[arg_grid.getNames().index("hnr")]
    mfa_found =  arg_grid[arg_grid.getNames().index("mfa_found")]
    for utterance in arg_grid[arg_grid.getNames().index("quality")]:
        bounds = slice(word.indexContaining(float(utterance.minTime)+0.001), \
                         word.indexContaining(float(utterance.maxTime)-0.001) + 1)
        hnr_index = hnr_quality.indexContaining(utterance.minTime + utterance.duration()/2)
        mfa_index = mfa_found.indexContaining(utterance.minTime + utterance.duration()/2)
        if utterance.mark != "":
            X = {"duration": float(utterance.duration()),
                 "alignment-diff": float(utterance.mark),
                 "hnr": float(hnr_quality[hnr_index].mark),
                 "mfa_found": float(mfa_found[mfa_index].mark),
                 "word-number": len(word[bounds])}
            if X["duration"] > 1 and X["word-number"] > 1:
Beispiel #26
0
def _build_index_textgrid(text, cmu_dict, phoneme_pair, mispronunciation):
    """Build a TextGrid with one unit-length interval per word and per phone.

    The interval times are running indices (0-1, 1-2, ...), not alignment
    boundaries. With ``mispronunciation`` set, a phone equal to one member
    of ``phoneme_pair`` is marked "<phone>,<other member>,s".
    """
    grid = textgrid.TextGrid()
    words = text.split()

    word_tier = textgrid.IntervalTier(name='words')
    for idx, word in enumerate(words):
        word_tier.add(float(idx), float(idx + 1), word)

    phone_tier = textgrid.IntervalTier(name='phones')
    idx = 0
    for word in words:
        # first entry returned by the dictionary lookup for this word
        arpabet = cmu_dict.lookup(word)[0]
        for phoneme in arpabet.split():
            mark = phoneme
            if mispronunciation:
                if phoneme == phoneme_pair[0]:
                    mark = phoneme + ',' + phoneme_pair[1] + ',s'
                elif phoneme == phoneme_pair[1]:
                    mark = phoneme + ',' + phoneme_pair[0] + ',s'
            phone_tier.add(float(idx), float(idx + 1), mark)
            idx += 1

    grid.append(word_tier)
    grid.append(phone_tier)
    return grid


def main(args):
    """Synthesize every input text once per phoneme pair and save the results.

    For each text and each hard-coded phoneme pair, the text is converted
    with ``text_to_arpabet``, synthesized with the loaded model and vocoder,
    and three files named ``<prefix>_<NNNN>`` are written below
    ``args.output_dir``: ``wav/*.wav``, ``transcript/*.txt`` and
    ``annotation/*.TextGrid``. A combination is skipped when the conversion
    raises, when the converted text equals the input text (ignoring
    non-word characters), or when inference reports ``is_max_steps``.

    Parameters
    ----------
    args : namespace
        must provide ``tacotron_checkpoint_path``,
        ``waveglow_checkpoint_path``, ``cmudict_path``, ``input_text_file``,
        ``output_dir`` (an object with ``joinpath``), ``prefix`` and
        ``mispronunciation``
    """
    model, waveglow, denoiser, hparams = load_tts_vocoder_models(
        args.tacotron_checkpoint_path, args.waveglow_checkpoint_path)
    cmu_dict = load_cmudict(args.cmudict_path)
    texts = parse_input(args.input_text_file)

    phoneme_pairs = [
        'W V', 'IH0 IY0', 'IH1 IY1', 'IH2 IY2', 'EH0 AE0', 'EH1 AE1',
        'EH2 AE2', 'NG N', 'S TH', 'Z S', 'AA0 AH0', 'AA1 AH1', 'AA2 AH2',
        'UW0 UH0', 'UW1 UH1', 'UW2 UH2', 'DH D'
    ]

    wav_dir = args.output_dir.joinpath('wav')
    wav_dir.mkdir(exist_ok=True, parents=True)
    trans_dir = args.output_dir.joinpath('transcript')
    trans_dir.mkdir(exist_ok=True, parents=True)
    tg_dir = args.output_dir.joinpath('annotation')
    tg_dir.mkdir(exist_ok=True, parents=True)

    utt_i = 0
    for text in texts:
        # Punctuation-free copy for the annotation and the progress line.
        # It is kept separate from `text` so that every phoneme pair is
        # synthesized from, and saved with, the unmodified input text.
        plain_text = re.sub(r'[^\w ]', '', text)
        for phoneme_pair in phoneme_pairs:
            # Synthesize speech
            phoneme_pair = phoneme_pair.split()
            try:
                text_arpabet = text_to_arpabet(
                    cmu_dict,
                    phoneme_pair,
                    text,
                    swap_phoneme=args.mispronunciation)
            except Exception:
                # conversion failed for this text/pair: skip it, but let
                # KeyboardInterrupt / SystemExit propagate
                continue
            # skip when the conversion left the text unchanged
            if re.sub(r'[^\w]', '',
                      text_arpabet).strip() == re.sub(r'[^\w]', '',
                                                      text).strip():
                continue

            sequence = np.array(
                text_to_sequence(text_arpabet, ['english_cleaners']))[None, :]
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).cuda().long()
            _, mel_outputs_postnet, _, alignments, is_max_steps = model.inference(
                sequence)
            if is_max_steps:
                continue
            with torch.no_grad():
                wav = waveglow.infer(mel_outputs_postnet, sigma=0.666)

            utt_name = '{:s}_{:04d}'.format(args.prefix, utt_i + 1)
            wavfile.write(wav_dir.joinpath(utt_name + '.wav'),
                          hparams.sampling_rate,
                          wav.cpu().numpy().T)

            # Save transcript
            with open(trans_dir.joinpath(utt_name + '.txt'), 'w') as f:
                f.write(text)

            # Generate textgrid
            # The start and end times are just indices, not alignment
            # boundaries. Do not use them.
            tg = _build_index_textgrid(plain_text, cmu_dict, phoneme_pair,
                                       args.mispronunciation)
            tg.write(tg_dir.joinpath(utt_name + '.TextGrid'))

            print('{:d}: {:s} | {:s} | {:s}'.format(utt_i + 1, plain_text,
                                                    ' '.join(phoneme_pair),
                                                    text_arpabet))
            utt_i += 1
Beispiel #27
0
    if not os.path.isfile(arg):
        print("{} is not a valid file".format(arg))
        continue
    try:
        grid = textgrid.TextGrid(name=arg.split("/")[-1])
        grid.read(arg)
    except (AttributeError, ValueError):
        print(f"{arg.split('/')[-1]} can't load")
        continue
    utterances = chunk_utterances(grid)
    words = grid[grid.getNames().index("word")]
    phones = grid[grid.getNames().index("phone")]
    wav = grid.name.split('_')[0]
    name = grid.name.split('.TextGrid')[0]

    crop_wav(f"{WAV_DIR}/{wav}.wav", f"{OUTPUT_DIR}/{name}.wav", grid.minTime,
             grid.maxTime)
    out_grid = textgrid.TextGrid(name)
    speaker = textgrid.IntervalTier("speaker", 0, grid.maxTime - grid.minTime)
    for i, utterance in enumerate(utterances):
        utterance_text = ""
        maxTime = words[utterance.stop - 1].maxTime - grid.minTime
        minTime = words[utterance.start].minTime - grid.minTime
        for word in words[utterance]:
            if word.mark != "sp":
                utterance_text += " " + remap_words(word, phones)
        speaker.add(minTime, maxTime, utterance_text.strip())

    out_grid.append(speaker)
    out_grid.write(f"{OUTPUT_DIR}/{grid.name}")
Beispiel #28
0
                xmax_preds.append(xmin_proc_win[k] + xmax / 1000.0)
                if prevoicing_decision:
                    mark_preds.append("-{:.12g}".format(float(confidence)))
                else:
                    mark_preds.append("{:.12g}".format(float(confidence)))
            else:  # negative VOT
                xmin_preds.append(xmin_proc_win[k] + xmax / 1000.0)
                xmax_preds.append(xmin_proc_win[k] + xmin / 1000.0)
                mark_preds.append("neg {:.12g}".format(float(confidence)))
            k += 1

        # add "AutoVOT" tier to textgrid_filename
        textgrid = tg.TextGrid()
        textgrid.read(textgrid_file)
        auto_vot_tier = tg.IntervalTier(name='AutoVOT',
                                        minTime=textgrid.minTime,
                                        maxTime=textgrid.maxTime)
        auto_vot_tier.add(textgrid.minTime, xmin_preds[0], '')
        try:
            for i in range(len(xmin_preds) - 1):
                ## instead of mark_preds[i] (confidence number), just put 'pred' in the interval
                auto_vot_tier.add(xmin_preds[i], xmax_preds[i], 'pred')

                auto_vot_tier.add(xmax_preds[i], xmin_preds[i + 1], '')
            ## instead of mark_preds[i] (confidence number), just put 'pred' in the interval

            auto_vot_tier.add(xmin_preds[-1], xmax_preds[-1], 'pred')
            auto_vot_tier.add(xmax_preds[-1], textgrid.maxTime, '')
        except ValueError as e:
            logging.error(
                "Overlapping stops detected, textgrid output stopped at {}".