Example no. 1
def parse_maus_par(parfilename, sample_rate):
    ort_ivs = []
    mau_ivs = []
    with open(parfilename, 'r') as parfile:
        # print(parfilename)
        parreader = csv.reader(parfile, delimiter='\t', quotechar=None)
        for row in parreader:
            if row[0] == "ORT:":
                oiv = tgt.Interval(0, 0, row[2])
                oiv.has_begin_set = False
                ort_ivs.append(oiv)
                assert len(ort_ivs) == int(row[1]) + 1
            elif row[0] == "MAU:":
                ivbegin = float(row[1]) / sample_rate
                ivend = (float(row[1]) + float(row[2]) + 1) / sample_rate
                wnum = int(row[3])
                # print(wnum, ivbegin, ivend, row[4])
                # wnum == -1 marks segments (e.g. pauses) not linked to
                # any word, so only update word intervals for wnum >= 0.
                if wnum >= 0:
                    ort_ivs[wnum].end_time = ivend
                    if not ort_ivs[wnum].has_begin_set:
                        ort_ivs[wnum].start_time = ivbegin
                        ort_ivs[wnum].has_begin_set = True
                mau_ivs.append(tgt.Interval(ivbegin, ivend, row[4]))
    if not mau_ivs:
        return [], []
    for iv in ort_ivs:
        assert iv.has_begin_set, "Incomplete MAU tier in %s" % parfilename
    return ort_ivs, mau_ivs
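A minimal usage sketch for parse_maus_par; the file name and sample rate below are hypothetical, and the csv/tgt imports of the surrounding module are assumed:

import tgt

ort_ivs, mau_ivs = parse_maus_par("sample.par", 16000)
orttier = tgt.IntervalTier(name="ORT")
mautier = tgt.IntervalTier(name="MAU")
for iv in ort_ivs:
    orttier.add_annotation(iv)
for iv in mau_ivs:
    mautier.add_annotation(iv)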
Example no. 2
    def find_cycles(self, win_len=10, delta=1, lookahead=1,
                    include_holds=True, **kwargs):
        """Locate peaks and troughs in the signal."""

        resp_scaled = self._move_zscore(win_len * self.samp_freq)
        peaks, troughs = peakdetect(resp_scaled, delta=delta,
                                    lookahead=lookahead)

        # Make sure we start with an inhalation and end with an exhalation.
        if peaks[0] < troughs[0]:
            peaks = peaks[1:]
        if peaks[-1] > troughs[-1]:
            peaks = peaks[:-1]

        assert len(peaks) == len(troughs) - 1, \
            'Expected {} peaks, got {}'.format(len(troughs) - 1, len(peaks))

        # Store the results in an IntervalTier.
        inhalations = zip(troughs[:-1], peaks)
        exhalations = zip(peaks, troughs[1:])

        segments = tgt.IntervalTier(name='resp')
        for inh, exh in zip(inhalations, exhalations):
            inh_onset = inh[0] / self.samp_freq
            inh_offset = inh[1] / self.samp_freq
            exh_offset = exh[1] / self.samp_freq

            segments.add_interval(tgt.Interval(inh_onset, inh_offset, 'in'))
            segments.add_interval(tgt.Interval(inh_offset, exh_offset, 'out'))

        self.segments = segments

        if include_holds:
            # Pass kwargs to find_holds.
            self.find_holds(**kwargs)
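The pairing step above can be sanity-checked on toy data; the sample indices below are made up:

troughs = [10, 50, 90]   # inhalation onsets / exhalation offsets
peaks = [30, 70]         # inhalation offsets = exhalation onsets
inhalations = list(zip(troughs[:-1], peaks))  # [(10, 30), (50, 70)]
exhalations = list(zip(peaks, troughs[1:]))   # [(30, 50), (70, 90)]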
Example no. 3
def add_onsets_rhymes(title, input_path, output_path):

    # Load the textgrid
    tg = tgt.read_textgrid(os.path.join(input_path, title + '.TextGrid'))

    # Load name of all tiers
    tier_names = tg.get_tier_names()

    # Select a tier whose name contains 'sylls'
    sylls_tier_name = [name for name in tier_names if 'sylls' in name][0]
    sylls_tier = tg.get_tier_by_name(sylls_tier_name)

    # Select a tier whose name contains 'phones'
    phones_tier_name = [name for name in tier_names if 'phones' in name][0]
    phones_tier = tg.get_tier_by_name(phones_tier_name)

    # Start an empty tier for onset-rhymes
    onset_rhyme_tier = tgt.IntervalTier()
    onset_rhyme_tier_name = [name for name in tier_names
                             if 'words' in name][0].replace('words', 'OR')
    onset_rhyme_tier.name = onset_rhyme_tier_name

    onset_rhyme_intervals = []

    for syll in sylls_tier._get_annotations():

        #print(syll)
        phs = phones_tier.get_annotations_between_timepoints(
            syll.start_time, syll.end_time)

        nucleus_index = calculate_nucleus_index(phs)

        # If the nucleus is the first phone, the syllable has no onset,
        # so we only add a rhyme
        if nucleus_index == 0:
            onset_rhyme_intervals.append(
                tgt.Interval(syll.start_time, syll.end_time, 'R'))

        # If the onset is present add onset and rhyme intervals
        else:
            onset_rhyme_intervals.append(
                tgt.Interval(syll.start_time, phs[nucleus_index - 1].end_time,
                             'O'))

            onset_rhyme_intervals.append(
                tgt.Interval(phs[nucleus_index].start_time, syll.end_time,
                             'R'))

    # Add all the intervals to the onset rhyme tier
    onset_rhyme_tier.add_annotations(onset_rhyme_intervals)

    # Add the onset rhyme tier to the TextGrid
    tg.add_tier(onset_rhyme_tier)

    # Move syll tier after the onset_rhyme_tier
    tg.delete_tier(sylls_tier_name)
    tg.add_tier(sylls_tier)

    tgt.write_to_file(tg,
                      os.path.join(output_path, title + '.TextGrid'),
                      format='short')
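calculate_nucleus_index is not part of this excerpt. Judging from the comment about the nucleus phone carrying a number, a stand-in consistent with ARPAbet-style stress digits could look like this (an assumption, not the original function):

def calculate_nucleus_index(phones):
    # Index of the first phone whose label carries a stress digit
    # (e.g. 'AH0'), taken to be the syllable nucleus.
    for i, ph in enumerate(phones):
        if any(ch.isdigit() for ch in ph.text):
            return i
    return 0  # fall back: treat the first phone as the nucleus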
Example no. 4
def read_maus_alignments(tmpdir, offsets, orttier, mautier, sample_rate):
    logging.info("Reading MAUS alignments")
    for i, foffset in enumerate(offsets):
        intervalcnt = i + 1
        parfile = "%s/iv%s.par" % (tmpdir, intervalcnt)
        try:
            ort_ivs, mau_ivs = parse_maus_par(parfile, sample_rate)
            if not ort_ivs and foffset.transcription_valid:
                logging.warning("No alignment imported for interval %s: %s" %
                                (intervalcnt, foffset))
            for iv in ort_ivs:
                orttier.add_annotation(
                    tgt.Interval(iv.start_time + foffset.start_time,
                                 iv.end_time + foffset.start_time, iv.text))
            for iv in mau_ivs:
                mautier.add_annotation(
                    tgt.Interval(iv.start_time + foffset.start_time,
                                 iv.end_time + foffset.start_time, iv.text))
        except IOError:
            if foffset.transcription_valid:
                logging.warning("No alignment imported for interval %s: %s" %
                                (intervalcnt, foffset))
        except Exception:
            logging.error("Exception while parsing TextGrid %s" % parfile)
            raise
Example no. 5
    def get_textgrid(self):
        tg = tgt.TextGrid()
        t = tgt.IntervalTier(name='Word')
        for w in self.words.segments:
            t.add_interval(tgt.Interval(w.start, w.end, w.text))
        tg.add_tier(t)
        t = tgt.IntervalTier(name='Phoneme')
        for ph in self.phonemes.segments:
            t.add_interval(tgt.Interval(ph.start, ph.end, ph.text))
        tg.add_tier(t)

        return tgt.io.export_to_long_textgrid(tg)
def annotate(textGrid, annotatedTextGrid):
    utterance = annotatedTextGrid.tiers[0][0].text
    annotation = annotatedTextGrid.tiers[2][0].text
    st = textGrid.tiers[0].start_time
    et = textGrid.tiers[0].end_time
    uttInterval = tgt.Interval(start_time=st, end_time=et, text=utterance)
    annInterval = tgt.Interval(start_time=st, end_time=et, text=annotation)
    uttTier = tgt.IntervalTier(start_time=st, end_time=et, name="Utterance")
    annTier = tgt.IntervalTier(start_time=st, end_time=et, name="Annotation")
    uttTier.add_interval(uttInterval)
    annTier.add_interval(annInterval)
    textGrid.add_tier(uttTier)
    textGrid.add_tier(annTier)
    return textGrid
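A hedged usage sketch for annotate; the file names are placeholders and tgt is assumed to be imported:

tg = tgt.read_textgrid("utt0001.TextGrid")
annotated = tgt.read_textgrid("utt0001_annotated.TextGrid")
tg = annotate(tg, annotated)
tgt.write_to_file(tg, "utt0001_out.TextGrid", format="long")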
Example no. 7
    def find_holds(self, min_hold_dur=0.25, min_hold_gap=0.15,
                   peak_prominence=0.05, bins=100):
        """Locate respiratory holds.

        The method is based on the original MATLAB implementation in
        Breathmetrics (https://github.com/zelanolab/breathmetrics),
        adapted to the RIP signal. See also: Noto T, Zhou G, Schuele
        S, Templer J, & Zelano C (2018) Automated analysis of
        breathing waveforms using BreathMetrics: a respiratory signal
        processing toolbox. Chemical Senses (in press).
        """

        self._filt = self.filter_lowpass(cutoff=3, order=8, inplace=False)
        # self._filt = self.res

        # Identify inhalations and exhalations if not present.
        if self.segments is None:
            self.find_cycles()

        hold_cand = []

        for intr in self.segments:

            lo = round(intr.start_time * self.samp_freq)
            hi = round(intr.end_time * self.samp_freq)

            intr_holds = self._find_holds_within_interval(
                lo, hi, peak_prominence, bins)

            if intr_holds is not None:
                hold_cand += [(lo + h[0],  lo + h[1]) for h in intr_holds]

        if not hold_cand:
            return

        # Merge holds which lie closer than min_hold_gap and
        # exclude holds shorter than min_hold_dur.
        holds = []
        prev_hold = None

        for h in hold_cand:
            if prev_hold is None:
                prev_hold = h
            elif h[0] - prev_hold[1] < min_hold_gap * self.samp_freq:
                prev_hold = (prev_hold[0], h[1])
            else:
                if prev_hold[1] - prev_hold[0] >= min_hold_dur * self.samp_freq:
                    holds.append(prev_hold)
                prev_hold = h
        if prev_hold[1] - prev_hold[0] >= min_hold_dur * self.samp_freq:
            holds.append(prev_hold)

        # Build a holds tier.
        holds_tier = tgt.IntervalTier(name='holds')
        for lo, hi in holds:
            start = lo / self.samp_freq
            end = hi / self.samp_freq
            # Filter out holds overlapping with speech or inhalation:
            if (self.overlaps_speech(start, end)
                    or self.overlaps_inhalation(start, end)):
                continue
            holds_tier.add_interval(tgt.Interval(start, end, 'hold'))
        self.holds = holds_tier
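The merge-and-filter step above is easy to test in isolation; this standalone rewrite mirrors its logic (a sketch; both thresholds are in samples):

def merge_hold_candidates(hold_cand, min_gap, min_dur):
    # Merge candidates closer than min_gap samples, then drop merged
    # holds shorter than min_dur samples.
    holds, prev = [], None
    for h in hold_cand:
        if prev is None:
            prev = h
        elif h[0] - prev[1] < min_gap:
            prev = (prev[0], h[1])
        else:
            if prev[1] - prev[0] >= min_dur:
                holds.append(prev)
            prev = h
    if prev is not None and prev[1] - prev[0] >= min_dur:
        holds.append(prev)
    return holds

# merge_hold_candidates([(100, 130), (140, 200), (500, 520)], 15, 25)
# -> [(100, 200)]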
Example no. 8
def segment_speech_praat(wavfile,
                         channel=1,
                         denoise=False,
                         trainbegin=0,
                         trainwindow=1,
                         threshold=50,
                         min_sil_duration=0.02,
                         min_snd_duration=0.02):
    vadscript = os.path.join(os.path.dirname(sys.argv[0]), "vad.praat")
    result = util.call_check([
        "praat", "--run", vadscript,
        os.path.realpath(wavfile),
        str(channel),
        str(int(denoise)),
        str(trainbegin),
        str(trainwindow),
        str(threshold),
        str(min_sil_duration),
        str(min_snd_duration)
    ], True)
    speech_chunks = []
    intensities = []
    for line in result.decode().split("\n"):
        items = line.split("\t")
        if line.startswith("silence threshold"):
            logging.info(line)
        elif len(items) > 2:
            if items[0] == "chunk":
                iv = tgt.Interval(float(items[1]), float(items[2]), items[3])
                iv.as_db = float(items[3])
                speech_chunks.append(iv)
            elif items[0] == "itn":
                intensities.append(
                    IntensityVal(float(items[1]), float(items[2])))
    return speech_chunks, intensities
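IntensityVal is not defined in this excerpt; a namedtuple like the following would be consistent with how it is constructed (the field names are guesses):

from collections import namedtuple

# Two fields matching the two floats parsed from each "itn" line.
IntensityVal = namedtuple("IntensityVal", ["time", "db"])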
Example no. 9
def split_utterances(tmpdir, speechtiername, infile, wavfile, channel,
                     denoise):
    logging.info("Splitting audio into utterance segments")
    splitaudioscript = os.path.join(os.path.dirname(sys.argv[0]),
                                    "splitaudio.praat")
    if denoise:
        logging.warning("Assuming 1-4 seconds are non-speech for denoising")
    result = util.call_check([
        "praat", "--run", splitaudioscript,
        os.path.realpath(wavfile),
        os.path.realpath(infile),
        "%s/iv" % tmpdir,
        str(channel), speechtiername,
        str(int(denoise))
    ], True)
    offsets = []
    for line in result.decode().split("\n"):
        items = line.split("\t")
        if len(items) == 3:
            foffset = tgt.Interval(start_time=float(items[0]),
                                   end_time=float(items[1]),
                                   text=items[2])
            offsets.append(foffset)
    logging.info("Split completed: %s segments" % len(offsets))
    return offsets
Example no. 10
    def _merge_holds(cycles, holds):
        """Merge respiratory holds with the inhalation and exhalation
        boundaries."""

        i, j = 0, 0
        merged = tgt.IntervalTier()  # output tier, kept separate from the input cycles
        cur_intr = None
        while i < len(cycles) and j < len(holds):

            if merged:
                c_start = max(merged[-1].end_time, cycles[i].start_time)
            else:
                c_start = cycles[i].start_time
            c_end = min(cycles[i].end_time, holds[j].start_time)
            cur_intr = tgt.Interval(c_start, c_end, cycles[i].text)

            if cur_intr.start_time < holds[j].start_time:
                merged.add_interval(cur_intr)
            if cycles[i].end_time > holds[j].start_time:
                merged.add_interval(holds[j])
                j += 1
            if cycles[i].end_time <= merged[-1].end_time:
                i += 1

        return merged
Example no. 11
def add_lemmas(title, input1_path, output_path):

    # Load textgrid
    tg = tgt.read_textgrid(os.path.join(input1_path, title + '.TextGrid'))
    tier_names = tg.get_tier_names()

    # Load pos tier
    pos_tier_name = [name for name in tier_names if 'pos' in name][0]
    pos_tier = tg.get_tier_by_name(pos_tier_name)

    # Load words tier
    words_tier_name = [name for name in tier_names if 'words' in name][0]
    words_tier = tg.get_tier_by_name(words_tier_name)

    # Start empty lemmas tier
    lemmas_tier = tgt.IntervalTier()
    lemmas_tier_name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'lemmas')
    lemmas_tier.name = lemmas_tier_name

    # Generate lemma intervals
    lemmas_intervals = [
        tgt.Interval(w_interval.start_time, w_interval.end_time,
                     lemmatize_word(w_interval.text, pos_tier[i].text))
        for i, w_interval in enumerate(words_tier)
    ]

    # Add lemmas to tier
    lemmas_tier.add_annotations(lemmas_intervals)
    tg.add_tier(lemmas_tier)

    tgt.write_to_file(tg,
                      os.path.join(output_path, title + '.TextGrid'),
                      format='short')
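lemmatize_word is external to this excerpt; a minimal WordNet-based stand-in (an assumption; requires nltk with its 'wordnet' data) might look like:

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def lemmatize_word(word, pos_tag):
    # Map a Penn Treebank tag prefix to a WordNet POS; default to noun.
    pos = {'J': wordnet.ADJ, 'V': wordnet.VERB,
           'R': wordnet.ADV}.get(pos_tag[:1], wordnet.NOUN)
    return WordNetLemmatizer().lemmatize(word.lower(), pos)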
Example no. 12
def save_intervalframe_to_textgrid(framedict, filepath, encoding='utf-8'):
    """Write a dict of IntervalFrames in a textgrid-File.

       Arguments:
       framedict    --  Dictionary of dataframes. The keys become tier
                        names in the textgrid file
       filepath     --  Path + filename of the file to be written.

       Keyword arguments:
       encoding: character encoding to save textgrid file

    """

    if len(framedict) < 1:
        print("invalid data!")
        return
    mytextgrid = tgt.TextGrid()
    for tier_name in framedict.keys():
        newtier = framedict[tier_name]
        if len(newtier.columns) == 3:
            mytier = tgt.IntervalTier(name=tier_name)
            for row in newtier.index:
                myinterval = tgt.Interval(newtier[newtier.columns[0]][row],
                                          newtier[newtier.columns[1]][row],
                                          newtier[newtier.columns[2]][row])
                mytier.add_interval(myinterval)
        elif len(newtier.columns) == 2:
            mytier = tgt.PointTier(name=tier_name)
            for row in newtier.index:
                mypoint = tgt.Point(newtier[newtier.columns[0]][row],
                                    newtier[newtier.columns[1]][row])
                mytier.add_point(mypoint)
        mytextgrid.add_tier(mytier)
    tgt.write_to_file(mytextgrid, filepath, encoding=encoding, format="long")
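A usage sketch: a three-column DataFrame becomes an IntervalTier and a two-column one a PointTier (column names and file name are placeholders):

import pandas as pd

words = pd.DataFrame({"start": [0.0, 0.4], "end": [0.4, 0.9],
                      "text": ["hello", "world"]})
pitch = pd.DataFrame({"time": [0.1, 0.5], "f0": ["120", "180"]})
save_intervalframe_to_textgrid({"words": words, "pitch": pitch},
                               "example.TextGrid")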
Example no. 13
    def prepare_textgrid(self, df, offset):
        grid = tgt.TextGrid()
        tier = tgt.IntervalTier()
        tier.name = "Context"
        grid.add_tier(tier)
        for x in df.index:
            start = df.loc[x]["coq_word_starttime_1"]
            end = df.loc[x]["coq_word_endtime_1"]
            text = df.loc[x]["coq_word_label_1"]
            interval = tgt.Interval(start - offset, end - offset)
            interval.text = text
            tier.add_interval(interval)
        return grid
Example no. 14
def writeTextGrid(outfile, word_alignments):
    tg = tgt.TextGrid()
    phone_tier = tgt.IntervalTier(name='phone')
    word_tier = tgt.IntervalTier(name='word')

    for data in word_alignments:
        word = data[0]
        phones = data[1:]

        if len(phones) > 0:
            start_time = phones[0][1]
            end_time = phones[-1][2]

            word_tier.add_interval(
                tgt.Interval(start_time, end_time, text=word))

            for (p, p_start, p_end) in phones:
                phone_tier.add_interval(tgt.Interval(p_start, p_end, text=p))
    tg.add_tier(phone_tier)
    tg.add_tier(word_tier)

    tgt.io.write_to_file(tg, outfile, format='long')
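A usage sketch with made-up alignment data in the shape the function expects, i.e. [word, (phone, start, end), ...] per entry:

word_alignments = [
    ["hi", ("HH", 0.00, 0.05), ("AY1", 0.05, 0.20)],
    ["there", ("DH", 0.20, 0.25), ("EH1", 0.25, 0.45)],
]
writeTextGrid("hi_there.TextGrid", word_alignments)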
Example no. 15
    def find_holds(self, min_hold_dur=0.25, min_hold_gap=0.15,
                   peak_prominence=0.05, bins=100):

        # Identify inhalations and exhalations if not present.
        if self.segments is None:
            self.find_cycles()

        hold_cand = []
        # seg_samp = np.concatenate(
        #     (np.stack([self._troughs[:-1], self._peaks], axis=1),
        #      np.stack([self._peaks, self._troughs[1:]], axis=1)))

        for intr in self.segments:

            lo = round(intr.start_time * self.samp_freq)
            hi = round(intr.end_time * self.samp_freq)

            intr_holds = self._find_holds_within_interval(
                lo, hi, peak_prominence, bins)

            if intr_holds is not None:
                hold_cand += [(lo + h[0],  lo + h[1]) for h in intr_holds]

        if not hold_cand:
            return

        # Merge holds which lie closer than min_hold_gap and
        # exclude holds shorter than min_hold_dur.
        holds = []
        prev_hold = None

        for h in hold_cand:
            if prev_hold is None:
                prev_hold = h
            elif h[0] - prev_hold[1] < min_hold_gap * self.samp_freq:
                prev_hold = (prev_hold[0], h[1])
            else:
                if prev_hold[1] - prev_hold[0] >= min_hold_dur * self.samp_freq:
                    holds.append(prev_hold)
                prev_hold = h
        if prev_hold[1] - prev_hold[0] >= min_hold_dur * self.samp_freq:
            holds.append(prev_hold)

        # Build a holds tier.
        holds_tier = tgt.IntervalTier(name='holds')
        for lo, hi in holds:
            holds_tier.add_interval(
                tgt.Interval(lo / self.samp_freq, hi / self.samp_freq, 'hold'))
        self.holds = holds_tier
Example no. 16
    def format(self, syncmap):
        try:
            import tgt
        except ImportError as exc:
            self.log_exc(u"Python module tgt is not installed", exc, True, ImportError)
        # from https://github.com/hbuschme/TextGridTools/blob/master/tgt/io.py
        textgrid = tgt.TextGrid()
        tier = tgt.IntervalTier(name="Token")
        for fragment in syncmap.fragments:
            begin = float(fragment.begin)
            end = float(fragment.end)
            text = fragment.text_fragment.text
            if text == u"":
                text = u"SIL"
            interval = tgt.Interval(begin, end, text=text)
            tier.add_interval(interval)
        textgrid.add_tier(tier)
        if self.variant == self.DEFAULT:
            msg = tgt.io.export_to_long_textgrid(textgrid)
        else:
            msg = tgt.io.export_to_short_textgrid(textgrid)
        return gf.safe_unicode(msg)
def add_pos(title, input1_path, input2_path, output_path):

    # Load the textgrid
    tg = tgt.read_textgrid(os.path.join(input1_path, title + '.TextGrid'))

    # Load name of all tiers
    tier_names = tg.get_tier_names()

    # Select a tier whose name contains 'words'
    words_tier_name = [name for name in tier_names if 'words' in name][0]
    words_tier = tg.get_tier_by_name(words_tier_name)

    # Start an empty tier for POS_tags
    pos_tier = tgt.IntervalTier()
    pos_tier_name = [name for name in tier_names
                     if 'words' in name][0].replace('words', 'pos')
    pos_tier.name = pos_tier_name

    # Extract words intervals
    word_intervals = [w for w in words_tier._get_annotations()]

    # Extract words
    words = [w.text for w in words_tier._get_annotations()]

    # Load text
    txt = ''
    with open(os.path.join(input2_path, title + '.txt'), 'r',
              encoding='utf-8') as f:
        for l in f:
            l = ' '.join(l.split())
            for char in l.replace('\n', ' ').replace('\t', ' '):
                txt += char

    # Try my own tagger on the raw text and check whether its tokens match
    # the words in the original word tier. If they don't match, fall back to
    # feeding the tier's word list to the tagger (less accurate).

    my_tags = my_tagger(txt)
    if len(my_tags) == len(words):

        # True for every mismatch between words in words_tier and words produced by my_tagger
        mismatches = [
            True for i, tag in enumerate(my_tags) if tag[0] != words[i]
        ]

        # If everything matches up we can use my_tags, else we resort to the vanilla nltk one
        if True not in mismatches:
            POS_tags = my_tags

        else:
            POS_tags = nltk.pos_tag(words)

    else:
        print(title)
        POS_tags = nltk.pos_tag(words)

    pos_intervals = [
        tgt.Interval(interval.start_time, interval.end_time, POS_tags[i][1])
        for i, interval in enumerate(word_intervals)
    ]

    pos_tier.add_annotations(pos_intervals)

    tg.add_tier(pos_tier)

    tgt.write_to_file(tg,
                      os.path.join(output_path, title + '.TextGrid'),
                      format='short')
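my_tagger is external to this excerpt; the code above only relies on it returning (word, tag) pairs, so an nltk-based stand-in like this would satisfy that contract (an assumption, not the original tagger):

import nltk

def my_tagger(text):
    # Tokenize and POS-tag raw text; needs nltk's 'punkt' and
    # 'averaged_perceptron_tagger' data to be downloaded.
    return nltk.pos_tag(nltk.word_tokenize(text))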
Example no. 18
    def process_dataframe(self, df, grid, offset, end_time, left_padding,
                          right_padding, remember_time, grid_id):
        """
        Fill the grid by using the content of the data frame.
        """
        corpus_features = [x for x, _ in self.resource.get_corpus_features()]

        data_columns = [
            x for x in df.columns
            if ("_starttime_" not in x and "_endtime_" not in x)
        ]

        max_stop = end_time + left_padding + right_padding

        for col in data_columns:
            interval = None
            # add the corpus IDs if no real feature is selected:
            if col == "coquery_invisible_corpus_id":
                if self._artificial_corpus_id:
                    tier_name = "corpus_id"
                else:
                    continue
                number = 1
            elif col.startswith("coquery_invisible"):
                continue
            elif col.startswith(("func", "coquery", "db")):
                tier_name = self.session.translate_header(col)
            else:
                s = col.partition("coq_")[-1]
                rc_feature, _, number = s.rpartition("_")
                _, tab, feature = (
                    self.resource.split_resource_feature(rc_feature))
                tier_name = "{}_{}".format(tab, feature)

            tier = grid.get_tier_by_name(tier_name)
            if (not tier_name.startswith("segment")
                    and tier_name in corpus_features
                    and not self.resource.is_tokenized(tier_name)):
                # corpus feature -- add one interval that
                # covers the whole text grid
                content = utf8(df[col].values[0])
                stop = max_stop
                interval = tgt.Interval(0, stop, content)
                if len(tier.intervals) == 0:
                    tier.add_interval(interval)
            else:
                if not tier_name.startswith("segment"):
                    # lexical feature -- add one interval per entry
                    for i in df.index:
                        row = df.loc[i]
                        dtype = df.dtypes[col]
                        try:
                            val = utf8(row[col].astype(dtype))
                        except AttributeError:
                            val = utf8(row[col])
                        try:
                            label_s, label_e = self.feature_timing[tier_name]
                            start_col = "coq_{}_{}".format(label_s, number)
                            end_col = "coq_{}_{}".format(label_e, number)
                            start = left_padding - offset + row[start_col]
                            stop = left_padding - offset + row[end_col]
                        except KeyError:
                            start = 0
                            stop = max_stop

                        interval = tgt.Interval(start, stop, val)
                        try:
                            tier.add_interval(interval)
                        except ValueError as e:
                            # ValueErrors occur if the new interval overlaps
                            # with a previous interval.
                            # This can happen if no word boundaries are
                            # selected in a multi-word query.
                            pass
                            #logger.warn("{}: {} ({})".format(
                            #self.session.translate_header(tier.name),
                            #e, grid_id))
                else:
                    # segment features
                    start_label, end_label = self.feature_timing[tier_name]
                    start_col = "coq_{}_1".format(start_label)
                    end_col = "coq_{}_1".format(end_label)
                    for i in df.index:
                        row = df.loc[i]
                        val = utf8(row[col])
                        try:
                            start = row[start_col]
                            end = row[end_col]
                        except KeyError:
                            start = 0
                            end = end_time

                        interval = tgt.Interval(left_padding - offset + start,
                                                left_padding - offset + end,
                                                val)
                        try:
                            tier.add_interval(interval)
                        except ValueError as e:
                            logger.warn("{}: {} ({})".format(
                                self.session.translate_header(tier.name), e,
                                grid_id))
            if interval:
                # make sure that the tier is always correctly padded to the
                # right:
                tier.end_time = max(tier.end_time,
                                    interval.end_time + right_padding)
                tier.end_time = min(tier.end_time, max_stop)

        if remember_time:
            tier = grid.get_tier_by_name("Original timing")
            str_start = utf8(offset - left_padding)
            str_end = utf8(offset + grid.end_time - left_padding)
            tier.add_point(tgt.Point(0, str_start))
            tier.add_point(tgt.Point(grid.end_time, str_end))

        return grid
Example no. 19
def duration(path, C_list, V_list, cid):
    #file_list = glob.glob(path + r"\*\sent\*.TextGrid")  # glob all matching files and return them as a list
    file_list = glob.glob(path + r"\sent\*.TextGrid")  # glob all matching files and return them as a list
    #print(file_list)
    AlldeltS = []  # compute the result for each TextGrid in turn and collect it in these overall lists
    all_vs = []
    all_rpvis = []  # possibly problematic: values are appended on every call but never cleared
    all_npvis = []
    all_ms = []
    for file in file_list:
        TextGrid = tgt.read_textgrid(file, include_empty_intervals=True)  # read each TextGrid in turn
        if cid == 'jp':
            tier = TextGrid.get_tier_by_name(TextGrid.get_tier_names()[0])
            #print(tier)
        if cid == 'cn':
            tier = TextGrid.get_tier_by_name(TextGrid.get_tier_names()[0])
            #print(tier)
        elif cid == 'ru':
            tier = TextGrid.get_tier_by_name(TextGrid.get_tier_names()[1])  # pick the tier by name/position
            #print(tier)
        # tier = TextGrid.get_tier_by_name('SY')
        tier_name = TextGrid.get_tier_names()  # get all tier names
        start = tier.start_time
        end = tier.end_time
        start_syl = tier.start_time
        end_syl = tier.end_time
        tier2insert = tgt.IntervalTier(start, end, name='CV')  # insert a CV tier spanning start to end
        TextGrid.insert_tier(tier2insert, 3)
        CV = TextGrid.get_tier_by_name('CV')
        annotation = tier.intervals  # the intervals of the source tier
        #syllable = tier_syll.intervals
        num = []
        S_duration = []  # syllable durations
        duration_all_S = 0  # total S duration (used for %V and other related parameters)

        for i in range(len(annotation)):  # loop: relabel and accumulate durations
            old_name = annotation[i].text
            old_start_time = annotation[i].start_time
            old_end_time = annotation[i].end_time
            duration = old_end_time - old_start_time
            #if old_name in C_list:  # decide whether it is C or V
            if old_name != 'sil':
                new_name = 'S'
            # elif old_name in V_list:  # decide whether it is C or V
            #     new_name = 'S'
            else:
                new_name = 'none'

            # print(old_name, new_name)
            Interval = tgt.Interval(old_start_time, old_end_time, text=new_name)  # build the relabelled interval
            # print(old_name, new_name, 'duration=', duration)
            if new_name == 'S':
                S_duration.append(duration)  # append to the duration list
                duration_all_S = duration_all_S + duration

            CV.add_interval(Interval)  # write the relabelled interval into the TextGrid
        #print(file, S_duration)
        mean_syl = duration_all_S/len(S_duration)
        #print(mean_syl)
        vacS = duration_all_S / len(S_duration)
        # print(num)
        # if num > 0:
        # mean_syl = a / (len(C_duration) + len(V_duration))  # mean syllable duration, used for speech rate
        # print(mean_syl)
        # mean_syl = a/(len(C_duration)+len(V_duration))
        # print(mean_syl)
        #       vacroC = round(deltaC(C_duration) / mean_syl * 100, 4)
        #       vacroV = round(deltaC(V_duration) / mean_syl * 100, 4)
        vacroS = round(deltaS(S_duration) / vacS * 100, 4)
        # print(file, ',',
        #
        #       deltaS(S_duration), ',',
        #
        #       vacroS, ',',
        #
        #       rPVI_s(S_duration), ',',
        #
        #       nPVI_S(S_duration), ',',
        #       )
        #print(nPVI_S(S_duration))
        AlldeltS.append(deltaS(S_duration))
        all_vs.append(vacroS)
        all_rpvis.append(rPVI_s(S_duration))
        all_npvis.append(nPVI_S(S_duration))
        all_ms.append(mean_syl)
    deltS = round(np.mean(AlldeltS), 9)
    vs = round(np.mean(all_vs), 9)
    rpvis = round(np.mean(all_rpvis), 9)
    npvis = round(np.mean(all_npvis), 9)
    ms = round(np.mean(all_ms), 9)

    #
    print(path, ',',
          ms, ',',
          deltS, ',',
          vs, ',',
          rpvis, ',',
          npvis, ',',
          )
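deltaS, rPVI_s and nPVI_S are external to this excerpt; definitions consistent with the standard rhythm metrics (the PVI family of Grabe & Low) would be (a sketch, not necessarily the original code):

import numpy as np

def deltaS(durations):
    # Standard deviation of syllable durations.
    return np.std(durations)

def rPVI_s(durations):
    # Raw Pairwise Variability Index: mean absolute difference
    # between successive durations.
    return np.mean([abs(durations[i + 1] - durations[i])
                    for i in range(len(durations) - 1)])

def nPVI_S(durations):
    # Normalized PVI: each difference scaled by the local mean (x100).
    return 100 * np.mean([abs(durations[i + 1] - durations[i]) /
                          ((durations[i + 1] + durations[i]) / 2)
                          for i in range(len(durations) - 1)])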
def add_punctuation(title, textgrid_path, txt_path, output_path):

    txt = ''
    with open(os.path.join(txt_path, title + '.txt'), 'r',
              encoding='utf-8') as f:
        for l in f:
            l = ' '.join(l.split())
            for char in l.replace('\n', ' ').replace('\t', ' ').lower():
                txt += char

    word_non_words = [detect_non_words(w) for w in txt.split()]

    # Exclude non-words such as -' , - etc.
    txt_words = [w for w in word_non_words if w != '<punct>']

    # Strip words of punctuation before and after the first/last alphanum
    txt_words = [clean_word(w, title, txt) for w in txt_words]

    tg = tgt.read_textgrid(os.path.join(textgrid_path, title + '.TextGrid'))

    # Load name of all tiers
    tier_names = tg.get_tier_names()

    # Select a tier whose name contains 'words'
    word_tier_name = [name for name in tier_names if 'words' in name][0]
    word_tier = tg.get_tier_by_name(word_tier_name)
    word_list = [w.text for w in word_tier._get_annotations()]

    if len(word_list) == len(txt_words):

        w_indices = []
        w_indices.append(0)
        start = 0
        for lw in txt_words:
            idx = txt.find(lw, start, len(txt))
            start = idx + len(lw)
            w_indices.append(idx)
            w_indices.append(idx + len(lw))
        w_indices.append(len(txt))

        p_indices = [[w_indices[i], w_indices[i + 1]]
                     for i in range(0,
                                    len(w_indices) - 1, 2)]
        punctuation = [txt[i[0]:i[1]].replace(' ', '') for i in p_indices]
        punctuation[0] = 'start' + punctuation[0]
        punctuation[-1] = punctuation[-1] + 'end'
        punctuation = [p if p != '' else '_' for p in punctuation]

        bp = punctuation[0:-1]
        fp = punctuation[1:]

        word_durations = []
        for w in word_tier._get_annotations():
            word_durations.append(w)

        # Go through this list ([[w_dur1, w_dur2, ...], [w_dur1, w_dur2, ...],
        # ...]) and keep the first and the last duration of every word
        bp_tier = tgt.IntervalTier()
        bp_tier.name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'bp')
        bp_intervals = [
            tgt.Interval(word_durations[i].start_time,
                         word_durations[i].end_time, bp[i])
            for i in range(0, len(word_durations))
        ]
        bp_tier.add_annotations(bp_intervals)
        tg.add_tier(bp_tier)

        fp_tier = tgt.IntervalTier()
        fp_tier.name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'fp')
        fp_intervals = [
            tgt.Interval(word_durations[i].start_time,
                         word_durations[i].end_time, fp[i])
            for i in range(0, len(word_durations))
        ]
        fp_tier.add_annotations(fp_intervals)
        tg.add_tier(fp_tier)

    else:

        word_durations = []
        for w in word_tier._get_annotations():
            word_durations.append(w)

        bp = ['start'] + ['<unk>' for i in range(len(word_durations) - 1)]
        fp = ['<unk>' for i in range(len(word_durations) - 1)] + ['end']

        bp_tier = tgt.IntervalTier()
        bp_tier.name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'bp')
        bp_intervals = [
            tgt.Interval(word_durations[i].start_time,
                         word_durations[i].end_time, bp[i])
            for i in range(0, len(word_durations))
        ]
        bp_tier.add_annotations(bp_intervals)
        tg.add_tier(bp_tier)

        fp_tier = tgt.IntervalTier()
        fp_tier.name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'fp')
        fp_intervals = [
            tgt.Interval(word_durations[i].start_time,
                         word_durations[i].end_time, fp[i])
            for i in range(0, len(word_durations))
        ]
        fp_tier.add_annotations(fp_intervals)
        tg.add_tier(fp_tier)

    # For now we write the modified TextGrids to the same folder as the old
    # ones. Later, send the new files to a new folder
    tgt.write_to_file(tg,
                      os.path.join(output_path, title + '.TextGrid'),
                      format='short')
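detect_non_words and clean_word are external; these minimal stand-ins match the behaviour the comments above describe (assumptions, not the original helpers):

def detect_non_words(w):
    # Map tokens with no alphanumeric character (-', -, etc.) to '<punct>'.
    return w if any(ch.isalnum() for ch in w) else '<punct>'

def clean_word(w, title, txt):
    # Strip punctuation before/after the first/last alphanumeric character.
    idx = [i for i, ch in enumerate(w) if ch.isalnum()]
    return w[idx[0]:idx[-1] + 1] if idx else w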
Example no. 21
def annotate(title, xml_path, textgrid_path, annotations_path):
    try:

        tree = ET.parse(os.path.join(xml_path, title + '.xml'))
        root = tree.getroot()

        # The content comes from the XML file.
        # Format: [[ph1, ph2, ...], [ph1, ph2, ...], ...]
        stress_phone_seq = []
        # Format: [[0], [2], [1], ...]
        stress_seq = []
        for p in root[0]:
            for s in p:
                for phrase in s:
                    for word in phrase:
                        # get rid of words in xml that lack a phonemic counterpart in the textGrid
                        if word.text not in ('!', ',', '-', '.', '..', '...',
                                             ':', '?'):
                            for syllable in word:
                                stress_phone_group = []
                                stress_group = []
                                stress_group.append(syllable.attrib['stress'])
                                stress_seq.append(stress_group)
                                for ph in syllable:
                                    stress_phone_group.append(ph.attrib['p'])
                                stress_phone_seq.append(stress_phone_group)

        tg = tgt.read_textgrid(os.path.join(textgrid_path,
                                            title + '.TextGrid'))
        phones_tier = tg.get_tier_by_name('phones')
        word_tier = tg.get_tier_by_name('words')

        # word_durations = [w for w in word_tier._get_annotations()
        #                   if w.text != '-']
        # Use the line above instead of the next snippet if you remove '-'
        # from the vocabulary. At the moment '-' is mapped to 'min@s'.
        word_durations = []
        dash_intervals = []
        for w in word_tier._get_annotations():
            if w.text == '-':
                dash_intervals.append(w)
            else:
                word_durations.append(w)
        for dash in dash_intervals:
            # Delete all the phone annotations that are read out as "minus";
            # otherwise the alignment gets messed up
            phones_tier.delete_annotations_between_timepoints(
                dash.start_time,
                dash.end_time,
                left_overlap=False,
                right_overlap=False)

        phone_durations = [
            p for p in phones_tier._get_annotations() if p.text != 'sil'
        ]

        # Gather the phone durations following the same format as
        # stress_phone_seq, i.e. [[ph_dur1, ph_dur2, ...], [ph_dur1, ph_dur2, ...], ...]

        #print([j for i in stress_phone_seq for j in i])
        #print([i.text for i in phone_durations])

        l = []
        k = -1
        for i in range(0, len(stress_phone_seq)):
            m = []
            for j in range(0, len(stress_phone_seq[i])):
                k += 1
                m.append(phone_durations[k])
            l.append(m)

        # Go through this list ([[ph_dur1, ph_dur2, ...], [ph_dur1, ph_dur2,
        # ...], ...]) and keep the first and the last duration of every syllable
        syl_durations = [(syl[0].start_time, syl[-1].end_time) for syl in l]
        syllable_tier = tgt.IntervalTier()
        syllable_tier.name = 'syllables'
        syllable_tier.start_time = phones_tier.start_time
        syllable_tier.end_time = phones_tier.end_time
        syllable_intervals = [
            tgt.Interval(syl_durations[i][0], syl_durations[i][1],
                         str(stress_seq[i][0]))
            for i in range(0, len(syl_durations))
        ]
        syllable_tier.add_annotations(syllable_intervals)

        for phone in phones_tier:
            phone.text = phone.text.replace('Q', '@@').replace('ts',
                                                               't').replace(
                                                                   'sp', 'sil')

        vowels = [
            '@', '@@', 'a', 'aa', 'ai', 'au', 'e', 'e@', 'ei', 'i', 'i@', 'ii',
            'o', 'oi', 'oo', 'ou', 'u', 'u@', 'uh', 'uu'
        ]

        for phone in phones_tier:
            if phone.text in vowels:

                phone_centre = phone.start_time + (phone.end_time -
                                                   phone.start_time) / 2
                phone.text = phone.text + syllable_tier.get_annotations_by_time(
                    phone_centre)[0].text

        # For now we write the modified TextGrids to the same folder as the
        # old ones. Later, send the new files to a new folder
        newTitle = os.path.join(annotations_path, title + '.TextGrid')
        tgt.write_to_file(tg, newTitle, format='short')
    except Exception:
        # Skip files whose XML or TextGrid cannot be processed.
        pass
Example no. 22
def segment_speech(infile, outfile, wavfile, channel, filtertiername,
                   shiftonset, shiftoffset, denoise, trainbegin, trainwindow,
                   speechthresh, snradd):
    logging.info("Segmenting speech in %s" % wavfile)
    duration, _ = util.get_wav_duration(wavfile)
    tg, tier = util.init_textgrid(infile, duration, "seg.speech")

    logging.info("Floor estimation...")
    _, intensities = segment_speech_praat(wavfile,
                                          channel,
                                          denoise=denoise,
                                          trainbegin=trainbegin,
                                          trainwindow=trainwindow)
    silencelevel = find_silence_level(intensities, trainwindow) + snradd
    logging.info("estimated floor noise level: %s" % silencelevel)
    logging.info("Segmentation...")
    speech_chunks, intensities = segment_speech_praat(wavfile,
                                                      channel,
                                                      threshold=silencelevel,
                                                      denoise=denoise,
                                                      trainbegin=trainbegin,
                                                      trainwindow=trainwindow)
    for iv in speech_chunks:
        tier.add_annotation(iv)

    dbvalues = [x.as_db - silencelevel for x in tier]
    dbfilterthreshold = silencelevel + (sum(dbvalues) / len(dbvalues) *
                                        speechthresh)
    logging.info("speech filtering threshold: %s" % dbfilterthreshold)
    logging.info("vad segments: %s" % len(tier))

    if filtertiername is None:
        filtertier = tgt.IntervalTier()
        filtertier.add_annotation(
            tgt.Annotation(tier.start_time, tier.end_time, "speech"))
        filtertiername = "<all>"
    else:
        filtertier = tg.get_tier_by_name(filtertiername)
    resulttier = tgt.IntervalTier(name="seg.speech")
    speechsegments = [s for s in filtertier if s.text == "speech"]
    logging.info("expected speech segments: %s" % len(speechsegments))
    stats_filtered = 0
    stats_all = 0
    for speechseg in speechsegments:
        speechivs = tier.get_annotations_between_timepoints(
            speechseg.start_time, speechseg.end_time)
        if len(speechivs) == 0:
            speechivs = tier.get_annotations_between_timepoints(
                speechseg.start_time,
                speechseg.end_time,
                left_overlap=True,
                right_overlap=True)
            if len(speechivs) > 0:
                logging.warning(
                    "Speech segments overlap with the boundaries of %s in %s. "
                    "VAD problem? Shortening..." % (speechseg, filtertiername))
                for siv in speechivs:
                    siv.start_time = max(speechseg.start_time, siv.start_time)
                    siv.end_time = min(speechseg.end_time, siv.end_time)

        if len(speechivs) == 0:
            logging.warning("No speech segments in %s overlap with %s" %
                            (filtertiername, speechseg))
            continue

        dbfilteredivs = tgt.IntervalTier()
        for siv in [x for x in speechivs if x.as_db > dbfilterthreshold]:
            dbfilteredivs.add_annotation(siv)
        stats_filtered += len(speechivs) - len(dbfilteredivs)
        stats_all += len(speechivs)
        if len(dbfilteredivs) == 0:
            logging.warning(
                "All speech segments in %s dropped since their energy is below %.2f"
                % (speechseg, dbfilterthreshold))
            continue
        start_time = min([x.start_time for x in dbfilteredivs])
        end_time = max([x.end_time for x in dbfilteredivs])

        resulttier.add_annotation(
            tgt.Interval(start_time + shiftonset, end_time + shiftoffset,
                         "speech"))
    assert stats_all > 0, "VAD was unable to segment speech. Check silence region: calculated threshold %.2f db. " \
                          "Speech threshold: %.2f db." % (silencelevel, dbfilterthreshold)
    tier = resulttier
    logging.info(
        "Dropped %d of %d speech segments (%.2f%%) with energy below %.2f db" %
        (stats_filtered, stats_all,
         (stats_filtered / stats_all * 100), dbfilterthreshold))

    tier.name = "seg.speech"
    tg.add_tier(tier)
    logging.info("Writing %s" % outfile)
    tgt.io.write_to_file(textgrid=tg, filename=outfile, format="long")
# Usage: python segment_laughter.py <input_audio_file> <stored_model_path> <output_folder> <save_to_textgrid>

if __name__ == '__main__':
    inputs = parse_inputs()
    if inputs:
        (input_path, model_path, output_path, threshold, min_length,
         save_to_textgrid) = inputs
        min_length = seconds_to_frames(min_length)

        laughs = laugh_segmenter.segment_laughs(input_path, model_path,
                                                output_path, threshold,
                                                min_length, save_to_textgrid)
        print("found %d laughs." % (len(laughs)))

        if not save_to_textgrid:
            for laugh in laughs:
                print(laugh)
        else:
            tg = tgt.TextGrid()
            laughs_tier = tgt.IntervalTier(
                name='laughter',
                objects=[
                    tgt.Interval(l['start'], l['end'], 'laugh') for l in laughs
                ])
            tg.add_tier(laughs_tier)
            fname = os.path.splitext(os.path.basename(input_path))[0]
            tgt.write_to_file(
                tg, os.path.join(output_path, fname + '_laughter.TextGrid'))

            print('Saved laughter segments in {}'.format(
                os.path.join(output_path, fname + '_laughter.TextGrid')))
def add_syllables(title, input_path, syllabification_file_path, output_path):

	# Load language syllable structure for the syllabifier
	with open(syllabification_file_path) as f:
		language_syllables = json.load(f)


	# Load the textgrid
	tg = tgt.read_textgrid(os.path.join(input_path,title+'.TextGrid'))

	# Load name of all tiers
	tier_names = tg.get_tier_names()

	# Select a tier whose name contains 'words'
	words_tier_name = [name for name in tier_names if 'words' in name][0]
	words_tier = tg.get_tier_by_name(words_tier_name)

	# Select a tier whose name contains 'phones'
	phones_tier_name = [name for name in tier_names if 'phones' in name][0]
	phones_tier = tg.get_tier_by_name(phones_tier_name)

	# Start an empty tier for syllables
	syllable_tier = tgt.IntervalTier()
	syll_tier_name = [name for name in tier_names if 'words' in name][0].replace('words', 'sylls')
	syllable_tier.name = syll_tier_name

	# Syllabify one word at a time
	for w in words_tier._get_annotations():
		
		# For the current word, get all of its phones
		phs = phones_tier.get_annotations_between_timepoints(w.start_time, w.end_time)
		for ph in phs: 
			if ph.text == 'spn':
				ph.text = 'aa1'


		# Transform the string of phones into a string of syllables
		# Format: ph1 ph2 . ph3 ph4 ph5 . ph6 etc.
		s = stringify(syllabify(' '.join([ph.text for ph in phs]), language_syllables))

		# From a string of syllables to nested lists of phone indices
		# Format: [[ph1_idx, ph2_idx, ...], [ph3_idx, ph4_idx, ...], ...]

		sylls = [syll.split() for syll in s.split('.')]
		i = 0
		sylls_indeces = []
		for j, syll in enumerate(sylls):
			syll_indeces = []
			for k in range(0, len(syll)):
				syll_indeces.append(int(i))
				i += 1
			sylls_indeces.append(syll_indeces)

		# Extract the relevant intervals using the indices
		sylls_intervals = [[phs[index] for index in ph_group] for ph_group in sylls_indeces]

		# Extract the stress for each syllable:
		# Format: [['0'], ['1'], etc.]
		sylls_stresses = [[char for char in ''.join(ph_group) if char.isdigit()] for ph_group in sylls]
		sylls_stresses = [ph_group if ph_group != [] else ['0'] for ph_group in sylls_stresses]

		#print(w)
		#print(sylls_indeces)
		#print(sylls_stresses)
		#print(sylls_intervals)

		syllable_intervals = [tgt.Interval(interval[0].start_time, interval[-1].end_time, str(sylls_stresses[i][0])) for i, interval in enumerate(sylls_intervals)]

		#print(syllable_intervals)
		syllable_tier.add_annotations(syllable_intervals)

	tg.add_tier(syllable_tier)

	tgt.write_to_file(tg, os.path.join(output_path,title+'.TextGrid'), format='short')
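stringify and syllabify come from an external syllabifier; the code above only assumes their round trip yields a 'ph1 ph2 . ph3 ph4' style string, as in this toy check of the index bookkeeping (hypothetical syllabification):

s = 'HH AH0 . L OW1'
sylls = [syll.split() for syll in s.split('.')]
# -> [['HH', 'AH0'], ['L', 'OW1']], i.e. phone indices [[0, 1], [2, 3]]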
Example no. 25
def duration(path, C_list, V_list, mode, cid):
    # path="E:\coorpus"
    # path_cn = 'F:\SAITCorpus\CN'
    # path = 'F:\SAITCorpus'
    #if mode =  # pass the mode parameter
    #if len(path) < 14:  # decide whether we compute per country or per speaker (the path structure differs slightly)
    if mode == 'country':
        file_list = glob.glob(
            path + r"\*\sent\*.TextGrid")  # glob all matching files and return them as a list (per country)
    elif mode == 'spker':
        file_list = glob.glob(
            path + r"\sent\*.TextGrid")  # glob all matching files and return them as a list (per speaker)
    #else:  # ditto
    #file_list = glob.glob(path + r"\sent\*.TextGrid")  # per speaker

    # print('filename',',', '%V\t',',', 'deltaC\t', ',', 'deltaV\t')
    # print(file_list)

    AlldeltC = []  # compute the result for each TextGrid in turn and collect it in these overall lists
    AlldeltV = []
    all_vc = []
    all_vv = []
    all_rpvic = []
    all_rpviv = []  # possibly problematic: values are appended on every call but never cleared
    all_npvic = []
    all_npviv = []
    all_pctV = []
    for file in file_list:
        TextGrid = tgt.read_textgrid(
            file, include_empty_intervals=True)  # read each TextGrid in turn
        if cid == 'cn':
            tier = TextGrid.get_tier_by_name(TextGrid.get_tier_names()[2])
            #print(tier)
        elif cid == 'jp':
            tier = TextGrid.get_tier_by_name(TextGrid.get_tier_names()[1])
        elif cid == 'ru':
            tier = TextGrid.get_tier_by_name(TextGrid.get_tier_names()[2])
        #tier_syll = TextGrid.get_tier_by_name(TextGrid.get_tier_names()[1])  # pick the tier by name/position
        #tier = TextGrid.get_tier_by_name('SY')
        tier_name = TextGrid.get_tier_names()  # get all tier names
        start = tier.start_time
        end = tier.end_time
        tier2insert = tgt.IntervalTier(
            start, end, name='CV')  # insert a CV tier spanning start to end
        TextGrid.insert_tier(tier2insert, 3)
        CV = TextGrid.get_tier_by_name('CV')
        annotation = tier.intervals  # the intervals of the source tier
        num = []
        C_duration = []  # per-interval duration lists
        V_duration = []
        duration_all_C = 0  # total durations (used for %V and other related parameters)
        duration_all_V = 0
        for i in range(len(annotation)):  # loop: relabel and accumulate durations
            old_name = annotation[i].text
            old_start_time = annotation[i].start_time
            old_end_time = annotation[i].end_time
            duration = old_end_time - old_start_time

            if old_name in C_list:  # decide whether it is C or V
                new_name = 'C'
            elif old_name in V_list:  # decide whether it is C or V
                new_name = 'V'
            else:
                new_name = 'none'

            Interval = tgt.Interval(old_start_time,
                                    old_end_time,
                                    text=new_name)  # build the relabelled interval
            # print(old_name, new_name, 'duration=', duration)
            if new_name == 'C':
                C_duration.append(duration)  # append to the duration list
                duration_all_C = duration_all_C + duration

            elif new_name == 'V':
                V_duration.append(duration)
                duration_all_V += duration
            CV.add_interval(Interval)  # write the relabelled interval into the TextGrid
        a = duration_all_V + duration_all_C  # sentence duration (sil excluded)
        pctV = duration_all_V / a
        #mean_syl = a / (len(C_duration) + len(V_duration))  # mean syllable duration, used for speech rate
        #print(mean_syl)
        #print(len(C_duration))

        vacC = duration_all_C / len(C_duration)
        vacV = duration_all_V / len(V_duration)
        #vacroC = round(deltaC(C_duration) / vacC * 100, 4)
        #vacroV = round(deltaC(V_duration) / vacV * 100, 4)
        vacroC = deltaC(C_duration) / vacC * 100
        vacroV = deltaC(V_duration) / vacV * 100
        # print(file, ',',
        #
        #       deltaC(C_duration), ',',
        #       deltaV(V_duration), ',',
        #
        #       vacroC, ',',
        #       vacroV, ',',
        #
        #       rPVI_c(C_duration), ',',
        #       rPVI_V(V_duration), ',',
        #
        #       nPVI_C(C_duration), ',',
        #       nPVI_V(V_duration), ',',
        #
        #       )

        AlldeltC.append(deltaC(C_duration))
        AlldeltV.append(deltaV(V_duration))
        all_vc.append(vacroC)
        all_vv.append(vacroV)
        all_rpvic.append(rPVI_c(C_duration))
        all_rpviv.append(rPVI_V(V_duration))
        all_npvic.append(nPVI_C(C_duration))
        all_npviv.append(nPVI_V(V_duration))
        all_pctV.append(pctV)
    deltC = round(np.mean(AlldeltC), 9)
    deltV = round(np.mean(AlldeltV), 9)
    vc = round(np.mean(all_vc), 9)
    vv = round(np.mean(all_vv), 9)
    rpvic = round(np.mean(all_rpvic), 9)
    rpviv = round(np.mean(all_rpviv), 9)
    npvic = round(np.mean(all_npvic), 9)
    npviv = round(np.mean(all_npviv), 9)
    perctV = round(np.mean(all_pctV), 9)

    print(path, ',', perctV, ',', deltC, ',', deltV, ',', vc, ',', vv, ',',
          rpvic, ',', rpviv, ',', npvic, ',', npviv, ',')
Example no. 26
def metircs(path):
    with open(r'C:\Users\GIGABYTE\Desktop\VC_classification.txt') as f:
        C_list = [line.rstrip('\n\t') for line in f]
    with open(r'C:\Users\GIGABYTE\Desktop\All_V.txt') as f1:
        V_list = [line.rstrip('\n\t') for line in f1]  # split the V/C lists
    print(V_list, C_list)

    # path="E:\coorpus"
    # path_cn = 'F:\SAITCorpus\CN'

    # path = 'F:\SAITCorpus'

    file_list = glob.glob(
        path + r"\*\sent\*.TextGrid")  # glob all matching files and return them as a list
    file_list = glob.glob(path +
                          r"\sent\*.TextGrid")  # glob all matching files and return them as a list

    print('filename\t', '%V\t', 'deltaC\t', 'deltaV\t')
    AlldeltC = []  # per-file deltas, accumulated across the whole file list
    AlldeltV = []
    for file in file_list:
        TextGrid = tgt.read_textgrid(
            file, include_empty_intervals=True)  # read the TextGrid file
        #tier = TextGrid.get_tier_by_name(TextGrid.get_tier_names()[2])
        tier = TextGrid.get_tier_by_name(TextGrid.get_tier_names()[1])
        tier_name = TextGrid.get_tier_names()  # get all tier names
        start = tier.start_time
        end = tier.end_time
        tier2insert = tgt.IntervalTier(start, end, name='CV')
        TextGrid.insert_tier(tier2insert, 3)
        CV = TextGrid.get_tier_by_name('CV')
        annotation = tier.intervals
        C_duration = []
        V_duration = []
        duration_all_C = 0
        duration_all_V = 0
        for i in range(len(annotation)):
            old_name = annotation[i].text
            old_start_time = annotation[i].start_time
            old_end_time = annotation[i].end_time
            duration = old_end_time - old_start_time
            if old_name in C_list:
                new_name = 'C'
            elif old_name in V_list:
                new_name = 'V'
            else:
                new_name = 'none'

            #print(old_name, new_name)
            Interval = tgt.Interval(old_start_time,
                                    old_end_time,
                                    text=new_name)  # build the relabelled interval
            #print(old_name, new_name, 'duration=', duration)
            if new_name == 'C':
                C_duration.append(duration)
                duration_all_C = duration_all_C + duration
            elif new_name == 'V':
                V_duration.append(duration)
                duration_all_V += duration
            CV.add_interval(Interval)
        #print(V_duration)
        #print('%V value = ', duration_all_V / (duration_all_V + duration_all_C))
        V = duration_all_V / (duration_all_V + duration_all_C)
        deltaC = statistics.stdev(C_duration)  # standard deviation via the statistics module
        deltaV = statistics.stdev(V_duration)  # ditto

        #print(deltaV)
        a = round(V * 100, 2)
        b = round(deltaC * 100, 2)
        c = round(deltaV * 100, 2)
        #print(file, ',', a, ',', b, ',', c)
        AlldeltC.append(deltaC)
        AlldeltV.append(deltaV)

    C_ALL = AlldeltC
    V_ALL = AlldeltV

    print(np.mean(C_ALL), np.mean(V_ALL))
    print('all set!')