Ejemplo n.º 1
0
    def self_detection(self, tier):
        """Search a tier of tokens for self-repetitions.

        The tier is scanned window by window. The bounds of each window are
        given by the private ``__fix_indexes`` helper. Each time a source is
        found, the source and its echos are stored in the two output tiers.

        :param tier: (sppasTier) Tier of tokens to search in.
        :returns: (sppasTier, sppasTier) The "SR-Source" and "SR-Echo" tiers.

        """
        # The detector is configured with the stop-list fixed for this tier
        detector = SelfRepetition(self.fix_stop_list(tier))

        # Tiers to be filled in and returned
        sources = sppasTier("SR-Source")
        echos = sppasTier("SR-Echo")

        # Indexes of the first window of tokens
        first, search, last = self.__fix_indexes(tier, 0, 0)

        while first < last:
            # Serialized labels of the tokens of the current window
            window = list()
            for idx in range(first, last + 1):
                window.append(tier[idx].serialize_labels())
            speaker = DataSpeaker(window)

            # Look for the first self-repetition of the window
            detector.detect(speaker, search - first)

            # By default, the next window starts one token further
            step = 1
            if detector.get_source() is not None:
                # Store the source and its echos into the output tiers
                sppasSelfRepet.__add_repetition(detector, tier, first,
                                                sources, echos)
                # ... and restart the search after the end of the source
                _, source_last = detector.get_source()
                step = source_last + 1

            # Indexes of the next window
            first, search, last = self.__fix_indexes(tier, first, step)

        return sources, echos
Ejemplo n.º 2
0
    def __convert(self, tier, actions):
        """Create the tier of tokens from a tier of orthographic labels.

        Only the best tag of each label is considered. A speech tag is sent
        to the normalizer. A silence tag is replaced by SIL_ORTHO. Any other
        tag is kept as it is. Each resulting token becomes a label of the
        new annotation. A token written "{a|b}" is split into alternative
        tags.

        :param tier: (sppasTier) Tier with the orthographic transcription.
        :param actions: Given as-is to the normalizer.
        :returns: (sppasTier) Tier named "Tokens".

        """
        result = sppasTier("Tokens")

        for idx, ann in enumerate(tier):
            # Progress message: annotations are numbered from 1
            progress = (info(1220, "annotations")).format(number=idx + 1)
            self.logfile.print_message(progress, indent=1)

            location = ann.get_location().copy()
            new_labels = list()

            for label in ann.get_labels():
                normalized = list()
                # A label of an ortho is expected to have only one tag,
                # so only the best one is normalized.
                best = label.get_best()

                if best.is_speech() is True:
                    # A failure is logged, and no token is produced
                    # for this label.
                    try:
                        normalized = self.__normalizer.normalize(
                            best.get_content(), actions)
                    except Exception as e:
                        failure = (info(1258, "annotations")).format(idx)
                        failure = failure + "{:s}".format(str(e))
                        self.logfile.print_message(failure, indent=2)
                elif best.is_silence():
                    # Several symbols can represent a silence in the ortho:
                    # they are all replaced by the same one.
                    normalized = [SIL_ORTHO]
                else:
                    # Noises, laughter, empty labels... are not tokenized
                    normalized = [best.get_content()]

                # One label per token; variants are alternative tags
                for token in normalized:
                    with_variants = \
                        token.startswith('{') and token.endswith('}')
                    if with_variants:
                        variants = token[1:-1].split('|')
                        tags = [sppasTag(v) for v in variants]
                    else:
                        tags = sppasTag(token)
                    new_labels.append(sppasLabel(tags))

            result.create_annotation(location, new_labels)

        return result
Ejemplo n.º 3
0
    def make_stop_words(self, tier):
        """Create a tier of booleans telling which tokens are stop-words.

        Annotations whose serialized labels are in ``symbols.all`` are
        ignored: no annotation is created for them in the returned tier.

        :param tier: (sppasTier) Time-aligned tokens.
        :returns: (sppasTier) Tier named "StopWord".

        """
        result = sppasTier('StopWord')

        for ann in tier:
            content = ann.serialize_labels()
            if content in symbols.all:
                # Symbols are neither stop-words nor relevant tokens
                continue

            in_stop_list = self._stop_words.is_in(content)
            location = ann.get_location().copy()
            label = sppasLabel(sppasTag(in_stop_list, tag_type="bool"))
            result.create_annotation(location, label)

        return result
Ejemplo n.º 4
0
    def anchors_to_tier(anchors):
        """Create a tier from a list of anchors.

        The position of an anchor is expressed in frames. It is converted
        into seconds, a frame lasting 10ms.

        :param anchors: (List of Anchor) Each anchor has 'x' and 'y' members.
        :returns: (sppasTier) Tier named "Momel", with tags of type float.

        """
        # Duration of a frame, in seconds
        frame_duration = 0.01

        momel = sppasTier('Momel')
        for a in anchors:
            # 0.005 is the second argument of sppasPoint: presumably the
            # radius of the point, i.e. half a frame -- to be confirmed.
            location = sppasLocation(sppasPoint(a.x * frame_duration, 0.005))
            label = sppasLabel(sppasTag(a.y, "float"))
            momel.create_annotation(location, label)

        return momel
Ejemplo n.º 5
0
    def convert(self, tier):
        """Phonetize annotations of a tokenized tier.

        Each label of each annotation is turned into a label whose tags are
        the pronunciation variants of the tags of the original label.
        Duplicated variants are removed, and the remaining ones are kept in
        the order they were produced, so that the result does not change
        from one run to another.

        :param tier: (Tier) the ortho transcription previously tokenized.
        :returns: (Tier) phonetized tier with name "Phones"
        :raises: IOError if no tier is given
        :raises: EmptyInputError if the given tier is empty

        """
        if tier is None:
            raise IOError('No given tier.')
        if tier.is_empty() is True:
            raise EmptyInputError(name=tier.get_name())

        phones_tier = sppasTier("Phones")
        for i, ann in enumerate(tier):
            # Progress message: annotations are numbered from 1
            self.logfile.print_message(
                (info(1220, "annotations")).format(number=i + 1), indent=1)

            location = ann.get_location().copy()
            labels = list()

            # Phonetize all labels of the tokenized transcription
            for label in ann.get_labels():

                phonetizations = list()
                for text, _score in label:
                    if text.is_pause() or text.is_silence():
                        # It's in case the pronunciation dictionary
                        # were not properly fixed.
                        phonetizations.append(SIL)

                    elif text.is_empty() is False:
                        phones = self._phonetize(text.get_content())
                        for p in phones:
                            phonetizations.extend(p.split(separators.variants))

                # New in SPPAS 1.9.6.
                #  - The result is a sequence of labels.
                #  - Variants are alternative tags.
                # Duplicates are removed without using set() to iterate:
                # the iteration order of a set of strings is arbitrary and
                # the order of the tags would differ between two runs.
                seen = set()
                tags = list()
                for p in phonetizations:
                    if p not in seen:
                        seen.add(p)
                        tags.append(sppasTag(p))
                labels.append(sppasLabel(tags))

            phones_tier.create_annotation(location, labels)

        return phones_tier
Ejemplo n.º 6
0
    def make_classes(self, syllables):
        """Create the tier with the classes of the syllables.

        For each syllable, the typed content of its best tag is given to
        the syllabifier, which returns the matching sequence of classes.

        :param syllables: (sppasTier) Tier with the syllables.
        :returns: (sppasTier) Tier named "SyllClassAlign".

        """
        result = sppasTier("SyllClassAlign")
        # Keep a trace of the tier the classes were estimated from
        result.set_meta('syllabification_classes_of_tier',
                        syllables.get_name())

        for ann in syllables:
            where = ann.get_location().copy()
            phonetized = ann.get_best_tag().get_typed_content()
            structure = self.__syllabifier.classes_phonetized(phonetized)
            result.create_annotation(where, sppasLabel(sppasTag(structure)))

        return result
Ejemplo n.º 7
0
    def make_word_strain(self, tier):
        """Create a tier in which tokens are replaced by their strain.

        If no word strain is defined, the given tier is returned as it is.
        Otherwise, a token without an entry in the word strain is kept
        unchanged in the new tier.

        :param tier: (sppasTier) Time-aligned tokens.
        :returns: (sppasTier) The given tier, or a tier named "TokenStrain".

        """
        # Nothing to do without entries to replace tokens
        if not len(self._word_strain):
            return tier

        self.logfile.print_message("Words strain enabled.", indent=1, status=2)

        strained = sppasTier('TokenStrain')
        for ann in tier:
            original = ann.serialize_labels()
            # The token itself is the default replacement
            replaced = self._word_strain.get(original, original)
            location = ann.get_location().copy()
            strained.create_annotation(location, sppasLabel(sppasTag(replaced)))

        return strained
Ejemplo n.º 8
0
    def convert(self, phonemes, intervals=None):
        """Syllabify the labels of a time-aligned tier of phonemes.

        The syllabification is performed inside each of the given intervals.
        A warning is logged for an interval whose bounds can't be matched
        with phonemes.

        :param phonemes: (sppasTier) time-aligned phonemes tier
        :param intervals: (sppasTier) Intervals to syllabify within. If None,
            they are estimated from the phonemes.
        :returns: (sppasTier) Tier named "SyllAlign".

        """
        if intervals is None:
            intervals = sppasSyll._phon_to_intervals(phonemes)

        result = sppasTier("SyllAlign")
        result.set_meta('syllabification_of_tier', phonemes.get_name())

        for interval in intervals:

            # Index of the phoneme matching the begin of the interval:
            # lindex() is tried first, then mindex() as a fallback.
            begin = interval.get_lowest_localization()
            first = phonemes.lindex(begin)
            if first == -1:
                first = phonemes.mindex(begin, bound=-1)

            # Index of the phoneme matching the end of the interval:
            # rindex() is tried first, then mindex() as a fallback.
            end = interval.get_highest_localization()
            last = phonemes.rindex(end)
            if last == -1:
                last = phonemes.mindex(end, bound=1)

            # Both bounds are required to syllabify
            if first == -1 or last == -1:
                self.logfile.print_message(
                    (info(1224, "annotations")).format(interval),
                    indent=2,
                    status=annots.warning)
                continue

            self.syllabify_interval(phonemes, first, last, result)

        return result
Ejemplo n.º 9
0
    def tones_to_tier(tones, anchors_tier):
        """Create the INTSINT tier from the tones and their anchors.

        The n-th tone is assigned to a copy of the location of the n-th
        anchor.

        :param tones: (list) One tone for each anchor.
        :param anchors_tier: (sppasTier) Tier with the anchors.
        :returns: (sppasTier) Tier named "INTSINT".
        :raises: AnnDataEqError if there are not as many tones as anchors

        """
        nb_tones = len(tones)
        nb_anchors = len(anchors_tier)
        if nb_tones != nb_anchors:
            raise AnnDataEqError("tones:" + str(nb_tones),
                                 "anchors:" + str(nb_anchors))

        intsint = sppasTier("INTSINT")
        for tone, anchor in zip(tones, anchors_tier):
            label = sppasLabel(sppasTag(tone))
            intsint.create_annotation(anchor.get_location().copy(), label)

        return intsint
Ejemplo n.º 10
0
    def other_detection(self, inputtier1, inputtier2):
        """Other-Repetition detection.

        Search for sources in the tokens of inputtier1 which are echoed in
        the tokens of inputtier2. The sources are searched in a sliding
        window of tokens of inputtier1. The candidate echos are the tokens
        of inputtier2 that do not start before the first token of this
        window.

        Note that set_radius(0.04) is called on both given tiers.

        :param inputtier1: (Tier) Tokens in which sources are searched.
        :param inputtier2: (Tier) Tokens in which echos are searched.
        :returns: (sppasTier, sppasTier) The "OR-Source" and "OR-Echo" tiers.

        """
        inputtier1.set_radius(0.04)
        inputtier2.set_radius(0.04)
        # Use the appropriate stop-list: it is fixed from the tokens of the
        # echoing speaker (inputtier2)
        stop_words = self.fix_stop_list(inputtier2)
        # Create the object to detect and store a source and its echos
        repetition = OtherRepetition(stop_words)
        # Create output data
        src_tier = sppasTier("OR-Source")
        echo_tier = sppasTier("OR-Echo")

        # Initialization of the window of tokens of the source speaker.
        # With an empty inputtier1, tok_end_src is -1 and the loop below
        # is never entered.
        tok_start_src = 0
        tok_end_src = min(20,
                          len(inputtier1) -
                          1)  # the window ends at most 20 tokens after its start
        tok_start_echo = 0

        tokens2 = list()
        speaker2 = DataSpeaker(tokens2)
        # Detection is here:
        # detect() is applied word by word, from tok_start to tok_end.
        # The loop stops when the window is reduced to the last token.
        while tok_start_src < tok_end_src:

            # Build an array with the tokens of the source window
            tokens1 = [
                inputtier1[i].serialize_labels()
                for i in range(tok_start_src, tok_end_src + 1)
            ]
            speaker1 = DataSpeaker(tokens1)

            # Create speaker2.
            # It is re-created only if there are no echo tokens yet or if
            # the current first echo token starts before the source window.
            # NOTE(review): inputtier2[tok_start_echo] is accessed at each
            # pass; an empty inputtier2 presumably fails here -- confirm.
            src_begin = inputtier1[tok_start_src].get_lowest_localization(
            ).get_midpoint()
            echo_begin = inputtier2[tok_start_echo].get_lowest_localization(
            ).get_midpoint()
            if len(tokens2) == 0 or echo_begin < src_begin:
                tokens2 = list()
                nb_breaks = 0
                old_tok_start_echo = tok_start_echo

                for i in range(old_tok_start_echo, len(inputtier2)):
                    ann = inputtier2[i]
                    label = ann.serialize_labels()
                    # Only tokens that do not start before the source window
                    # are candidates to be an echo
                    if ann.get_lowest_localization().get_midpoint(
                    ) >= src_begin:
                        # Index of the first candidate. It is assigned only
                        # while tok_start_echo still has its previous value.
                        if tok_start_echo == old_tok_start_echo:
                            tok_start_echo = i
                        # Silences are counted: the echo tokens are
                        # collected until the 'span' option is reached.
                        if label == SIL_ORTHO:
                            nb_breaks += 1
                        if nb_breaks == self._options['span']:
                            break
                        tokens2.append(label)
                speaker2 = DataSpeaker(tokens2)

            # We can't go much further due to the required time-alignment of
            # tokens between src/echo.
            # Check only if the first token is the first token of a source!!
            repetition.detect(speaker1, speaker2, 1)

            # Save repeats.
            # By default, the next window starts one token further.
            shift = 1
            if repetition.get_source() is not None:
                s, e = repetition.get_source()
                saved = sppasOtherRepet.__add_repetition(
                    repetition, inputtier1, inputtier2, tok_start_src,
                    tok_start_echo, src_tier, echo_tier)
                # Restart after the end of the source, but only if the
                # repetition was really added to the output tiers
                if saved is True:
                    shift = e + 1

            # Slide the window, without going beyond the last token
            tok_start_src = min(tok_start_src + shift, len(inputtier1) - 1)
            tok_end_src = min(tok_start_src + 20, len(inputtier1) - 1)

        return src_tier, echo_tier
Ejemplo n.º 11
0
                    if i == 0:
                        to_merge_anns[h] = hyp_anns
                    else:
                        to_merge_anns[h] = None
                result_ann.set_labels(sppasLabel(sppasTag('Merge')))

        nb_ref_perfect_match_total += nb_ref_perfect_match
        nb_ref_not_match_total += nb_ref_not_match
        nb_ref_several_match_total += nb_ref_several_match

        # Search for situation b in hyp to merge such IPUs
        # -------------------------------------------------
        
        nb_hyp_merge_ipus = 0
        if len(to_merge_anns) > 0:
            a_hyp_tier = sppasTier(hyp_tier.get_name())
            for hyp_ann in hyp_tier:
                if hyp_ann in to_merge_anns:
                    anns_to_merge = to_merge_anns[hyp_ann]
                    if anns_to_merge is not None:
                        # a. merge ipus (hyp5)
                        #    ref:     #    |         ipu        |    #
                        #    hyp:     #    | ipu    |  #  | ipu  |   #
                        nb_hyp_merge_ipus += len(anns_to_merge) - 1
                        labels = []
                        for h in anns_to_merge:
                            labels.extend(h.get_labels())
                        labels.append(sppasLabel(sppasTag('Merged')))
                        a = a_hyp_tier.create_annotation(
                            sppasLocation(
                                sppasInterval(