Ejemplo n.º 1
0
    def remove_dloc(cls, morph_tier):
        """Strip dislocation markers from the morphology tier.

        The token ``dloc|dloc=DISLOC`` is the morphology-tier counterpart
        of `„` on the utterance. Every occurrence is deleted and the
        result is handed to ``remove_redundant_whitespaces`` to tidy up the
        spacing left behind.
        """
        marker = 'dloc|dloc=DISLOC'
        stripped = morph_tier.replace(marker, '')
        return CHATUtteranceCleaner.remove_redundant_whitespaces(stripped)
Ejemplo n.º 2
0
    def remove_non_words(cls, morph_tier):
        """Drop every non-word from the morphology tier.

        A non-word is any item carrying the POS tag 'tag', i.e. ``tag|``
        followed by one or more non-whitespace characters. The spacing
        left behind is tidied by ``remove_redundant_whitespaces``.
        """
        stripped = re.sub(r'tag\|\S+', '', morph_tier)
        return CHATUtteranceCleaner.remove_redundant_whitespaces(stripped)
Ejemplo n.º 3
0
    def add_repetitions(cls, raw_utt, morph_tier):
        """Add repetitions to morphology tier.

        Repetitions are marked on the utterance with ``[x N]``. The marker
        either follows a single word (``word [x 2]``) or a group of words
        enclosed in angle brackets (``<some words> [x 2]``). The
        morphological words aligned with the repeated material are
        duplicated so that they occur N times in the result.

        Args:
            raw_utt: The raw (uncleaned) utterance.
            morph_tier: The morphology tier, words separated by single
                spaces.

        Returns:
            The morphology tier with repeated words expanded. The tier is
            returned unchanged when the utterance contains no ``[x ``
            marker or when the number of utterance words differs from the
            number of morphological words (misalignment).
        """
        # nothing to expand without a repetition marker
        if '[x ' not in raw_utt:
            return morph_tier

        # execute same cleaning steps except those for scoped symbols
        for cleaning_method in [
                CHATUtteranceCleaner.remove_terminator,
                CHATUtteranceCleaner.unify_untranscribed,
                CHATUtteranceCleaner.remove_events,
                CHATUtteranceCleaner.remove_omissions,
                CHATUtteranceCleaner.remove_linkers,
                CHATUtteranceCleaner.remove_separators,
                CHATUtteranceCleaner.remove_ca,
                CHATUtteranceCleaner.remove_pauses_between_words,
                CHATUtteranceCleaner.remove_commas,
                CHATUtteranceCleaner.null_event_utterances
        ]:
            raw_utt = cleaning_method(raw_utt)

        # remove scoped symbols except for repetitions
        scope_regex = re.compile(r'\[[^x].*?\]')
        raw_utt = scope_regex.sub('', raw_utt)
        raw_utt = CHATUtteranceCleaner.remove_redundant_whitespaces(
            raw_utt)

        # get words from utterance and morphology tier; the split keeps a
        # repetition marker attached to the word it follows
        utt_words = re.split(r'(?<!\[x) (?!\[x)', raw_utt)
        morph_words = morph_tier.split(' ')

        # misaligned tiers cannot be expanded reliably
        if len(utt_words) != len(morph_words):
            return morph_tier

        repetition_regex = re.compile(r'\[x (\d+)')
        morph_new = []
        # morphological words of the angle-bracket group currently open
        group = []
        for uw, mw in zip(utt_words, morph_words):

            morph_new.append(mw)

            # track group membership first, so that a word which both
            # opens the group and carries the marker (`<word> [x 2]`)
            # is still repeated
            if uw.startswith('<'):
                group = [mw]
            elif group:
                group.append(mw)

            match = repetition_regex.search(uw)
            if match:
                reps = int(match.group(1))
                # repeat the whole group if one is open, else the word
                morph_new += (reps - 1) * (group or [mw])
                group = []
            elif uw.endswith('>'):
                # group was closed without a repetition marker (its scope
                # was stripped above); forget it so that it does not leak
                # into a later repetition
                group = []

        return ' '.join(morph_new)
Ejemplo n.º 4
0
    def remove_non_words(cls, morph_tier):
        """Drop punctuation-like non-words from the morphology tier.

        The following items are deleted wherever they occur:
            end|end („)
            cm|cm (,)
            bq|bq (“)
            eq|eq (”)

        The spacing left behind is tidied by
        ``remove_redundant_whitespaces``.
        """
        pattern = (r'end\|end'
                   r'|cm\|cm'
                   r'|bq\|bq'
                   r'|eq\|eq')
        stripped = re.sub(pattern, '', morph_tier)
        return CHATUtteranceCleaner.remove_redundant_whitespaces(stripped)
Ejemplo n.º 5
0
 def remove_double_hashes(cls, morph_tier):
     """Remove standalone ``##`` items from the morphology tier.

     A ``##`` is only removed when it stands on its own, i.e. it is
     delimited on the left by a space or the start of the string and on
     the right by a space or the end of the string. A ``##`` embedded in
     a longer item is kept.

     Lookarounds are used so that the delimiting spaces are not consumed
     by a match. Consuming them would leave the second of two adjacent
     items (``'## ##'``) unmatched, because the space in front of it
     would already belong to the first match.
     """
     morph_tier = re.sub(r'(?<![^ ])##(?= |$)', '', morph_tier)
     return CHATUtteranceCleaner.remove_redundant_whitespaces(morph_tier)
Ejemplo n.º 6
0
 def test_remove_redundant_whitespace_leading_trailing_whitespace(self):
     """A single leading and trailing space must be stripped."""
     padded = ' h '
     cleaned = CHATUtteranceCleaner.remove_redundant_whitespaces(padded)
     self.assertEqual(cleaned, 'h')
Ejemplo n.º 7
0
 def test_remove_redundant_whitespace_empty_string(self):
     """An empty string must come back as an empty string."""
     cleaned = CHATUtteranceCleaner.remove_redundant_whitespaces('')
     self.assertEqual(cleaned, '')
Ejemplo n.º 8
0
 def test_remove_redundant_whitespace_combinations(self):
     """Mixed newlines, tabs, spaces and carriage returns are collapsed."""
     messy = '\n\t \r\r h   \nh \t\t\n\r'
     cleaned = CHATUtteranceCleaner.remove_redundant_whitespaces(messy)
     self.assertEqual(cleaned, 'h h')
Ejemplo n.º 9
0
 def test_remove_redundant_whitespace_multiple_creturns(self):
     """Runs of carriage returns are stripped or reduced to one space."""
     messy = '\r\rh\r\rh\r\r'
     cleaned = CHATUtteranceCleaner.remove_redundant_whitespaces(messy)
     self.assertEqual(cleaned, 'h h')
Ejemplo n.º 10
0
 def test_remove_redundant_whitespace_leading_trailing_creturns(self):
     """A single leading and trailing carriage return must be stripped."""
     padded = '\rh\r'
     cleaned = CHATUtteranceCleaner.remove_redundant_whitespaces(padded)
     self.assertEqual(cleaned, 'h')
Ejemplo n.º 11
0
 def test_remove_redundant_whitespace_multiple_newlines(self):
     """Runs of newlines are stripped or reduced to one space."""
     messy = '\n\nh\n\nh\n\n'
     cleaned = CHATUtteranceCleaner.remove_redundant_whitespaces(messy)
     self.assertEqual(cleaned, 'h h')
Ejemplo n.º 12
0
 def test_remove_redundant_whitespace_multiple_tabs(self):
     """Runs of tabs are stripped or reduced to one space."""
     messy = '\t\th\t\th\t\t'
     cleaned = CHATUtteranceCleaner.remove_redundant_whitespaces(messy)
     self.assertEqual(cleaned, 'h h')
Ejemplo n.º 13
0
 def remove_timestamp(cls, translation):
     """Strip timestamps from the Sesotho translation tier.

     A timestamp is a run of digits, an underscore and another run of
     digits. The spacing left behind is tidied by
     ``remove_redundant_whitespaces``.
     """
     timestamp_regex = re.compile(r'[0-9]+_[0-9]+')
     without_stamps = timestamp_regex.sub('', translation)
     return CHATUtteranceCleaner.remove_redundant_whitespaces(without_stamps)