コード例 #1
0
    def remove_dloc(cls, morph_tier):
        """Strip dislocation markers from the morphology tier.

        Every occurrence of `dloc|dloc=DISLOC` (the morphological
        counterpart of `„` on the utterance) is deleted before the
        redundant whitespace is removed.
        """
        return CHATUtteranceCleaner.remove_redundant_whitespaces(
            morph_tier.replace('dloc|dloc=DISLOC', ''))
コード例 #2
0
ファイル: cleaner.py プロジェクト: acqdiv/acqdiv
    def remove_terminator(cls, utterance):
        """Remove the terminator and trailing dashes/colons.

        The terminator removal of `CHATUtteranceCleaner` runs first;
        afterwards any trailing `-` characters and then any trailing
        `:` characters are stripped from the right end.
        """
        without_terminator = CHATUtteranceCleaner.remove_terminator(
            utterance)
        without_dashes = without_terminator.rstrip('-')
        return without_dashes.rstrip(':')
コード例 #3
0
 def test_remove_omissions_multiple_omissions_no_space_before_terminator(
         self):
     """remove_omissions: 3 omitted words, terminator glued to last."""
     expected = 'where my?'
     result = CHATUtteranceCleaner.remove_omissions(
         '0but where 0is my 0truck?')
     self.assertEqual(result, expected)
コード例 #4
0
    def remove_non_words(cls, morph_tier):
        """Drop non-word entries from the morphology tier.

        An entry is treated as a non-word when it carries the POS tag
        `tag`, i.e. it has the form `tag|...`; the match extends up to
        the next whitespace character.
        """
        words_only = re.sub(r'tag\|\S+', '', morph_tier)
        return CHATUtteranceCleaner.remove_redundant_whitespaces(words_only)
コード例 #5
0
    def add_repetitions(cls, raw_utt, morph_tier):
        """Add repetitions to morphology tier.

        If `raw_utt` contains a repetition marker (`[x `), the utterance
        is cleaned, stripped of all other bracketed scoped symbols and
        split into words. These words are aligned one-to-one with the
        space-separated words of `morph_tier`. A morphology word whose
        utterance word carries `[x N]` ends up N times in the result;
        when the marker closes a group opened by a word starting with
        `<`, the whole group of morphology words is repeated instead.

        Returns the expanded morphology tier as a space-joined string.
        `morph_tier` is returned unchanged when `raw_utt` has no
        repetition marker or when the number of utterance words and
        morphology words differs.
        """
        # check if there are any repetitions
        if '[x ' in raw_utt:

            # execute same cleaning steps except those for scoped symbols
            for cleaning_method in [
                    CHATUtteranceCleaner.remove_terminator,
                    CHATUtteranceCleaner.unify_untranscribed,
                    CHATUtteranceCleaner.remove_events,
                    CHATUtteranceCleaner.remove_omissions,
                    CHATUtteranceCleaner.remove_linkers,
                    CHATUtteranceCleaner.remove_separators,
                    CHATUtteranceCleaner.remove_ca,
                    CHATUtteranceCleaner.remove_pauses_between_words,
                    CHATUtteranceCleaner.remove_commas,
                    CHATUtteranceCleaner.null_event_utterances
            ]:
                raw_utt = cleaning_method(raw_utt)

            # remove scoped symbols except for repetitions
            # (any `[...]` span whose first inner character is not `x`,
            # matched non-greedily up to the next `]`)
            scope_regex = re.compile(r'\[[^x].*?\]')
            raw_utt = scope_regex.sub('', raw_utt)
            raw_utt = CHATUtteranceCleaner.remove_redundant_whitespaces(
                raw_utt)

            # get words from utterance and morphology tier
            # (the split skips spaces directly before or after `[x`, so a
            # repetition marker stays attached to the word preceding it)
            utt_words = re.split(r'(?<!\[x) (?!\[x)', raw_utt)
            morph_words = morph_tier.split(' ')

            # check for misalignments
            if len(utt_words) == len(morph_words):

                # morph_new: resulting morphology words
                # group: morphology words of the currently open `<...>`
                # group; empty when no group is open
                morph_new = []
                group = []
                for uw, mw in zip(utt_words, morph_words):

                    # every word is emitted once; repetitions add copies
                    morph_new.append(mw)
                    match = re.search(r'\[x (\d+)', uw)

                    # NOTE(review): a word that both starts with `<` and
                    # carries `[x N]` takes this first branch, so its
                    # repetition is not expanded - confirm this is intended
                    if uw.startswith('<'):
                        # start a new group with this word
                        group = [mw]
                    elif match:
                        reps = int(match.group(1))
                        if group:
                            # marker closes the open group: repeat the
                            # whole group and reset it
                            group.append(mw)
                            morph_new += (reps - 1) * group
                            group = []
                        else:
                            # marker on a single word: repeat that word
                            morph_new += (reps - 1) * [mw]
                    elif group:
                        # word inside an open group
                        group.append(mw)

                return ' '.join(morph_new)

        return morph_tier
コード例 #6
0
    def remove_non_words(cls, morph_tier):
        """Drop punctuation placeholders from the morphology tier.

        The following entries are treated as non-words and deleted:
            end|end („)
            cm|cm (,)
            bq|bq (“)
            eq|eq (”)
        """
        words_only = re.sub(
            r'end\|end|cm\|cm|bq\|bq|eq\|eq', '', morph_tier)
        return CHATUtteranceCleaner.remove_redundant_whitespaces(words_only)
コード例 #7
0
 def test_remove_terminator_exclamation_point(self):
     """remove_terminator drops a trailing exclamation point."""
     expected = 'sit down'
     result = CHATUtteranceCleaner.remove_terminator('sit down !')
     self.assertEqual(result, expected)
コード例 #8
0
 def test_remove_terminator_question_mark(self):
     """remove_terminator drops a trailing question mark."""
     expected = 'is that a carrot'
     result = CHATUtteranceCleaner.remove_terminator('is that a carrot ?')
     self.assertEqual(result, expected)
コード例 #9
0
 def test_remove_redundant_whitespace_leading_trailing_whitespace(self):
     """remove_redundant_whitespaces strips leading/trailing spaces."""
     expected = 'h'
     result = CHATUtteranceCleaner.remove_redundant_whitespaces(' h ')
     self.assertEqual(result, expected)
コード例 #10
0
 def test_remove_pauses_between_words_multiple_pauses(self):
     """remove_pauses_between_words with pauses of 3 different lengths."""
     utterance = "I (.) don't (..) know (...) this."
     expected = "I don't know this."
     result = CHATUtteranceCleaner.remove_pauses_between_words(utterance)
     self.assertEqual(result, expected)
コード例 #11
0
 def test_remove_terminator_period(self):
     """remove_terminator drops a trailing period."""
     expected = 'I got cold'
     result = CHATUtteranceCleaner.remove_terminator('I got cold .')
     self.assertEqual(result, expected)
コード例 #12
0
ファイル: cleaner.py プロジェクト: acqdiv/acqdiv
 def remove_double_hashes(cls, morph_tier):
     """Remove standalone `##` tokens from the morphology tier.

     A `##` counts as standalone when it is delimited on each side by a
     space or the string boundary. The delimiters are checked with
     lookarounds instead of being consumed, so consecutive tokens such
     as `## ##` are all removed; a consuming pattern uses up the shared
     space and therefore skips every second token.

     Returns the tier without these tokens, passed through
     `CHATUtteranceCleaner.remove_redundant_whitespaces`.
     """
     # (?<![^ ]) -> at string start or right after a space
     # (?= |$)   -> right before a space or at the end
     morph_tier = re.sub(r'(?<![^ ])##(?= |$)', '', morph_tier)
     return CHATUtteranceCleaner.remove_redundant_whitespaces(morph_tier)
コード例 #13
0
 def clean_morph_tier(cls, morph_tier):
     """Clean the morphology tier by removing its terminator."""
     cleaned_tier = CHATUtteranceCleaner.remove_terminator(morph_tier)
     return cleaned_tier
コード例 #14
0
 def test_remove_scoped_symbols_empty_string(self):
     """remove_scoped_symbols leaves an empty string empty."""
     result = CHATUtteranceCleaner.remove_scoped_symbols('')
     self.assertEqual(result, '')
コード例 #15
0
 def test_remove_ca_marked_question(self):
     """remove_ca strips the marked-question symbol („)."""
     expected = 'Hey there what up no'
     result = CHATUtteranceCleaner.remove_ca('Hey there„ what up no')
     self.assertEqual(result, expected)
コード例 #16
0
 def test_remove_redundant_whitespace_leading_trailing_creturns(self):
     """remove_redundant_whitespaces strips outer carriage returns."""
     expected = 'h'
     result = CHATUtteranceCleaner.remove_redundant_whitespaces('\rh\r')
     self.assertEqual(result, expected)
コード例 #17
0
 def test_remove_scoped_symbols_two_levels_nested(self):
     """remove_scoped_symbols with scopes nested two levels deep."""
     nested = "<that's mine <she said [=! cries]>> [=! slaps leg]"
     expected = "that's mine she said"
     result = CHATUtteranceCleaner.remove_scoped_symbols(nested)
     self.assertEqual(result, expected)
コード例 #18
0
 def test_remove_scoped_symbols_one_level_nested(self):
     """remove_scoped_symbols with a scope nested one level deep."""
     expected = "that's mine"
     result = CHATUtteranceCleaner.remove_scoped_symbols(
         "<that's mine [=! cries]>")
     self.assertEqual(result, expected)
コード例 #19
0
 def test_remove_scoped_symbols_not_nested(self):
     """remove_scoped_symbols with two adjacent, non-nested scopes."""
     expected = "that's mine"
     result = CHATUtteranceCleaner.remove_scoped_symbols(
         "<that's mine> [=! cries]")
     self.assertEqual(result, expected)
コード例 #20
0
 def test_remove_pauses_between_words_empty_string(self):
     """remove_pauses_between_words leaves an empty string empty."""
     result = CHATUtteranceCleaner.remove_pauses_between_words('')
     self.assertEqual(result, '')
コード例 #21
0
 def test_remove_terminator_trailing_off(self):
     """remove_terminator drops the trailing-off terminator (+...)."""
     utterance = '*SAR: smells good enough for +...'
     expected = '*SAR: smells good enough for'
     result = CHATUtteranceCleaner.remove_terminator(utterance)
     self.assertEqual(result, expected)
コード例 #22
0
 def test_remove_ca_falling_rising_mark(self):
     """remove_ca strips 3 rising (↑) marks and 1 falling (↓) mark."""
     expected = 'Hey there what up no'
     result = CHATUtteranceCleaner.remove_ca('Hey↑ there↓ what up↑ no↑')
     self.assertEqual(result, expected)
コード例 #23
0
 def test_remove_ca_quotations(self):
     """remove_ca strips opening (“) and closing (”) quotation marks."""
     expected = 'Hey there what up no'
     result = CHATUtteranceCleaner.remove_ca('“Hey there what up no”')
     self.assertEqual(result, expected)
コード例 #24
0
 def test_remove_redundant_whitespace_combinations(self):
     """remove_redundant_whitespaces with mixed whitespace characters."""
     messy = '\n\t \r\r h   \nh \t\t\n\r'
     expected = 'h h'
     result = CHATUtteranceCleaner.remove_redundant_whitespaces(messy)
     self.assertEqual(result, expected)
コード例 #25
0
ファイル: cleaner.py プロジェクト: acqdiv/acqdiv
 def clean_morpheme_word(cls, morpheme_word):
     """Clean a morpheme word by removing its terminator."""
     cleaned_word = CHATUtteranceCleaner.remove_terminator(morpheme_word)
     return cleaned_word
コード例 #26
0
 def test_remove_redundant_whitespace_empty_string(self):
     """remove_redundant_whitespaces leaves an empty string empty."""
     result = CHATUtteranceCleaner.remove_redundant_whitespaces('')
     self.assertEqual(result, '')
コード例 #27
0
 def test_remove_scoped_symbols_no_withespace(self):
     """remove_scoped_symbols with no whitespace before the scope."""
     expected = '0'
     result = CHATUtteranceCleaner.remove_scoped_symbols(
         '0[=! just testing something]')
     self.assertEqual(result, expected)
コード例 #28
0
 def test_remove_ca_satellite_marker(self):
     """remove_ca strips the satellite marker (‡)."""
     expected = 'no Mommy no go'
     result = CHATUtteranceCleaner.remove_ca('no ‡ Mommy no go')
     self.assertEqual(result, expected)
コード例 #29
0
 def clean_morph_tier(cls, morph_tier):
     """Clean the morphology tier.

     The terminator is removed first, then the non-words.
     """
     return cls.remove_non_words(
         CHATUtteranceCleaner.remove_terminator(morph_tier))
コード例 #30
0
 def test_remove_redundant_whitespace_multiple_creturns(self):
     """remove_redundant_whitespaces with runs of carriage returns."""
     expected = 'h h'
     result = CHATUtteranceCleaner.remove_redundant_whitespaces(
         '\r\rh\r\rh\r\r')
     self.assertEqual(result, expected)