def remove_dloc(cls, morph_tier):
    """Strip all `dloc|dloc=DISLOC` words from the morphology tier.

    `dloc|dloc=DISLOC` is the morphology word standing for `„` on the
    utterance. Whitespace left behind by the removal is normalized.
    """
    dloc_word = 'dloc|dloc=DISLOC'
    return CHATUtteranceCleaner.remove_redundant_whitespaces(
        morph_tier.replace(dloc_word, ''))
def remove_non_words(cls, morph_tier):
    """Drop every non-word from the morphology tier.

    A non-word is any morphology word carrying the POS tag 'tag', i.e.
    `tag|` followed by a run of non-whitespace characters. Whitespace
    left behind by the removal is normalized.
    """
    without_tags = re.sub(r'tag\|\S+', '', morph_tier)
    return CHATUtteranceCleaner.remove_redundant_whitespaces(without_tags)
def add_repetitions(cls, raw_utt, morph_tier):
    """Add repetitions to morphology tier.

    For every utterance word carrying a `[x N]` marker, the aligned
    morphology word is emitted N times. If the marker closes a group
    opened by a word starting with `<`, the whole group of aligned
    morphology words is emitted N times instead. A single-word group
    such as `<word> [x N]` is handled like a plain repeated word.

    Args:
        raw_utt (str): The raw (uncleaned) utterance.
        morph_tier (str): The morphology tier, words separated by spaces.

    Returns:
        str: The morphology tier with repetitions expanded. The tier is
        returned unchanged when the utterance contains no `[x ` marker
        or when utterance and morphology words cannot be aligned
        one-to-one.
    """
    # nothing to do when there are no repetitions at all
    if '[x ' not in raw_utt:
        return morph_tier

    # execute same cleaning steps except those for scoped symbols
    for cleaning_method in [
            CHATUtteranceCleaner.remove_terminator,
            CHATUtteranceCleaner.unify_untranscribed,
            CHATUtteranceCleaner.remove_events,
            CHATUtteranceCleaner.remove_omissions,
            CHATUtteranceCleaner.remove_linkers,
            CHATUtteranceCleaner.remove_separators,
            CHATUtteranceCleaner.remove_ca,
            CHATUtteranceCleaner.remove_pauses_between_words,
            CHATUtteranceCleaner.remove_commas,
            CHATUtteranceCleaner.null_event_utterances
    ]:
        raw_utt = cleaning_method(raw_utt)

    # remove scoped symbols except for repetitions
    raw_utt = re.sub(r'\[[^x].*?\]', '', raw_utt)
    raw_utt = CHATUtteranceCleaner.remove_redundant_whitespaces(raw_utt)

    # split at spaces, but keep a repetition marker attached to the
    # word preceding it (no split right before or inside `[x N]`)
    utt_words = re.split(r'(?<!\[x) (?!\[x)', raw_utt)
    morph_words = morph_tier.split(' ')

    # misaligned tiers cannot be expanded reliably
    if len(utt_words) != len(morph_words):
        return morph_tier

    repetition_regex = re.compile(r'\[x (\d+)')
    morph_new = []
    group = []
    for uw, mw in zip(utt_words, morph_words):
        morph_new.append(mw)

        if uw.startswith('<'):
            # a new group starts with this word
            group = [mw]
        elif group:
            # word belongs to the currently open group
            group.append(mw)

        # The marker is checked independently of the group handling
        # above, so that a word both opening and closing a group
        # (`<word> [x N]`) gets repeated and does not leave the group
        # open for the following words.
        match = repetition_regex.search(uw)
        if match:
            reps = int(match.group(1))
            # the unit was already added once, hence `reps - 1`
            morph_new += (reps - 1) * (group or [mw])
            group = []

    return ' '.join(morph_new)
def remove_non_words(cls, morph_tier):
    """Drop every non-word from the morphology tier.

    The following morphology words count as non-words:
        end|end („)
        cm|cm (,)
        bq|bq (“)
        eq|eq (”)

    Whitespace left behind by the removal is normalized.
    """
    non_word_pattern = r'end\|end|cm\|cm|bq\|bq|eq\|eq'
    stripped_tier = re.sub(non_word_pattern, '', morph_tier)
    return CHATUtteranceCleaner.remove_redundant_whitespaces(stripped_tier)
def remove_double_hashes(cls, morph_tier):
    """Remove ## from the morphology tier.

    Only stand-alone `##` words are removed, i.e. those delimited by a
    space or the string boundary on both sides. Whitespace left behind
    by the removal is normalized.

    Args:
        morph_tier (str): The morphology tier.

    Returns:
        str: The morphology tier without stand-alone `##` words.
    """
    # Lookarounds are used instead of consuming groups: matches cannot
    # overlap, so if the delimiting space were consumed, every second
    # `##` in a run like `## ## ##` would be skipped.
    morph_tier = re.sub(r'(?<![^ ])##(?= |$)', '', morph_tier)
    return CHATUtteranceCleaner.remove_redundant_whitespaces(morph_tier)
def test_remove_redundant_whitespace_leading_trailing_whitespace(self):
    """Leading and trailing spaces must be stripped."""
    raw = ' h '
    expected = 'h'
    self.assertEqual(
        CHATUtteranceCleaner.remove_redundant_whitespaces(raw), expected)
def test_remove_redundant_whitespace_empty_string(self):
    """An empty string must stay an empty string."""
    raw = ''
    expected = ''
    self.assertEqual(
        CHATUtteranceCleaner.remove_redundant_whitespaces(raw), expected)
def test_remove_redundant_whitespace_combinations(self):
    """Mixed newlines, tabs, spaces and carriage returns collapse."""
    raw = '\n\t \r\r h \nh \t\t\n\r'
    expected = 'h h'
    self.assertEqual(
        CHATUtteranceCleaner.remove_redundant_whitespaces(raw), expected)
def test_remove_redundant_whitespace_multiple_creturns(self):
    """Runs of carriage returns collapse to one space or vanish."""
    raw = '\r\rh\r\rh\r\r'
    expected = 'h h'
    self.assertEqual(
        CHATUtteranceCleaner.remove_redundant_whitespaces(raw), expected)
def test_remove_redundant_whitespace_leading_trailing_creturns(self):
    """Leading and trailing carriage returns must be stripped."""
    raw = '\rh\r'
    expected = 'h'
    self.assertEqual(
        CHATUtteranceCleaner.remove_redundant_whitespaces(raw), expected)
def test_remove_redundant_whitespace_multiple_newlines(self):
    """Runs of newlines collapse to one space or vanish."""
    raw = '\n\nh\n\nh\n\n'
    expected = 'h h'
    self.assertEqual(
        CHATUtteranceCleaner.remove_redundant_whitespaces(raw), expected)
def test_remove_redundant_whitespace_multiple_tabs(self):
    """Runs of tabs collapse to one space or vanish."""
    raw = '\t\th\t\th\t\t'
    expected = 'h h'
    self.assertEqual(
        CHATUtteranceCleaner.remove_redundant_whitespaces(raw), expected)
def remove_timestamp(cls, translation):
    """Strip timestamps from the Sesotho translation tier.

    A timestamp is a run of digits, an underscore and another run of
    digits. Whitespace left behind by the removal is normalized.
    """
    without_timestamps = re.sub(r'[0-9]+_[0-9]+', '', translation)
    return CHATUtteranceCleaner.remove_redundant_whitespaces(
        without_timestamps)