def test_clean_string(self):
    """Check that the string-cleaning helpers reduce a noisy input to plain text.

    The input is passed through ``Tools.clean_string`` with
    ``StringProcessor.unwanted_chars`` and then through ``Tools.clean_spaces``;
    the result must equal ``"legitimate text"``.
    """
    noisy_text = " legitimate-.,;:_·<>+\\|/'#@()\"\t\n\r!%&=?¡¿ text "
    wanted_text = "legitimate text"
    # NOTE(review): this instance is never used by the assertions below; it is
    # kept only because its constructor is not visible from here -- confirm
    # whether it can be dropped.
    processing = Processing()
    cleaned_text = Tools.clean_spaces(
        Tools.clean_string(noisy_text, StringProcessor.unwanted_chars)
    )
    self.assertEqual(wanted_text, cleaned_text)
def process(self, input_string):
    """Return a normalised, upper-cased form of ``input_string``.

    Steps, in order:
      1. transliterate with ``self.trans.to_ascii``;
      2. clean with ``Tools.clean_string`` using
         ``StringProcessor.unwanted_chars``;
      3. tidy spacing with ``Tools.clean_spaces``;
      4. upper-case the result.

    Args:
        input_string: Text to normalise.

    Returns:
        The upper-cased result of the pipeline above.
    """
    ascii_text = self.trans.to_ascii(input_string)
    without_noise = Tools.clean_string(ascii_text, StringProcessor.unwanted_chars)
    return Tools.clean_spaces(without_noise).upper()
def process(self, input_string):
    """Normalise a numeric string to a single canonical representation.

    The input is cleaned with ``Tools.clean_string`` (using
    ``NumberProcessor.unwanted_chars``) and ``Tools.clean_spaces``, parsed
    with ``float`` (the most general numeric type used here) and turned back
    into text with ``str`` so that equivalent spellings such as ``"1"`` and
    ``"1.0"`` come out identical.

    Args:
        input_string: Raw text expected to contain a number.

    Returns:
        ``str(float(cleaned))`` when the cleaned text parses as a float;
        otherwise ``input_string`` unchanged, after logging a warning.
    """
    cleaned = Tools.clean_spaces(
        Tools.clean_string(input_string, NumberProcessor.unwanted_chars)
    )
    try:
        # Only the parse itself can raise ValueError, so keep the try minimal.
        number = float(cleaned)
    except ValueError:
        # Delimit both values so the log stays readable even when they contain
        # spaces; lazy %-args also avoid building the message eagerly.
        logging.warning(
            "Number string could not be parsed: <%s> (original) - <%s> (after cleaning)",
            input_string,
            cleaned,
        )
        # Best effort: hand back the caller's original text untouched.
        return input_string
    return str(number)
def process(self, input_string):
    """Normalise a date string to ISO format (``YYYY-MM-DD``).

    The input is cleaned with ``Tools.clean_string`` (using
    ``DateProcessor.unwanted_chars``) and ``Tools.clean_spaces``, then parsed
    with ``dateparser.parse``. Only the date part of the parsed value is kept.

    Args:
        input_string: Raw text expected to contain a date.

    Returns:
        The ISO-formatted date when parsing succeeds; otherwise
        ``input_string`` unchanged, after logging a warning.
    """
    cleaned = Tools.clean_spaces(
        Tools.clean_string(input_string, DateProcessor.unwanted_chars)
    )
    try:
        date_object = dateparser.parse(cleaned)
    except TypeError:
        # A TypeError from the parser is handled like "no date found".
        date_object = None

    if date_object is None:
        logging.warning(
            "Date string could not be parsed: <%s> (original) - <%s> (after cleaning)",
            input_string,
            cleaned,
        )
        # Best effort: hand back the caller's original text untouched.
        return input_string

    # isoformat() already returns a str, so no extra conversion is needed.
    return date_object.date().isoformat()