Exemple #1
0
 def test_clean_string(self):
     """Check that cleaning a noisy string leaves only "legitimate text".

     The raw value is passed through ``Tools.clean_string`` (using
     ``StringProcessor.unwanted_chars``) and then ``Tools.clean_spaces``;
     the result must equal the expected plain text.
     """
     raw = "  legitimate-.,;:_·<>+\\|/'#@()\"\t\n\r!%&=?¡¿    text "

     # The instance is not used below; it is still created so the test
     # keeps exercising the constructor.
     p = Processing()

     without_symbols = Tools.clean_string(raw, StringProcessor.unwanted_chars)
     cleaned = Tools.clean_spaces(without_symbols)

     self.assertEqual("legitimate text", cleaned)
    def process(self, input_string):
        """Return the normalised, upper-cased form of ``input_string``.

        Stages, in order:
          1. ``self.trans.to_ascii`` transliterates the input.
          2. ``Tools.clean_string`` is applied with
             ``StringProcessor.unwanted_chars``.
          3. ``Tools.clean_spaces`` tidies the spacing.
          4. The result is upper-cased.
        """
        ascii_text = self.trans.to_ascii(input_string)

        # Drop the unwanted characters, then tidy the remaining spaces.
        without_symbols = Tools.clean_string(ascii_text,
                                             StringProcessor.unwanted_chars)
        tidy_text = Tools.clean_spaces(without_symbols)

        return tidy_text.upper()
Exemple #3
0
    def process(self, input_string):
        """Return a homogenised string representation of a number.

        The input is cleaned with ``Tools.clean_string`` (using
        ``NumberProcessor.unwanted_chars``) and ``Tools.clean_spaces``,
        parsed as a ``float`` and converted back to a string so that every
        parsable number shares the same representation.

        If the cleaned text cannot be parsed, a warning is logged and the
        original ``input_string`` is returned unchanged.
        """
        # Trim spaces and remove unwanted chars:
        final_str = Tools.clean_string(input_string,
                                       NumberProcessor.unwanted_chars)
        final_str = Tools.clean_spaces(final_str)

        try:
            # Parse the number to the most general one (float)
            number = float(final_str)
        except ValueError:
            # Both values are delimited so they stay distinguishable in the
            # log line, and are passed as lazy logging arguments so that
            # formatting cannot fail inside this handler.
            logging.warning(
                "Number string could not be parsed: <%s> (original) - <%s> "
                "(after cleaning)", input_string, final_str)
            # If number cannot be parsed, return original string
            return input_string

        # Turn the number to a string again (to homogenise representation)
        return str(number)
Exemple #4
0
    def process(self, input_string):
        """Return the ISO-format date string for ``input_string``.

        The input is cleaned with ``Tools.clean_string`` (using
        ``DateProcessor.unwanted_chars``) and ``Tools.clean_spaces`` and then
        handed to ``dateparser.parse``. A ``None`` result is treated as a
        parse failure, the same way as a ``TypeError`` raised while parsing
        or formatting.

        On failure a warning is logged and the original ``input_string`` is
        returned unchanged.
        """
        # Remove unwanted characters first, then tidy the spacing.
        without_symbols = Tools.clean_string(input_string,
                                             DateProcessor.unwanted_chars)
        cleaned = Tools.clean_spaces(without_symbols)

        try:
            parsed = dateparser.parse(cleaned)
            if parsed is None:
                # Route "no result" through the same failure path below.
                raise TypeError

            # Only the date part is kept, rendered in ISO format.
            return str(parsed.date().isoformat())
        except TypeError:
            message = ("Date string could not be parsed: <" +
                       input_string + "> (original) - <" + cleaned +
                       "> (after cleaning)")
            logging.warning(message)

        # Parsing failed: hand back the caller's original text.
        return input_string