def test_fix_future_year_typo(self):
     """Check fix_future_year_typo against a transposed-year date.

     A date whose year is the current year with its two middle digits
     swapped is expected to come back as the current year; the other
     sample dates are expected to come back unchanged.
     """
     this_year = str(datetime.date.today().year)
     # Swap the second and third digits to simulate the typo.
     swapped_year = ''.join(
         (this_year[0], this_year[2], this_year[1], this_year[3])
     )
     typo_date = '12/01/%s' % swapped_year
     current_date = '12/01/%s' % this_year
     cases = {
         typo_date: current_date,        # the one input that gets repaired
         current_date: current_date,     # already correct, left alone
         '12/01/2806': '12/01/2806',     # left alone
         '12/01/2886': '12/01/2886',     # left alone
     }
     for raw, expected in cases.items():
         parsed = convert_date_string(raw)
         actual = fix_future_year_typo(parsed)
         self.assertEqual(actual, convert_date_string(expected))
Ejemplo n.º 2
0
 def test_fix_future_year_typo(self):
     """Check fix_future_year_typo against a transposed-year date.

     A date whose year is the current year with its two middle digits
     swapped is expected to come back as the current year; the other
     sample dates are expected to come back unchanged. Each input runs
     in its own subTest, labelled with the input string.
     """
     correct = str(datetime.date.today().year)
     # Swap the second and third digits to simulate the typo.
     transposed = correct[0] + correct[2] + correct[1] + correct[3]
     expectations = {
         "12/01/%s" % transposed: "12/01/%s" % correct,  # Here's the fix
         "12/01/%s" % correct: "12/01/%s" % correct,  # Should not change
         "12/01/2806": "12/01/2806",  # Should not change
         "12/01/2886": "12/01/2886",  # Should not change
     }
     for before, after in expectations.items():
         # The call under test sits inside the subTest so that an exception
         # raised for one input is reported against that input and the
         # remaining inputs still run.
         with self.subTest("Future years", before=before):
             fixed_date = fix_future_year_typo(convert_date_string(before))
             self.assertEqual(fixed_date, convert_date_string(after))
Ejemplo n.º 3
0
    def _check_sanity(self):
        """Validate the scraped attributes of this object.

        Checks, in order:
            1. Every populated (non-None) attribute named in `_all_attrs`
               has the same length.
            2. Whether any items were returned at all; zero items only
               logs a warning.
            3. No attribute named in `_req_attrs` is None.
            4. No case name is empty or whitespace-only.
            5. Every member of `case_dates` is a `date` instance.
            6. No case date has a year later than next calendar year. Each
               date is first passed through `fix_future_year_typo`, and a
               changed value is written back into `case_dates`.
            7. `cookies` is a dict.

        Checks 3 and 4 are skipped when there are zero items.

        The signature of this method is subject to change as additional checks
        become convenient.

        Inheriting classes should override this method calling super to give it
        the necessary parameters.

        Returns:
            None. `case_dates` may be modified in place (see check 6).

        Raises:
            InsanityException: if any check other than the zero-item check
                fails.
        """
        lengths = {}
        for attr in self._all_attrs:
            value = getattr(self, attr)
            if value is not None:
                lengths[attr] = len(value)
        # More than one distinct length means the fields are out of step.
        if len(set(lengths.values())) > 1:
            raise InsanityException(
                "%s: Scraped meta data fields have differing"
                " lengths: %s" % (self.court_id, lengths)
            )

        if len(self.case_names) == 0:
            logger.warning("%s: Returned with zero items." % self.court_id)
        else:
            for field in self._req_attrs:
                if getattr(self, field) is None:
                    raise InsanityException(
                        "%s: Required fields do not contain any data: %s"
                        % (self.court_id, field)
                    )
            # Track the previous name so the error message can point at the
            # neighbourhood of the offending item.
            prior_case_name = None
            for i, name in enumerate(self.case_names):
                if not name.strip():
                    raise InsanityException(
                        "Item with index %s has an empty case name. The prior "
                        "item had case name of: %s" % (i, prior_case_name)
                    )
                prior_case_name = name

        # The ceiling is derived from today's date rather than a fixed year,
        # so the check does not start rejecting current dates as the calendar
        # advances. One year of slack is allowed beyond the current year.
        latest_plausible_year = date.today().year + 1
        for index, case_date in enumerate(self.case_dates):
            if not isinstance(case_date, date):
                raise InsanityException(
                    "%s: member of case_dates list not a valid date object. "
                    "Instead it is: %s with value: %s"
                    % (self.court_id, type(case_date), case_date)
                )
            # Sanitize case date, fix typo of current year if present
            fixed_date = fix_future_year_typo(case_date)
            if fixed_date != case_date:
                logger.info(
                    "Date year typo detected. Converting %s to %s "
                    "for case '%s' in %s"
                    % (
                        case_date,
                        fixed_date,
                        self.case_names[index],
                        self.court_id,
                    )
                )
                case_date = fixed_date
                self.case_dates[index] = fixed_date
            if case_date.year > latest_plausible_year:
                raise InsanityException(
                    "%s: member of case_dates list is from way in the future, "
                    "with value %s" % (self.court_id, case_date.year)
                )

        # Is cookies a dict? (dict subclasses are accepted.)
        if not isinstance(self.cookies, dict):
            raise InsanityException(
                "self.cookies not set to be a dict by scraper."
            )
        logger.info(
            "%s: Successfully found %s items."
            % (self.court_id, len(self.case_names))
        )
Ejemplo n.º 4
0
    def _check_sanity(self):
        """Validate the scraped attributes of this object.

        Checks, in order:
            1. Every populated (non-None) attribute named in `_all_attrs`
               has the same length.
            2. Whether any items were returned at all; zero items only
               logs a warning.
            3. No attribute named in `_req_attrs` is None.
            4. No case name is empty or whitespace-only.
            5. Every member of `case_dates` is a `date` instance.
            6. No case date has a year later than next calendar year. Each
               date is first passed through `fix_future_year_typo`, and a
               changed value is written back into `case_dates`.
            7. `cookies` is a dict.

        Checks 3 and 4 are skipped when there are zero items.

        The signature of this method is subject to change as additional checks
        become convenient.

        Inheriting classes should override this method calling super to give it
        the necessary parameters.

        Returns:
            None. `case_dates` may be modified in place (see check 6).

        Raises:
            InsanityException: if any check other than the zero-item check
                fails.
        """
        lengths = {}
        for attr in self._all_attrs:
            value = getattr(self, attr)
            if value is not None:
                lengths[attr] = len(value)
        # More than one distinct length means the fields are out of step.
        if len(set(lengths.values())) > 1:
            raise InsanityException("%s: Scraped meta data fields have differing"
                                    " lengths: %s" % (self.court_id, lengths))

        if len(self.case_names) == 0:
            logger.warning('%s: Returned with zero items.' % self.court_id)
        else:
            for field in self._req_attrs:
                if getattr(self, field) is None:
                    raise InsanityException(
                        '%s: Required fields do not contain any data: %s'
                        % (self.court_id, field)
                    )
            # Track the previous name so the error message can point at the
            # neighbourhood of the offending item.
            prior_case_name = None
            for i, name in enumerate(self.case_names):
                if not name.strip():
                    raise InsanityException(
                        "Item with index %s has an empty case name. The prior "
                        "item had case name of: %s" % (i, prior_case_name)
                    )
                prior_case_name = name

        # The ceiling is derived from today's date rather than a fixed year,
        # so the check does not start rejecting current dates as the calendar
        # advances. One year of slack is allowed beyond the current year.
        latest_plausible_year = date.today().year + 1
        for index, case_date in enumerate(self.case_dates):
            if not isinstance(case_date, date):
                raise InsanityException(
                    '%s: member of case_dates list not a valid date object. '
                    'Instead it is: %s with value: %s' % (
                        self.court_id, type(case_date), case_date)
                )
            # Sanitize case date, fix typo of current year if present
            fixed_date = fix_future_year_typo(case_date)
            if fixed_date != case_date:
                logger.info(
                    "Date year typo detected. Converting %s to %s "
                    "for case '%s' in %s" % (
                        case_date, fixed_date,
                        self.case_names[index], self.court_id)
                )
                case_date = fixed_date
                self.case_dates[index] = fixed_date
            if case_date.year > latest_plausible_year:
                raise InsanityException(
                    '%s: member of case_dates list is from way in the future, '
                    'with value %s' % (self.court_id, case_date.year)
                )

        # Is cookies a dict? (dict subclasses are accepted.)
        if not isinstance(self.cookies, dict):
            raise InsanityException('self.cookies not set to be a dict by '
                                    'scraper.')
        logger.info("%s: Successfully found %s items." % (self.court_id,
                                                          len(self.case_names)))