Beispiel #1
0
    def validate(self, schema=None):
        """
        Check this object's dict form against a JSON Schema (Draft 3).

        When ``schema`` is None the instance's own ``_schema`` is used.
        Every violation reported by the validator is collected, and if
        there is at least one a single `ScrapeValueError` listing all of
        them is raised.  Nothing is returned on success.

        Two schema type names get custom checks:

        * ``datetime`` accepts ``datetime.date`` and ``datetime.datetime``
        * ``date`` accepts ``datetime.date`` but not ``datetime.datetime``

        As noted by the original authors, the schemas are written so that
        a field without ``required`` is optional.
        TODO: the earlier docstring flagged this note with a FIXME.
        """
        active_schema = self._schema if schema is None else schema

        def _is_datetime(checker, instance):
            return isinstance(instance, (datetime.date, datetime.datetime))

        def _is_plain_date(checker, instance):
            # datetime.datetime subclasses datetime.date, so exclude it
            # explicitly.
            if isinstance(instance, datetime.datetime):
                return False
            return isinstance(instance, datetime.date)

        checker = Draft3Validator.TYPE_CHECKER.redefine(
            "datetime", _is_datetime)
        checker = checker.redefine("date", _is_plain_date)

        validator_cls = jsonschema.validators.extend(
            Draft3Validator, type_checker=checker)
        validator = validator_cls(active_schema,
                                  format_checker=FormatChecker())

        messages = [str(problem)
                    for problem in validator.iter_errors(self.as_dict())]
        if not messages:
            return

        details = '\n\t' + '\n\t'.join(messages)
        raise ScrapeValueError('validation of {} {} failed: {}'.format(
            self.__class__.__name__, self._id, details))
Beispiel #2
0
    def validate(self, schema=None):
        """
        Validate that we have a valid object.

        The object's dict form (``self.as_dict()``) is checked against
        ``schema`` (or ``self._schema`` when ``schema`` is None) using a
        Draft 3 validator with format checking enabled.  The schema type
        name ``datetime`` accepts both ``datetime.date`` and
        ``datetime.datetime`` values.

        On error, this will raise a `ScrapeValueError` whose message lists
        every validation error found.  Nothing is returned on success.

        As noted by the original authors, the schemas are written so that
        a field without ``required`` is optional.
        TODO: the earlier docstring flagged this note with a FIXME.
        """
        # Local import: the validator's ``types=`` keyword was deprecated
        # in jsonschema 3.0 and later removed, so the custom type has to be
        # registered through a TypeChecker on an extended validator class.
        from jsonschema.validators import extend

        if schema is None:
            schema = self._schema

        type_checker = Draft3Validator.TYPE_CHECKER.redefine(
            'datetime',
            lambda checker, instance: isinstance(
                instance, (datetime.date, datetime.datetime)))
        validator_cls = extend(Draft3Validator, type_checker=type_checker)
        validator = validator_cls(schema, format_checker=FormatChecker())

        errors = [
            str(error) for error in validator.iter_errors(self.as_dict())
        ]
        if errors:
            raise ScrapeValueError('validation of {} {} failed: {}'.format(
                self.__class__.__name__, self._id,
                '\n\t' + '\n\t'.join(errors)))
Beispiel #3
0
    def __init__(self, *, motion_text, start_date, classification, result,
                 legislative_session=None, identifier='', bill=None,
                 bill_chamber=None, bill_action=None, organization=None,
                 chamber=None):
        """
        Build a vote event; all arguments are keyword-only.

        ``classification`` is passed through ``cleanup_list`` and stored as
        ``motion_classification``.  ``bill`` and ``bill_chamber`` are handed
        to ``set_bill``.  If no ``legislative_session`` is given and
        ``bill`` is a `Bill` instance, the bill's session is used.
        ``organization`` / ``chamber`` are resolved with
        ``pseudo_organization`` using ``'legislature'`` as the default.

        Raises `ScrapeValueError` when no legislative session can be
        determined from either ``legislative_session`` or ``bill``.
        """
        super(VoteEvent, self).__init__()

        self.legislative_session = legislative_session
        self.motion_text = motion_text
        self.motion_classification = cleanup_list(classification, [])
        self.start_date = start_date
        self.result = result
        self.identifier = identifier
        self.bill_action = bill_action

        self.set_bill(bill, chamber=bill_chamber)

        if not self.legislative_session:
            # No explicit session: borrow it from the Bill object, if any.
            if isinstance(bill, Bill):
                self.legislative_session = bill.legislative_session
            if not self.legislative_session:
                raise ScrapeValueError('must set legislative_session or bill')

        self.organization = pseudo_organization(
            organization, chamber, 'legislature')
        self.votes = []
        self.counts = []
Beispiel #4
0
    def add_entity(self, name, entity_type, *, id, note):
        """
        Append a related-entity record to ``self['related_entities']``.

        The record always carries ``name``, ``entity_type`` and ``note``.
        When ``id`` is truthy it is stored under ``'id'``.  Otherwise, if
        ``entity_type`` is truthy, a pseudo id is generated and stored
        under ``'<entity_type>_id'``: organizations and people are keyed by
        ``name``, bills and vote events by ``identifier``.

        Raises `ScrapeValueError` for any other truthy ``entity_type`` when
        no ``id`` is supplied; in that case nothing is appended.
        """
        entry = {"name": name, "entity_type": entity_type, "note": note}

        if id:
            entry['id'] = id
        elif entity_type:
            if entity_type in ('organization', 'person'):
                pseudo_kwargs = {'name': name}
            elif entity_type in ('bill', 'vote_event'):
                pseudo_kwargs = {'identifier': name}
            else:
                raise ScrapeValueError(
                    'attempt to call add_entity with unsupported '
                    'entity type: {}'.format(entity_type))
            entry[entity_type + '_id'] = _make_pseudo_id(**pseudo_kwargs)

        self['related_entities'].append(entry)
Beispiel #5
0
 def set_bill(self, bill_or_identifier, *, chamber=None):
     """
     Point ``self.bill`` at a bill.

     * a falsy argument clears the reference (``self.bill = None``)
     * a `Bill` instance stores that object's ``_id``; passing a truthy
       ``chamber`` as well raises `ScrapeValueError`
     * anything else is treated as an identifier and turned into a
       pseudo id, scoped to ``chamber`` (``'legislature'`` when
       ``chamber`` is None)
     """
     if not bill_or_identifier:
         self.bill = None
         return

     if isinstance(bill_or_identifier, Bill):
         if chamber:
             raise ScrapeValueError(
                 "set_bill takes no arguments when using a `Bill` object")
         self.bill = bill_or_identifier._id
         return

     classification = 'legislature' if chamber is None else chamber
     self.bill = _make_pseudo_id(
         identifier=bill_or_identifier,
         from_organization__classification=classification)
Beispiel #6
0
def pseudo_organization(organization, classification, default=None):
    """
    Resolve an organization reference to an id.

    Exactly one of ``organization`` / ``classification`` may be truthy;
    supplying both raises `ScrapeValueError`.

    * ``classification`` -> pseudo id built from that classification
    * ``organization`` as an `Organization` -> its ``_id``
    * ``organization`` as a ``str`` -> returned unchanged
    * any other truthy ``organization`` -> expanded as keyword arguments
      into a pseudo id
    * neither given -> pseudo id for ``default`` if it is not None,
      otherwise None
    """
    if organization and classification:
        raise ScrapeValueError('cannot specify both classification and organization')

    if classification:
        return _make_pseudo_id(classification=classification)

    if organization:
        if isinstance(organization, Organization):
            return organization._id
        if isinstance(organization, str):
            return organization
        return _make_pseudo_id(**organization)

    if default is None:
        return None
    return _make_pseudo_id(classification=default)
Beispiel #7
0
    def validate(self, schema=None):
        """
        Check this object's dict form against a schema.

        ``schema`` defaults to the instance's ``_schema``.  Validation is
        delegated to ``utils.DatetimeValidator`` configured with
        ``required_by_default=False`` (a field is optional unless the
        schema says otherwise) and ``fail_fast=False``.

        A `ValidationError` from the validator is re-raised as a
        `ScrapeValueError` naming this object's class and ``_id``.
        """
        target_schema = self._schema if schema is None else schema
        checker = utils.DatetimeValidator(required_by_default=False,
                                          fail_fast=False)

        try:
            checker.validate(self.as_dict(), target_schema)
        except ValidationError as exc:
            message = 'validation of {} {} failed: {}'.format(
                self.__class__.__name__, self._id, exc)
            raise ScrapeValueError(message)
Beispiel #8
0
    def _add_associated_link(self,
                             collection,
                             note,
                             url,
                             *,
                             media_type,
                             text,
                             on_duplicate,
                             date=''):
        if on_duplicate not in ['error', 'ignore']:
            raise ScrapeValueError("on_duplicate must be 'error' or 'ignore'")

        try:
            associated = getattr(self, collection)
        except AttributeError:
            associated = self[collection]

        ver = {'note': note, 'links': [], 'date': date}

        # keep a list of the links we've seen, we need to iterate over whole list on each add
        # unfortunately this means adds are O(n)
        seen_links = set()

        matches = 0
        for item in associated:
            for link in item['links']:
                seen_links.add(link['url'])

            if all(ver.get(x) == item.get(x) for x in ["note", "date"]):
                matches = matches + 1
                ver = item

        # it should be impossible to have multiple matches found unless someone is bypassing
        # _add_associated_link
        assert matches <= 1, "multiple matches found in _add_associated_link"

        if url in seen_links:
            if on_duplicate == 'error':
                raise ScrapeValueError("Duplicate entry in '%s' - URL: '%s'" %
                                       (collection, url))
            else:
                # This means we're in ignore mode. This situation right here
                # means we should *skip* adding this link silently and continue
                # on with our scrape. This should *ONLY* be used when there's
                # a site issue (Version 1 == Version 2 because of a bug) and
                # *NEVER* because "Current" happens to match "Version 3". Fix
                # that in the scraper, please.
                #  - PRT
                return None

        # OK. This is either new or old. Let's just go for it.
        ret = {'url': url, 'media_type': media_type, 'text': text}

        ver['links'].append(ret)

        if matches == 0:
            # in the event we've got a new entry; let's just insert it into
            # the versions on this object. Otherwise it'll get thrown in
            # automagically.
            associated.append(ver)

        return ver
Beispiel #9
0
 def __setattr__(self, key, val):
     """
     Set an attribute, restricting public names to schema properties.

     Names starting with an underscore are always allowed.  Any other
     name must be a key of ``self._schema['properties']``; otherwise a
     `ScrapeValueError` is raised and nothing is set.
     """
     is_public = key[0] != '_'
     if is_public and key not in self._schema['properties']:
         message = 'property "{}" not in {} schema'.format(key, self._type)
         raise ScrapeValueError(message)
     super(BaseModel, self).__setattr__(key, val)