Ejemplo n.º 1
0
    def scrape(self, window=28) :
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        for matter in self.matters(n_days_ago) :
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)) :
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Board of Directors"})
            
            legistar_web = matter['legistar_url']
            
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id) :
                act = bill.add_action(**action)

                if action['description'] == 'Referred' :
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(body_name,
                                           'organization',
                                           entity_id = _make_pseudo_id(name=body_name))

                result, votes = vote
                if result :
                    vote_event = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action['description'],
                                           organization=action['organization'],
                                           classification=None,
                                           start_date=action['date'],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes :
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option, 
                                        vote['VotePersonName'].strip())

                    yield vote_event


            for sponsorship in self.sponsorships(matter_id) :
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id) :
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill. 
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(identifier=identifier,
                                          legislative_session=related_bill_session,
                                          relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link('Board Report',
                                  'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
                                   media_type="application/pdf")

            for attachment in self.attachments(matter_id) :
                if attachment['MatterAttachmentName'] :
                    bill.add_document_link(attachment['MatterAttachmentName'],
                                           attachment['MatterAttachmentHyperlink'],
                                           media_type="application/pdf")

            bill.extras = {'local_classification' : matter['MatterTypeName']}

            text = self.text(matter_id)

            if text :
                if text['MatterTextPlain'] :
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf'] :
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

            yield bill
Ejemplo n.º 2
0
    def scrape_chamber(self, chamber, session):
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + 'bill_status/').text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(
                bill_id,
                session,
                title,
                chamber=chamber,
                classification=btype,
            )
            bill.extras = {'status': bill_data['STATUS']}

            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if (bill_data['LONGTITLE']
                    and bill_data['LONGTITLE'] != bill.title):
                bill.add_title(bill_data['LONGTITLE'])

            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1 else
                         'cosponsor')
                if sponsor:
                    bill.add_sponsorship(
                        name=sponsor,
                        entity_type='person',
                        primary=stype == 'primary',
                        classification=stype,
                    )

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate' else 'lower')

                date = datetime.datetime.strptime(event['occurred_datetime'],
                                                  "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning(
                        'unknown action code on %s: %s %s' %
                        (bill_id, event['action_code'], event['status']))
                    atype = None
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(action,
                                date.strftime('%Y-%m-%d'),
                                chamber=actor,
                                classification=atype)

            try:
                yield from self.scrape_html(bill, session)
            except scrapelib.HTTPError as e:
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill['bill_id']))

            yield bill
Ejemplo n.º 3
0
    def scrape(self, window=3):
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in self.matters(n_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            # Temporarily, we should not scrape or import these bills:
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=3291304&GUID=72ACF5FE-0803-46E8-90B4-604119803293
            # They have duplicate action items, which cause the entire scrape
            # to fail. The Chicago clerk's office should fix it in the near
            # future, after which we can remove this code.
            problem_bills = ['CL2017-1281']
            if identifier in problem_bills:
                continue

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            legistar_web = matter['legistar_url']

            legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
                matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                responsible_person = action.pop('responsible person')
                act = bill.add_action(**action)

                if responsible_person:
                    act.add_related_entity(
                        responsible_person,
                        'person',
                        entity_id=_make_pseudo_id(name=responsible_person))

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council':
                        act.add_related_entity(
                            body_name,
                            'organization',
                            entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_version_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Ejemplo n.º 4
0
    def scrape(self, session=None, chamber=None):
        bill_type_map = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
        }

        chamber_map = {
            'H': 'lower',
            'S': 'upper',
            'J': 'joint',
            'E': 'legislature',  # Effective date
        }

        action_code_map = {
            'HI': None,
            'SI': None,
            'HH': None,
            'SH': None,
            'HPF': ['introduction'],
            'HDSAS': None,
            'SPF': ['introduction'],
            'HSR': ['reading-2'],
            'SSR': ['reading-2'],
            'HFR': ['reading-1'],
            'SFR': ['reading-1'],
            'HRECM': ['withdrawal', 'referral-committee'],
            'SRECM': ['withdrawal', 'referral-committee'],
            'SW&C': ['withdrawal', 'referral-committee'],
            'HW&C': ['withdrawal', 'referral-committee'],
            'HRA': ['passage'],
            'SRA': ['passage'],
            'HPA': ['passage'],
            'HRECO': None,
            'SPA': ['passage'],
            'HTABL': None,  # 'House Tabled' - what is this?
            'SDHAS': None,
            'HCFR': ['committee-passage-favorable'],
            'SCFR': ['committee-passage-favorable'],
            'HRAR': ['referral-committee'],
            'SRAR': ['referral-committee'],
            'STR': ['reading-3'],
            'SAHAS': None,
            'SE': ['passage'],
            'SR': ['referral-committee'],
            'HTRL': ['reading-3', 'failure'],
            'HTR': ['reading-3'],
            'S3RLT': ['reading-3', 'failure'],
            'HASAS': None,
            'S3RPP': None,
            'STAB': None,
            'SRECO': None,
            'SAPPT': None,
            'HCA': None,
            'HNOM': None,
            'HTT': None,
            'STT': None,
            'SRECP': None,
            'SCRA': None,
            'SNOM': None,
            'S2R': ['reading-2'],
            'H2R': ['reading-2'],
            'SENG': ['passage'],
            'HENG': ['passage'],
            'HPOST': None,
            'HCAP': None,
            'SDSG': ['executive-signature'],
            'SSG': ['executive-receipt'],
            'Signed Gov': ['executive-signature'],
            'HDSG': ['executive-signature'],
            'HSG': ['executive-receipt'],
            'EFF': None,
            'HRP': None,
            'STH': None,
            'HTS': None,
        }

        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)
        sid = SESSION_SITE_IDS[session]

        legislation = backoff(self.lservice.GetLegislationForSession,
                              sid)['LegislationIndex']

        for leg in legislation:
            lid = leg['Id']
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument['StatusHistory'][0]]

            actions = reversed([{
                'code': x['Code'],
                'action': x['Description'],
                '_guid': x['Id'],
                'date': x['Date']
            } for x in history])

            guid = instrument['Id']

            # A little bit hacky.
            bill_prefix = instrument['DocumentType']
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = '%s %s' % (
                bill_prefix,
                instrument['Number'],
            )
            if instrument['Suffix']:
                bill_id += instrument['Suffix']

            title = instrument['Caption']
            description = instrument['Summary']

            if title is None:
                continue

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=bill_chamber,
                        title=title,
                        classification=bill_type)
            bill.add_abstract(description, note='description')
            bill.extras = {'guid': guid}

            if instrument['Votes']:
                for vote_ in instrument['Votes']:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                    vote = VoteEvent(
                        start_date=vote_['Date'].strftime('%Y-%m-%d'),
                        motion_text=vote_['Caption'] or 'Vote on Bill',
                        chamber={
                            'House': 'lower',
                            'Senate': 'upper'
                        }[vote_['Branch']],
                        result='pass'
                        if vote_['Yeas'] > vote_['Nays'] else 'fail',
                        classification='passage',
                        bill=bill,
                    )
                    vote.set_count('yes', vote_['Yeas'])
                    vote.set_count('no', vote_['Nays'])
                    vote.set_count('other',
                                   vote_['Excused'] + vote_['NotVoting'])

                    vote.add_source(self.vsource)

                    methods = {'Yea': 'yes', 'Nay': 'no'}

                    for vdetail in vote_['Votes'][0]:
                        whom = vdetail['Member']
                        how = vdetail['MemberVoted']
                        vote.vote(methods.get(how, 'other'), whom['Name'])

                    yield vote

            ccommittees = defaultdict(list)
            committees = instrument['Committees']
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        'House': 'lower',
                        'Senate': 'upper',
                    }[committee['Type']]].append(committee['Name'])

            for action in actions:
                action_chamber = chamber_map[action['code'][0]]

                try:
                    action_types = action_code_map[action['code']]
                except KeyError:
                    error_msg = 'Code {code} for action {action} not recognized.'.format(
                        code=action['code'], action=action['action'])

                    self.logger.warning(error_msg)

                    action_types = None

                committees = []
                if action_types and any(
                    ('committee' in x for x in action_types)):
                    committees = [
                        str(x) for x in ccommittees.get(action_chamber, [])
                    ]

                act = bill.add_action(action['action'],
                                      action['date'].strftime('%Y-%m-%d'),
                                      classification=action_types,
                                      chamber=action_chamber)
                for committee in committees:
                    act.add_related_entity(committee, 'organization')
                act.extras = {
                    'code': action['code'],
                    'guid': action['_guid'],
                }

            sponsors = []
            if instrument['Authors']:
                sponsors = instrument['Authors']['Sponsorship']
                if 'Sponsors' in instrument and instrument['Sponsors']:
                    sponsors += instrument['Sponsors']['Sponsorship']

            sponsors = [(x['Type'], self.get_member(x['MemberId']))
                        for x in sponsors]

            for typ, sponsor in sponsors:
                name = '{First} {Last}'.format(**dict(sponsor['Name']))
                bill.add_sponsorship(
                    name,
                    entity_type='person',
                    classification='primary'
                    if 'Author' in typ else 'secondary',
                    primary='Author' in typ,
                )

            for version in instrument['Versions']['DocumentDescription']:
                name, url, doc_id, version_id = [
                    version[x]
                    for x in ['Description', 'Url', 'Id', 'Version']
                ]
                link = bill.add_version_link(name,
                                             url,
                                             media_type='application/pdf')
                link['extras'] = {
                    '_internal_document_id': doc_id,
                    '_version_id': version_id
                }

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(
                SOURCE_URL.format(**{
                    'session': session,
                    'bid': guid,
                }))

            yield bill
Ejemplo n.º 5
0
    def scrape(self):
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Ejemplo n.º 6
0
    def scrape_chamber(self, chamber, session):
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + 'bill_status/').text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(
                bill_id,
                session,
                title,
                chamber=chamber,
                classification=btype,
            )
            bill.extras = {'status': bill_data['STATUS']}

            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if (bill_data['LONGTITLE'] and
                    bill_data['LONGTITLE'] != bill.title):
                bill.add_title(bill_data['LONGTITLE'])

            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                if sponsor:
                    bill.add_sponsorship(
                        name=sponsor,
                        entity_type='person',
                        primary=stype == 'primary',
                        classification=stype,
                    )

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')

                date = datetime.datetime.strptime(event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = None
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(
                    action, date.strftime('%Y-%m-%d'), chamber=actor, classification=atype)

            try:
                yield from self.scrape_html(bill, session)
            except scrapelib.HTTPError as e:
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill['bill_id']))

            yield bill
Ejemplo n.º 7
0
    def scrape(self, session=None, chamber=None):
        bill_type_map = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
        }

        chamber_map = {
            'H': 'lower',
            'S': 'upper',
            'J': 'joint',
            'E': 'legislature',  # Effective date
        }

        action_code_map = {
            'HI': None,
            'SI': None,
            'HH': None,
            'SH': None,
            'HPF': ['introduction'],
            'HDSAS': None,
            'SPF': ['introduction'],
            'HSR': ['reading-2'],
            'SSR': ['reading-2'],
            'HFR': ['reading-1'],
            'SFR': ['reading-1'],
            'HRECM': ['withdrawal', 'referral-committee'],
            'SRECM': ['withdrawal', 'referral-committee'],
            'SW&C': ['withdrawal', 'referral-committee'],
            'HW&C': ['withdrawal', 'referral-committee'],
            'HRA': ['passage'],
            'SRA': ['passage'],
            'HPA': ['passage'],
            'HRECO': None,
            'SPA': ['passage'],
            'HTABL': None,  # 'House Tabled' - what is this?
            'SDHAS': None,
            'HCFR': ['committee-passage-favorable'],
            'SCFR': ['committee-passage-favorable'],
            'HRAR': ['referral-committee'],
            'SRAR': ['referral-committee'],
            'STR': ['reading-3'],
            'SAHAS': None,
            'SE': ['passage'],
            'SR': ['referral-committee'],
            'HTRL': ['reading-3', 'failure'],
            'HTR': ['reading-3'],
            'S3RLT': ['reading-3', 'failure'],
            'HASAS': None,
            'S3RPP': None,
            'STAB': None,
            'SRECO': None,
            'SAPPT': None,
            'HCA': None,
            'HNOM': None,
            'HTT': None,
            'STT': None,
            'SRECP': None,
            'SCRA': None,
            'SNOM': None,
            'S2R': ['reading-2'],
            'H2R': ['reading-2'],
            'SENG': ['passage'],
            'HENG': ['passage'],
            'HPOST': None,
            'HCAP': None,
            'SDSG': ['executive-signature'],
            'SSG': ['executive-receipt'],
            'Signed Gov': ['executive-signature'],
            'HDSG': ['executive-signature'],
            'HSG': ['executive-receipt'],
            'EFF': None,
            'HRP': None,
            'STH': None,
            'HTS': None,
        }

        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)
        sid = SESSION_SITE_IDS[session]

        legislation = backoff(
            self.lservice.GetLegislationForSession,
            sid
        )['LegislationIndex']

        for leg in legislation:
            lid = leg['Id']
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument['StatusHistory'][0]]

            actions = reversed([{
                'code': x['Code'],
                'action': x['Description'],
                '_guid': x['Id'],
                'date': x['Date']
            } for x in history])

            guid = instrument['Id']

            # A little bit hacky.
            bill_prefix = instrument['DocumentType']
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = '%s %s' % (
                bill_prefix,
                instrument['Number'],
            )
            if instrument['Suffix']:
                bill_id += instrument['Suffix']

            title = instrument['Caption']
            description = instrument['Summary']

            if title is None:
                continue

            bill = Bill(
                bill_id, legislative_session=session, chamber=bill_chamber, title=title,
                classification=bill_type)
            bill.add_abstract(description, note='description')
            bill.extras = {'guid': guid}

            if instrument['Votes']:
                for vote_ in instrument['Votes']:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                    vote = VoteEvent(
                        start_date=vote_['Date'].strftime('%Y-%m-%d'),
                        motion_text=vote_['Caption'] or 'Vote on Bill',
                        chamber={'House': 'lower', 'Senate': 'upper'}[vote_['Branch']],
                        result='pass' if vote_['Yeas'] > vote_['Nays'] else 'fail',
                        classification='passage',
                        bill=bill,
                    )
                    vote.set_count('yes', vote_['Yeas'])
                    vote.set_count('no', vote_['Nays'])
                    vote.set_count('other', vote_['Excused'] + vote_['NotVoting'])

                    vote.add_source(self.vsource)

                    methods = {'Yea': 'yes', 'Nay': 'no'}

                    for vdetail in vote_['Votes'][0]:
                        whom = vdetail['Member']
                        how = vdetail['MemberVoted']
                        vote.vote(methods.get(how, 'other'), whom['Name'])

                    yield vote

            ccommittees = defaultdict(list)
            committees = instrument['Committees']
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        'House': 'lower',
                        'Senate': 'upper',
                    }[committee['Type']]].append(committee['Name'])

            for action in actions:
                action_chamber = chamber_map[action['code'][0]]

                try:
                    action_types = action_code_map[action['code']]
                except KeyError:
                    error_msg = 'Code {code} for action {action} not recognized.'.format(
                        code=action['code'], action=action['action'])

                    self.logger.warning(error_msg)

                    action_types = None

                committees = []
                if action_types and any(('committee' in x for x in action_types)):
                    committees = [str(x) for x in ccommittees.get(
                        action_chamber, [])]

                act = bill.add_action(
                    action['action'], action['date'].strftime('%Y-%m-%d'),
                    classification=action_types,
                    chamber=action_chamber)
                for committee in committees:
                    act.add_related_entity(committee, 'organization')
                act.extras = {
                    'code': action['code'],
                    'guid': action['_guid'],
                }

            sponsors = []
            if instrument['Authors']:
                sponsors = instrument['Authors']['Sponsorship']
                if 'Sponsors' in instrument and instrument['Sponsors']:
                    sponsors += instrument['Sponsors']['Sponsorship']

            sponsors = [
                (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
            ]

            for typ, sponsor in sponsors:
                name = '{First} {Last}'.format(**dict(sponsor['Name']))
                bill.add_sponsorship(
                    name,
                    entity_type='person',
                    classification='primary' if 'Author' in typ else 'secondary',
                    primary='Author' in typ,
                )

            for version in instrument['Versions']['DocumentDescription']:
                name, url, doc_id, version_id = [
                    version[x] for x in [
                        'Description',
                        'Url',
                        'Id',
                        'Version'
                    ]
                ]
                # link = bill.add_version_link(
                #     name, url, media_type='application/pdf')
                # link['extras'] = {
                #     '_internal_document_id': doc_id,
                #     '_version_id': version_id
                # }

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(SOURCE_URL.format(**{
                'session': session,
                'bid': guid,
            }))

            yield bill
Ejemplo n.º 8
0
    def scrape(self):
        unreachable_urls = []

        for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)) :
            title = leg_summary['Title'].strip()

            if not title or not leg_summary['Intro\xa0Date'] :
                continue
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
                # doesn't have an intro date

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try :
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError :
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            for related_bill in leg_details.get('Related files', []) :
                lower_title = title.lower()
                if "sundry" in title or "miscellaneous" in title: #these are ominbus
                    bill.add_related_bill(identifier = related_bill['label'],
                                          legislative_session = bill.legislative_session,
                                          relation_type='replaces')
                #for now we're skipping related bills if they
                #don't contain words that make us think they're
                #in a ominbus relationship with each other
                
            for i, sponsor in enumerate(leg_details.get('Sponsors', [])) :
                if i == 0 :
                    primary = True
                    sponsorship_type = "Primary"
                else :
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legisislation as
                # individuals role holders or as the OFfice of City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 
                                            'Mendoza, Susana')) :
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm',)) :
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                if not sponsor_name.startswith(('Misc. Transmittal',
                                                'No Sponsor',
                                                'Dept./Agency')) :
                    bill.add_sponsorship(sponsor_name, 
                                         sponsorship_type,
                                         entity_type,
                                         primary,
                                         entity_id = _make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details :
                for subject in leg_details[u'Topic'].split(',') :
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []) :
                if attachment['label'] :
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']) :
                action_description = action['Action']
                try :
                    action_date =  self.toTime(action['Date']).date().isoformat()
                except AttributeError : # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description :
                    try :
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError  :
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council' :
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=ACTION_CLASSIFICATION[action_description])

                    if action_description == 'Referred' :
                        try :
                            leg_details['Current Controlling Legislative Body']['label']
                            controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                        except TypeError :
                            controlling_bodies = leg_details['Current Controlling Legislative Body']
                        if controlling_bodies :
                            for controlling_body in controlling_bodies :
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee") :
                                    act.add_related_entity(body_name,
                                                           'organization')
                                else :
                                    act.add_related_entity(body_name,
                                                           'organization',
                                                           entity_id = _make_pseudo_id(name=body_name))


                    if 'url' in action['Action\xa0Details'] :
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        if votes and result : # see https://github.com/datamade/municipal-scrapers-us/issues/15
                            action_vote = VoteEvent(legislative_session=bill.legislative_session, 
                                               motion_text=action_description,
                                               organization={'name': responsible_org},
                                               classification=None,
                                               start_date=action_date,
                                               result=result,
                                               bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes :
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification' : leg_summary['Type']}
                            
            yield bill
        print(unreachable_urls)
Ejemplo n.º 9
0
    def scrape(self):
        for leg_summary in self.legislation(created_after=datetime.datetime(2014, 1, 1)) :
            leg_type = BILL_TYPES[leg_summary['Type']]
            
            bill = Bill(identifier=leg_summary['File\xa0#'],
                        title=leg_summary['Title'],
                        legislative_session=None,
                        classification=leg_type,
                        from_organization={"name":"New York City Council"})
            bill.add_source(leg_summary['url'])

            leg_details = self.legDetails(leg_summary['url'])
            history = self.history(leg_summary['url'])

            bill.add_title(leg_details['Name'], 
                           note='created by administrative staff')

            if 'Summary' in leg_details :
                bill.add_abstract(leg_details['Summary'], note='')

            if leg_details['Law number'] :
                bill.add_identifier(leg_details['Law number'], 
                                    note='law number')

            for sponsorship in self._sponsors(leg_details.get('Sponsors', [])) :
                sponsor, sponsorship_type, primary = sponsorship
                bill.add_sponsorship(sponsor, sponsorship_type,
                                     'person', primary, 
                                     entity_id = make_pseudo_id(name=sponsor))

            
            for attachment in leg_details.get('Attachments', []) :
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

            history = list(history)

            if history :
                earliest_action = min(self.toTime(action['Date']) 
                                      for action in history)

                bill.legislative_session = self.sessions(earliest_action)
            else :
                bill.legislative_session = str(self.SESSION_STARTS[0])

            for action in history :
                action_description = action['Action']
                if not action_description :
                    continue
                    
                action_class = ACTION_CLASSIFICATION[action_description]

                action_date = self.toDate(action['Date'])
                responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council' :
                    responsible_org = 'New York City Council'
                elif responsible_org == 'Administration' :
                    responsible_org = 'Mayor'
                   
                if responsible_org == 'Town Hall Meeting' :
                    continue
                else :
                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=action_class)

                if 'url' in action['Action\xa0Details'] :
                    action_detail_url = action['Action\xa0Details']['url']
                    if action_class == 'committee-referral' :
                        action_details = self.actionDetails(action_detail_url)
                        referred_committee = action_details['Action text'].rsplit(' to the ', 1)[-1]
                        act.add_related_entity(referred_committee,
                                               'organization',
                                               entity_id = make_pseudo_id(name=referred_committee))
                    result, votes = self.extractVotes(action_detail_url)
                    if votes :
                        action_vote = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action_description,
                                           organization={'name': responsible_org},
                                           classification=action_class,
                                           start_date=action_date,
                                           result=result,
                                           bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes :
                            action_vote.vote(option, voter)


                        yield action_vote
            
            text = self.text(leg_summary['url'])

            if text :
                bill.extras = {'local_classification' : leg_summary['Type'],
                               'full_text' : text}
            else :
                bill.extras = {'local_classification' : leg_summary['Type']}

            yield bill
Ejemplo n.º 10
0
    def scrape(self, window=30):
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        self.retry_wait_seconds = 20

        for matter in self.matters(n_days_ago):
            matter_id = matter["MatterId"]

            date = matter["MatterIntroDate"]
            title = matter["MatterTitle"]
            identifier = matter["MatterFile"]

            # If a bill has a duplicate action item that"s causing the entire scrape
            # to fail, add it to the `problem_bills` array to skip it.
            # For the time being...nothing to skip!

            problem_bills = []

            if identifier in problem_bills:
                continue

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))

            if matter["MatterTypeName"] in BILL_TYPES:
                ocd_bill_type = BILL_TYPES[matter["MatterTypeName"]]
            else:
                ocd_bill_type = None

            if identifier.startswith("S"):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=ocd_bill_type,
                        from_organization={"name": "Pittsburgh City Council"})

            legistar_web = matter["legistar_url"]
            legistar_api = "http://webapi.legistar.com/v1/pittsburgh/matters/{0}".format(matter_id)
            bill.add_source(legistar_web, note="web")
            bill.add_source(legistar_api, note="api")

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                responsible_person = action.pop("responsible person")
                act = bill.add_action(**action)

                if responsible_person:
                    act.add_related_entity(responsible_person,
                                           "person",
                                           entity_id=_make_pseudo_id(name=responsible_person))

                if action["description"] == "Referred":
                    body_name = matter["MatterBodyName"]
                    if body_name != "City Council":
                        act.add_related_entity(body_name,
                                               "organization",
                                               entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote

                if result:
                    vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                           motion_text=action["description"],
                                           organization=action["organization"],
                                           classification=None,
                                           start_date=action["date"],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + "/histories")

                    for vote in votes:
                        raw_option = vote["VoteValueName"].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option,
                                        vote["VotePersonName"].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic["MatterIndexName"].strip())

            for attachment in self.attachments(matter_id):
                if attachment["MatterAttachmentName"]:
                    bill.add_version_link(attachment["MatterAttachmentName"],
                                          attachment["MatterAttachmentHyperlink"],
                                          media_type="application/pdf")

            bill.extras = {"local_classification": matter["MatterTypeName"]}
            text = self.text(matter_id)

            if text:
                if text["MatterTextPlain"]:
                    bill.extras["plain_text"] = text["MatterTextPlain"]

                if text["MatterTextRtf"]:
                    bill.extras["rtf_text"] = text["MatterTextRtf"].replace(u"\u0000", "")

            yield bill
Ejemplo n.º 11
0
    def scrape(self) :
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago) :
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)) :
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Chicago City Council"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id) :
                act = bill.add_action(**action)

                if action['description'] == 'Referred' :
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council' :
                        act.add_related_entity(body_name,
                                               'organization',
                                               entity_id = _make_pseudo_id(name=body_name))

                result, votes = vote
                if result :
                    vote_event = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action['description'],
                                           organization=action['organization'],
                                           classification=None,
                                           start_date=action['date'],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes :
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option, 
                                        vote['VotePersonName'].strip())

                    yield vote_event


            for sponsorship in self.sponsorships(matter_id) :
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id) :
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id) :
                if attachment['MatterAttachmentName'] :
                    bill.add_version_link(attachment['MatterAttachmentName'],
                                          attachment['MatterAttachmentHyperlink'],
                                          media_type="application/pdf")

            bill.extras = {'local_classification' : matter['MatterTypeName']}

            text = self.text(matter_id)

            if text :
                if text['MatterTextPlain'] :
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf'] :
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

            yield bill
Ejemplo n.º 12
0
    def scrape(self, window=3):
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in self.matters(n_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            # There are currently no bills with duplicate action items! If a
            # bill has a duplicate action item that's causing the entire scrape
            # to fail, add it to the `problem_bills` array to skip it.

            problem_bills = []

            if identifier in problem_bills:
                continue

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            legistar_web = matter['legistar_url']

            legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
                matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                responsible_person = action.pop('responsible person')
                act = bill.add_action(**action)

                if responsible_person:
                    act.add_related_entity(
                        responsible_person,
                        'person',
                        entity_id=_make_pseudo_id(name=responsible_person))

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council':
                        act.add_related_entity(
                            body_name,
                            'organization',
                            entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_version_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Ejemplo n.º 13
0
    def scrape_events_range(self, start_date, end_date):

        def daterange(start_date, end_date):
            number_of_days = int((end_date - start_date).days)
            for n in range(number_of_days):
                yield start_date + dt.timedelta(n)

        for date in daterange(start_date, end_date):
            events = self.extract_events_by_day(date)
            for event in events:
                tz = pytz.timezone("America/Toronto")
                time = dt.datetime.strptime(event['time'], '%I:%M %p')
                start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
                source_url = CALENDAR_DAY_TEMPLATE.format(start.year, start.month, start.day)
                org_name = event['meeting']
                e = Event(
                    name = org_name,
                    start_time = start,
                    timezone = tz.zone,
                    location_name = event['location'],
                    status=STATUS_DICT.get(event['meeting_status'])
                    )
                e.add_source(source_url)
                e.extras = {
                    'meeting_number': event['no'],
                    'tmmis_meeting_id': event['meeting_id'],
                    }
                e.add_participant(
                    name = org_name,
                    type = 'organization',
                    )

                def is_agenda_available(event):
                    return event['publishing_status'] in ['Agenda Published', 'Minutes Published']

                def is_council(event):
                    return True if event['meeting'] == self.jurisdiction.name else False

                if is_agenda_available(event):
                    template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE
                    agenda_url = template.format(event['meeting_id'])
                    full_identifiers = list(self.full_identifiers(event['meeting_id'], is_council(event)))

                    e.add_source(agenda_url)
                    agenda_items = self.agenda_from_url(agenda_url)
                    for i, item in enumerate(agenda_items):

                        a = e.add_agenda_item(item['title'])
                        a.add_classification(item['type'].lower())
                        a['order'] = str(i)

                        def normalize_wards(raw):
                            if not raw: raw = 'All'
                            if raw == 'All':
                                return raw.lower()
                            else:
                                return raw.split(', ')

                        wards = normalize_wards(item['wards'])
                        identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                        [full_identifier] = [id for id in full_identifiers if identifier_regex.match(id).group(1) == item['identifier']]
                        a.add_bill(full_identifier)
                        if full_identifier not in self.seen_agenda_items:
                            b = Bill(
                                # TODO: Fix this hardcode
                                legislative_session = '2014-2018',
                                identifier = full_identifier,
                                title = item['title'],
                                from_organization = {'name': self.jurisdiction.name},
                                )
                            b.add_source(agenda_url)
                            b.add_document_link(note='canonical', media_type='text/html', url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                            b.extras = {
                                'wards': wards,
                                }

                            self.seen_agenda_items.append(full_identifier)

                            yield b

                yield e
Ejemplo n.º 14
0
    def scrape(self, session=None, chamber=None):
        bill_type_map = {
            "B": "bill",
            "R": "resolution",
            "JR": "joint resolution",
            "CR": "concurrent resolution",
        }

        chamber_map = {
            "H": "lower",
            "S": "upper",
            "J": "joint",
            "E": "legislature",  # Effective date
        }

        action_code_map = {
            "HI": None,
            "SI": None,
            "HH": None,
            "SH": None,
            "HPF": ["introduction"],
            "HDSAS": None,
            "SPF": ["introduction"],
            "HSR": ["reading-2"],
            "SSR": ["reading-2"],
            "HFR": ["reading-1"],
            "SFR": ["reading-1"],
            "HRECM": ["withdrawal", "referral-committee"],
            "SRECM": ["withdrawal", "referral-committee"],
            "SW&C": ["withdrawal", "referral-committee"],
            "HW&C": ["withdrawal", "referral-committee"],
            "HRA": ["passage"],
            "SRA": ["passage"],
            "HPA": ["passage"],
            "HRECO": None,
            "SPA": ["passage"],
            "HTABL": None,  # 'House Tabled' - what is this?
            "SDHAS": None,
            "HCFR": ["committee-passage-favorable"],
            "SCFR": ["committee-passage-favorable"],
            "HRAR": ["referral-committee"],
            "SRAR": ["referral-committee"],
            "STR": ["reading-3"],
            "SAHAS": None,
            "SE": ["passage"],
            "SR": ["referral-committee"],
            "HTRL": ["reading-3", "failure"],
            "HTR": ["reading-3"],
            "S3RLT": ["reading-3", "failure"],
            "HASAS": None,
            "S3RPP": None,
            "STAB": None,
            "SRECO": None,
            "SAPPT": None,
            "HCA": None,
            "HNOM": None,
            "HTT": None,
            "STT": None,
            "SRECP": None,
            "SCRA": None,
            "SNOM": None,
            "S2R": ["reading-2"],
            "H2R": ["reading-2"],
            "SENG": ["passage"],
            "HENG": ["passage"],
            "HPOST": None,
            "HCAP": None,
            "SDSG": ["executive-signature"],
            "SSG": ["executive-receipt"],
            "Signed Gov": ["executive-signature"],
            "HDSG": ["executive-signature"],
            "HSG": ["executive-receipt"],
            "EFF": None,
            "HRP": None,
            "STH": None,
            "HTS": None,
        }

        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        sid = SESSION_SITE_IDS[session]

        legislation = backoff(self.lservice.GetLegislationForSession,
                              sid)["LegislationIndex"]

        for leg in legislation:
            lid = leg["Id"]
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument["StatusHistory"][0]]

            actions = reversed([{
                "code": x["Code"],
                "action": x["Description"],
                "_guid": x["Id"],
                "date": x["Date"],
            } for x in history])

            guid = instrument["Id"]

            # A little bit hacky.
            bill_prefix = instrument["DocumentType"]
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = "%s %s" % (bill_prefix, instrument["Number"])
            if instrument["Suffix"]:
                bill_id += instrument["Suffix"]

            title = instrument["Caption"]
            description = instrument["Summary"]

            if title is None:
                continue

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=title,
                classification=bill_type,
            )
            bill.add_abstract(description, note="description")
            bill.extras = {"guid": guid}

            if instrument["Votes"]:
                for vote_ in instrument["Votes"]:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]["VoteId"])

                    vote = VoteEvent(
                        start_date=vote_["Date"].strftime("%Y-%m-%d"),
                        motion_text=vote_["Caption"] or "Vote on Bill",
                        chamber={
                            "House": "lower",
                            "Senate": "upper"
                        }[vote_["Branch"]],
                        result="pass"
                        if vote_["Yeas"] > vote_["Nays"] else "fail",
                        classification="passage",
                        bill=bill,
                    )
                    vote.set_count("yes", vote_["Yeas"])
                    vote.set_count("no", vote_["Nays"])
                    vote.set_count("other",
                                   vote_["Excused"] + vote_["NotVoting"])

                    vote.add_source(self.vsource)

                    methods = {"Yea": "yes", "Nay": "no"}

                    for vdetail in vote_["Votes"][0]:
                        whom = vdetail["Member"]
                        how = vdetail["MemberVoted"]
                        if whom["Name"] == "VACANT":
                            continue
                        name, district = vote_name_pattern.search(
                            whom["Name"]).groups()
                        vote.vote(methods.get(how, "other"),
                                  name,
                                  note=district)

                    yield vote

            ccommittees = defaultdict(list)
            committees = instrument["Committees"]
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        "House": "lower",
                        "Senate": "upper"
                    }[committee["Type"]]].append(committee["Name"])

            for action in actions:
                action_chamber = chamber_map[action["code"][0]]

                try:
                    action_types = action_code_map[action["code"]]
                except KeyError:
                    error_msg = "Code {code} for action {action} not recognized.".format(
                        code=action["code"], action=action["action"])

                    self.logger.warning(error_msg)

                    action_types = None

                committees = []
                if action_types and any(
                    ("committee" in x for x in action_types)):
                    committees = [
                        str(x) for x in ccommittees.get(action_chamber, [])
                    ]

                act = bill.add_action(
                    action["action"],
                    action["date"].strftime("%Y-%m-%d"),
                    classification=action_types,
                    chamber=action_chamber,
                )
                for committee in committees:
                    act.add_related_entity(committee, "organization")
                act.extras = {"code": action["code"], "guid": action["_guid"]}

            sponsors = []
            if instrument["Authors"]:
                sponsors = instrument["Authors"]["Sponsorship"]
                if "Sponsors" in instrument and instrument["Sponsors"]:
                    sponsors += instrument["Sponsors"]["Sponsorship"]

            sponsors = [(x["Type"], self.get_member(x["MemberId"]))
                        for x in sponsors]

            for typ, sponsor in sponsors:
                name = "{First} {Last}".format(**dict(sponsor["Name"]))
                bill.add_sponsorship(
                    name,
                    entity_type="person",
                    classification="primary"
                    if "Author" in typ else "secondary",
                    primary="Author" in typ,
                )

            for version in instrument["Versions"]["DocumentDescription"]:
                name, url, doc_id, version_id = [
                    version[x]
                    for x in ["Description", "Url", "Id", "Version"]
                ]
                link = bill.add_version_link(name,
                                             url,
                                             media_type="application/pdf")
                link["extras"] = {
                    "_internal_document_id": doc_id,
                    "_version_id": version_id,
                }

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(
                SOURCE_URL.format(**{
                    "session": session,
                    "bid": guid
                }))

            yield bill
Ejemplo n.º 15
0
    def scrape_chamber(self, chamber, session):
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + 'bill_status/').text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(
                bill_id,
                session,
                title,
                chamber=chamber,
                classification=btype,
            )
            bill.extras = {'status': bill_data['STATUS']}

            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if (bill_data['LONGTITLE'] and
                    bill_data['LONGTITLE'] != bill.title):
                bill.add_title(bill_data['LONGTITLE'])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data['ORIGINAL_SPONSOR']:
                bill.add_sponsorship(
                    name=primary_sponsor,
                    entity_type='organization' if "committee" in primary_sponsor.lower()
                                else 'person',
                    primary=True,
                    classification="original sponsor"
                )
            for sponsor in bill_data['SPONSOR_NAMES']:
                if sponsor in bill_data['ORIGINAL_SPONSOR']:
                    continue
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='organization' if "committee" in sponsor.lower() else 'person',
                    primary=False,
                    classification='cosponsor',
                )

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')

                date = event['session_date']
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = None
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(
                    action, date, chamber=actor, classification=atype)

            # Versions are exposed in `bill_data['versions'],
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML
            yield from self.scrape_html(bill, session)

            yield bill
Ejemplo n.º 16
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supercedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.

        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in matters:
            # Skip this bill, until Metro cleans up duplicate in Legistar API
            if matter['MatterFile'] == '2017-0447':
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            # Do not scrape private bills introduced before this timestamp.
            if self._is_restricted(matter) and (
                    date < self.START_DATE_PRIVATE_SCRAPE):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            # The Metro scraper scrapes private bills.
            # However, we do not want to capture significant data about private bills,
            # other than the value of the helper function `_is_restricted` and a last modified timestamp.
            # We yield private bills early, wipe data from previously imported once-public bills,
            # and include only data *required* by the pupa schema.
            # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
            bill.extras = {'restrict_view': self._is_restricted(matter)}

            # Add API source early.
            # Private bills should have this url for debugging.
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
            bill.add_source(legistar_api, note='api')

            if self._is_restricted(matter):
                # required fields
                bill.title = 'Restricted View'

                # wipe old data
                bill.extras['plain_text'] = ''
                bill.extras['rtf_text'] = ''
                bill.sponsorships = []
                bill.related_bills = []
                bill.versions = []
                bill.documents = []
                bill.actions = []

                yield bill
                continue

            legistar_web = matter['legistar_url']
            bill.add_source(legistar_web, note='web')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        try:
                            raw_option = vote['VoteValueName'].lower()
                        except AttributeError:
                            raw_option = None
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(
                        identifier=identifier,
                        legislative_session=related_bill_session,
                        relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'].strip(),
                        media_type="application/pdf")

            bill.extras['local_classification'] = matter['MatterTypeName']

            matter_version_value = matter['MatterVersion']
            text = self.text(matter_id, matter_version_value)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Ejemplo n.º 17
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supercedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.
        
        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in matters:
            # If this Boolean field is True, then do not scrape the Bill.
            # This issue explains why a restricted Bill might appear (unwelcome) in the Legistar API:
            # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826
            if matter['MatterRestrictViewViaWeb']:
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = matter['legistar_url']

            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(
                        identifier=identifier,
                        legislative_session=related_bill_session,
                        relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Ejemplo n.º 18
0
    def scrape_chamber(self, chamber, session):
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + 'bill_status/').text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(
                bill_id,
                session,
                title,
                chamber=chamber,
                classification=btype,
            )
            bill.extras = {'status': bill_data['STATUS']}

            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if (bill_data['LONGTITLE']
                    and bill_data['LONGTITLE'] != bill.title):
                bill.add_title(bill_data['LONGTITLE'])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data['ORIGINAL_SPONSOR']:
                bill.add_sponsorship(name=primary_sponsor,
                                     entity_type='organization' if "committee"
                                     in primary_sponsor.lower() else 'person',
                                     primary=True,
                                     classification="original sponsor")
            for sponsor in bill_data['SPONSOR_NAMES']:
                if sponsor in bill_data['ORIGINAL_SPONSOR']:
                    continue
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='organization'
                    if "committee" in sponsor.lower() else 'person',
                    primary=False,
                    classification='cosponsor',
                )

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate' else 'lower')

                date = event['session_date']
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning(
                        'unknown action code on %s: %s %s' %
                        (bill_id, event['action_code'], event['status']))
                    atype = None
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(action,
                                date,
                                chamber=actor,
                                classification=atype)

            # Versions are exposed in `bill_data['versions'],
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML
            yield from self.scrape_html(bill, session)

            yield bill
Ejemplo n.º 19
0
    def scrape(self):
        unreachable_urls = []

        for leg_summary in self.legislation(
                created_after=datetime.datetime(2015, 5, 17)):
            title = leg_summary['Title'].strip()

            if not title or not leg_summary['Intro\xa0Date']:
                continue
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
                # doesn't have an intro date

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(
                self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try:
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError:
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            for related_bill in leg_details.get('Related files', []):
                lower_title = title.lower()
                if "sundry" in title or "miscellaneous" in title:  #these are ominbus
                    bill.add_related_bill(
                        identifier=related_bill['label'],
                        legislative_session=bill.legislative_session,
                        relation_type='replaces')
                #for now we're skipping related bills if they
                #don't contain words that make us think they're
                #in a ominbus relationship with each other

            for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
                if i == 0:
                    primary = True
                    sponsorship_type = "Primary"
                else:
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legisislation as
                # individuals role holders or as the OFfice of City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm', )):
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                if not sponsor_name.startswith(
                    ('Misc. Transmittal', 'No Sponsor', 'Dept./Agency')):
                    bill.add_sponsorship(
                        sponsor_name,
                        sponsorship_type,
                        entity_type,
                        primary,
                        entity_id=_make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details:
                for subject in leg_details[u'Topic'].split(','):
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []):
                if attachment['label']:
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']):
                action_description = action['Action']
                try:
                    action_date = self.toTime(
                        action['Date']).date().isoformat()
                except AttributeError:  # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description:
                    try:
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError:
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council':
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(
                        action_description,
                        action_date,
                        organization={'name': responsible_org},
                        classification=ACTION_CLASSIFICATION[
                            action_description])

                    if action_description == 'Referred':
                        try:
                            leg_details[
                                'Current Controlling Legislative Body'][
                                    'label']
                            controlling_bodies = [
                                leg_details[
                                    'Current Controlling Legislative Body']
                            ]
                        except TypeError:
                            controlling_bodies = leg_details[
                                'Current Controlling Legislative Body']
                        if controlling_bodies:
                            for controlling_body in controlling_bodies:
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee"):
                                    act.add_related_entity(
                                        body_name, 'organization')
                                else:
                                    act.add_related_entity(
                                        body_name,
                                        'organization',
                                        entity_id=_make_pseudo_id(
                                            name=body_name))

                    if 'url' in action['Action\xa0Details']:
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        if votes and result:  # see https://github.com/datamade/municipal-scrapers-us/issues/15
                            action_vote = VoteEvent(
                                legislative_session=bill.legislative_session,
                                motion_text=action_description,
                                organization={'name': responsible_org},
                                classification=None,
                                start_date=action_date,
                                result=result,
                                bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes:
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification': leg_summary['Type']}

            yield bill
        print(unreachable_urls)
Ejemplo n.º 20
0
    def scrape_chamber(self, chamber, session):
        chamber_name = "Senate" if chamber == "upper" else "House"
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + "bill_status/").text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]
        for bill_data in bills:

            bill_id = bill_data["BILLNO"]

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if "CR" in bill_id:
                btype = "concurrent resolution"
            elif "R" in bill_id:
                btype = "resolution"
            elif "B" in bill_id:
                btype = "bill"

            title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

            # main
            bill = Bill(bill_id,
                        session,
                        title,
                        chamber=chamber,
                        classification=btype)
            bill.extras = {"status": bill_data["STATUS"]}

            bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

            if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill.title:
                bill.add_title(bill_data["LONGTITLE"])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
                bill.add_sponsorship(
                    name=primary_sponsor,
                    entity_type="organization"
                    if "committee" in primary_sponsor.lower() else "person",
                    primary=True,
                    classification="original sponsor",
                )
            for sponsor in bill_data["SPONSOR_NAMES"]:
                if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                    continue
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type="organization"
                    if "committee" in sponsor.lower() else "person",
                    primary=False,
                    classification="cosponsor",
                )

            # history is backwards
            for event in reversed(bill_data["HISTORY"]):
                actor = "upper" if event["chamber"] == "Senate" else "lower"

                date = event["session_date"]
                # append committee names if present
                if "committee_names" in event:
                    action = (event["status"] + " " +
                              " and ".join(event["committee_names"]))
                else:
                    action = event["status"]

                if event["action_code"] not in ksapi.action_codes:
                    self.warning(
                        "unknown action code on %s: %s %s" %
                        (bill_id, event["action_code"], event["status"]))
                    atype = None
                else:
                    atype = ksapi.action_codes[event["action_code"]]
                bill.add_action(action,
                                date,
                                chamber=actor,
                                classification=atype)

            # Versions are exposed in `bill_data['versions'],
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML
            yield from self.scrape_html(bill, session)

            yield bill
Ejemplo n.º 21
0
    def scrape(self):
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']

            if not all((date, title)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            bill = Bill(identifier=matter['MatterFile'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
                matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council':
                        act.add_related_entity(
                            body_name,
                            'organization',
                            entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_version_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Ejemplo n.º 22
0
    def scrape(self):
        for leg_summary in self.legislation(
                created_after=datetime.datetime(2014, 1, 1)):
            leg_type = BILL_TYPES[leg_summary['Type']]

            bill = Bill(identifier=leg_summary['File\xa0#'],
                        title=leg_summary['Title'],
                        legislative_session=None,
                        classification=leg_type,
                        from_organization={"name": "New York City Council"})
            bill.add_source(leg_summary['url'])

            leg_details = self.legDetails(leg_summary['url'])
            history = self.history(leg_summary['url'])

            bill.add_title(leg_details['Name'],
                           note='created by administrative staff')

            if 'Summary' in leg_details:
                bill.add_abstract(leg_details['Summary'], note='')

            if leg_details['Law number']:
                bill.add_identifier(leg_details['Law number'],
                                    note='law number')

            for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
                sponsor, sponsorship_type, primary = sponsorship
                bill.add_sponsorship(sponsor,
                                     sponsorship_type,
                                     'person',
                                     primary,
                                     entity_id=_make_pseudo_id(name=sponsor))

            for attachment in leg_details.get('Attachments', []):
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

            history = list(history)

            if history:
                earliest_action = min(
                    self.toTime(action['Date']) for action in history)

                bill.legislative_session = self.sessions(earliest_action)
            else:
                bill.legislative_session = str(self.SESSION_STARTS[0])

            for action in history:
                action_description = action['Action']
                if not action_description:
                    continue

                action_class = ACTION_CLASSIFICATION[action_description]

                action_date = self.toDate(action['Date'])
                responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'New York City Council'
                elif responsible_org == 'Administration':
                    responsible_org = 'Mayor'

                if responsible_org == 'Town Hall Meeting':
                    continue
                else:
                    act = bill.add_action(
                        action_description,
                        action_date,
                        organization={'name': responsible_org},
                        classification=action_class)

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    if action_class == 'committee-referral':
                        action_details = self.actionDetails(action_detail_url)
                        referred_committee = action_details[
                            'Action text'].rsplit(' to the ', 1)[-1]
                        act.add_related_entity(
                            referred_committee,
                            'organization',
                            entity_id=_make_pseudo_id(name=referred_committee))
                    result, votes = self.extractVotes(action_detail_url)
                    if votes:
                        action_vote = VoteEvent(
                            legislative_session=bill.legislative_session,
                            motion_text=action_description,
                            organization={'name': responsible_org},
                            classification=action_class,
                            start_date=action_date,
                            result=result,
                            bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

            text = self.text(leg_summary['url'])

            if text:
                bill.extras = {
                    'local_classification': leg_summary['Type'],
                    'full_text': text
                }
            else:
                bill.extras = {'local_classification': leg_summary['Type']}

            yield bill
Ejemplo n.º 23
0
    def scrape_events_range(self, start_date, end_date):
        def daterange(start_date, end_date):
            number_of_days = int((end_date - start_date).days)
            for n in range(number_of_days):
                yield start_date + dt.timedelta(n)

        for date in daterange(start_date, end_date):
            events = self.extract_events_by_day(date)
            for event in events:
                tz = pytz.timezone("America/Toronto")
                time = dt.datetime.strptime(event['time'], '%I:%M %p')
                start = tz.localize(
                    date.replace(hour=time.hour,
                                 minute=time.minute,
                                 second=0,
                                 microsecond=0))
                source_url = CALENDAR_DAY_TEMPLATE.format(
                    start.year, start.month, start.day)
                org_name = event['meeting']
                e = Event(name=org_name,
                          start_time=start,
                          timezone=tz.zone,
                          location_name=event['location'],
                          status=STATUS_DICT.get(event['meeting_status']))
                e.add_source(source_url)
                e.extras = {
                    'meeting_number': event['no'],
                    'tmmis_meeting_id': event['meeting_id'],
                }
                e.add_participant(
                    name=org_name,
                    type='organization',
                )

                def is_agenda_available(event):
                    return event['publishing_status'] in [
                        'Agenda Published', 'Minutes Published'
                    ]

                def is_council(event):
                    return True if event[
                        'meeting'] == self.jurisdiction.name else False

                if is_agenda_available(event):
                    template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(
                        event) else AGENDA_FULL_STANDARD_TEMPLATE
                    agenda_url = template.format(event['meeting_id'])
                    full_identifiers = list(
                        self.full_identifiers(event['meeting_id'],
                                              is_council(event)))

                    e.add_source(agenda_url)
                    agenda_items = self.agenda_from_url(agenda_url)
                    for i, item in enumerate(agenda_items):

                        a = e.add_agenda_item(item['title'])
                        a.add_classification(item['type'].lower())
                        a['order'] = str(i)

                        def is_vote_event(item):
                            return True if item['type'] == 'ACTION' else False

                        def normalize_wards(raw):
                            if not raw: raw = 'All'
                            if raw == 'All':
                                return raw.lower()
                            else:
                                return raw.split(', ')

                        def is_being_introduced(item, event):
                            org_name = event['meeting']
                            identifier = item['identifier']

                            # `org_code` is two-letter code for committee
                            current_org_code = self.committees_by_name.get(
                                org_name)[0]['code']
                            originating_org_code = re.search(
                                r'([A-Z]{2})[0-9]+\.[0-9]+',
                                identifier).group(1)

                            return current_org_code == originating_org_code

                        if is_vote_event(item):
                            wards = normalize_wards(item['wards'])
                            identifier_regex = re.compile(
                                r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                            [full_identifier] = [
                                id for id in full_identifiers
                                if identifier_regex.match(id).group(1) ==
                                item['identifier']
                            ]
                            a.add_bill(full_identifier)
                            if is_being_introduced(item, event):
                                b = Bill(
                                    # TODO: Fix this hardcode
                                    legislative_session='2014-2018',
                                    identifier=full_identifier,
                                    title=item['title'],
                                    from_organization={'name': org_name},
                                )
                                b.add_source(agenda_url)
                                b.add_document_link(
                                    note='canonical',
                                    media_type='text/html',
                                    url=AGENDA_ITEM_TEMPLATE.format(
                                        full_identifier))
                                b.extras = {
                                    'wards': wards,
                                }

                                yield b

                yield e