def scrape(self, window=28):
    """Yield vote events and bills for matters returned by ``self.matters``.

    Args:
        window: Number of days to look back. The cutoff handed to
            ``self.matters`` is the current UTC time minus this many days.

    Yields:
        A ``VoteEvent`` for every action that carries a vote result, then
        the ``Bill`` itself once its sources, actions, sponsors, subjects,
        related bills, links and extras have been attached.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    for matter in self.matters(cutoff):
        matter_id = matter['MatterId']
        intro_date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        file_number = matter['MatterFile']

        # A matter missing any of these cannot be turned into a bill.
        if not (intro_date and title and file_number):
            continue

        session = self.session(self.toTime(intro_date))
        classification = BILL_TYPES[matter['MatterTypeName']]

        # Identifiers with a leading 'S' are stored without it; the original
        # spelling is kept as an alternate identifier.
        other_identifiers = []
        bill_identifier = file_number
        if file_number.startswith('S'):
            other_identifiers.append(file_number)
            bill_identifier = file_number[1:]

        bill = Bill(
            identifier=bill_identifier,
            legislative_session=session,
            title=title,
            classification=classification,
            from_organization={"name": "Board of Directors"},
        )

        web_url = matter['legistar_url']
        api_url = self.BASE_URL + '/matters/{0}'.format(matter_id)
        bill.add_source(web_url, note='web')
        bill.add_source(api_url, note='api')

        for alternate in other_identifiers:
            bill.add_identifier(alternate)

        for action, vote_info in self.actions(matter_id):
            recorded_action = bill.add_action(**action)

            if action['description'] == 'Referred':
                referred_to = matter['MatterBodyName']
                recorded_action.add_related_entity(
                    referred_to,
                    'organization',
                    entity_id=_make_pseudo_id(name=referred_to),
                )

            result, ballots = vote_info
            if not result:
                continue

            vote_event = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action['description'],
                organization=action['organization'],
                classification=None,
                start_date=action['date'],
                result=result,
                bill=bill,
            )
            vote_event.add_source(web_url)
            vote_event.add_source(api_url + '/histories')

            for ballot in ballots:
                raw_option = ballot['VoteValueName'].lower()
                # Fall back to the raw value when no mapping is defined.
                option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                vote_event.vote(option, ballot['VotePersonName'].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            # Fetch the JSON for the related matter so that its 'MatterFile'
            # (identifier) and 'MatterIntroDate' (used to determine its
            # legislative session) are available. Sometimes the related
            # matter does not exist yet; the request then fails and the
            # relation is skipped.
            try:
                related = self.endpoint('/matters/{0}',
                                        relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue

            related_session = self.session(
                self.toTime(related['MatterIntroDate']))
            # The allowed relation types are listed at
            # https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
            # Metro simply treats these as related files, which suggests
            # 'companion' as the relation type.
            bill.add_related_bill(
                identifier=related['MatterFile'],
                legislative_session=related_session,
                relation_type='companion',
            )

        board_report_url = 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id)
        bill.add_version_link('Board Report',
                              board_report_url,
                              media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_document_link(
                    attachment_name,
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf",
                )

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            plain_text = text['MatterTextPlain']
            if plain_text:
                bill.extras['plain_text'] = plain_text

            rtf_text = text['MatterTextRtf']
            if rtf_text:
                # NUL characters are removed from the RTF before storing it.
                bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

        yield bill
def scrape_chamber(self, chamber, session):
    """Yield bills, and whatever ``scrape_html`` yields, for one chamber.

    The full bill list is fetched from ``ksapi.url + 'bill_status/'`` and
    filtered down to bills whose number starts with the chamber's letter.

    Args:
        chamber: ``'upper'`` selects Senate bills; any other value selects
            House bills.
        session: Session passed through to ``Bill`` and ``scrape_html``.

    Yields:
        The items produced by ``self.scrape_html(bill, session)`` followed
        by the ``Bill`` itself, for each matching bill. When fetching the
        HTML raises ``scrapelib.HTTPError`` a warning is logged and the bill
        is still yielded.
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]

    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + 'bill_status/').text
    bills = json.loads(bill_request)['content']

    for bill_data in bills:
        bill_id = bill_data['BILLNO']

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        # 'CR' must be tested before 'R' because it contains it.
        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'
        else:
            # Without this branch the type would be undefined for the first
            # bill or silently inherited from the previous one.
            self.warning(
                'unknown bill type for %s, classifying as bill' % bill_id)
            btype = 'bill'

        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}

        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

        if (bill_data['LONGTITLE'] and
                bill_data['LONGTITLE'] != bill.title):
            bill.add_title(bill_data['LONGTITLE'])

        # A lone sponsor is the primary sponsor; otherwise every listed name
        # is recorded as a cosponsor.
        sponsor_names = bill_data['SPONSOR_NAMES']
        stype = 'primary' if len(sponsor_names) == 1 else 'cosponsor'
        for sponsor in sponsor_names:
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    primary=stype == 'primary',
                    classification=stype,
                )

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = 'upper' if event['chamber'] == 'Senate' else 'lower'

            date = datetime.datetime.strptime(event['occurred_datetime'],
                                              "%Y-%m-%dT%H:%M:%S")

            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']

            if event['action_code'] not in ksapi.action_codes:
                self.warning(
                    'unknown action code on %s: %s %s' %
                    (bill_id, event['action_code'], event['status']))
                atype = None
            else:
                atype = ksapi.action_codes[event['action_code']]

            bill.add_action(action,
                            date.strftime('%Y-%m-%d'),
                            chamber=actor,
                            classification=atype)

        try:
            yield from self.scrape_html(bill, session)
        except scrapelib.HTTPError:
            # Report the id taken from the API payload rather than
            # subscripting the bill object, which is only used through
            # attributes and methods in this function.
            self.warning(
                'unable to fetch HTML for bill {0}'.format(bill_id))

        yield bill
def scrape(self, window=3):
    """Yield vote events and bills for matters returned by ``self.matters``.

    Args:
        window: Number of days to look back. The cutoff handed to
            ``self.matters`` is the current UTC time minus this many days.

    Yields:
        A ``VoteEvent`` for every action that carries a vote result, then
        the ``Bill`` itself once all of its details have been attached.
    """
    # Temporarily, we should not scrape or import these bills:
    # https://chicago.legistar.com/LegislationDetail.aspx?ID=3291304&GUID=72ACF5FE-0803-46E8-90B4-604119803293
    # They have duplicate action items, which cause the entire scrape
    # to fail. The Chicago clerk's office should fix it in the near
    # future, after which we can remove this code.
    skipped_identifiers = ['CL2017-1281']

    cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    for matter in self.matters(cutoff):
        matter_id = matter['MatterId']
        intro_date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        file_number = matter['MatterFile']

        if file_number in skipped_identifiers:
            continue

        # A matter missing any of these cannot be turned into a bill.
        if not (intro_date and title and file_number):
            continue

        session = self.session(self.toTime(intro_date))
        classification = BILL_TYPES[matter['MatterTypeName']]

        # Identifiers with a leading 'S' are stored without it; the original
        # spelling is kept as an alternate identifier.
        other_identifiers = []
        bill_identifier = file_number
        if file_number.startswith('S'):
            other_identifiers.append(file_number)
            bill_identifier = file_number[1:]

        bill = Bill(
            identifier=bill_identifier,
            legislative_session=session,
            title=title,
            classification=classification,
            from_organization={"name": "Chicago City Council"},
        )

        web_url = matter['legistar_url']
        api_url = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
            matter_id)
        bill.add_source(web_url, note='web')
        bill.add_source(api_url, note='api')

        for alternate in other_identifiers:
            bill.add_identifier(alternate)

        for action, vote_info in self.actions(matter_id):
            # 'responsible person' is removed first because the remaining
            # keys are passed straight to add_action as keyword arguments.
            person = action.pop('responsible person')
            recorded_action = bill.add_action(**action)

            if person:
                recorded_action.add_related_entity(
                    person,
                    'person',
                    entity_id=_make_pseudo_id(name=person),
                )

            if action['description'] == 'Referred':
                referred_to = matter['MatterBodyName']
                if referred_to != 'City Council':
                    recorded_action.add_related_entity(
                        referred_to,
                        'organization',
                        entity_id=_make_pseudo_id(name=referred_to),
                    )

            result, ballots = vote_info
            if not result:
                continue

            vote_event = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action['description'],
                organization=action['organization'],
                classification=None,
                start_date=action['date'],
                result=result,
                bill=bill,
            )
            vote_event.add_source(web_url)
            vote_event.add_source(api_url + '/histories')

            for ballot in ballots:
                raw_option = ballot['VoteValueName'].lower()
                # Fall back to the raw value when no mapping is defined.
                option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                vote_event.vote(option, ballot['VotePersonName'].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_version_link(
                    attachment_name,
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf",
                )

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            plain_text = text['MatterTextPlain']
            if plain_text:
                bill.extras['plain_text'] = plain_text

            rtf_text = text['MatterTextRtf']
            if rtf_text:
                # NUL characters are removed from the RTF before storing it.
                bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

        yield bill
def scrape(self, session=None, chamber=None):
    """Yield vote events and bills for every instrument in a session.

    Args:
        session: Session key looked up in ``SESSION_SITE_IDS``. When falsy,
            ``self.latest_session()`` is used instead.
        chamber: Accepted but not used by this method.

    Yields:
        A ``VoteEvent`` for each vote listed on an instrument, then the
        ``Bill`` built from that instrument. Instruments whose caption is
        ``None`` are skipped.
    """
    bill_type_map = {
        'B': 'bill',
        'R': 'resolution',
        'JR': 'joint resolution',
        'CR': 'concurrent resolution',
    }

    chamber_map = {
        'H': 'lower',
        'S': 'upper',
        'J': 'joint',
        'E': 'legislature',  # Effective date
    }

    branch_map = {'House': 'lower', 'Senate': 'upper'}

    action_code_map = {
        'HI': None,
        'SI': None,
        'HH': None,
        'SH': None,
        'HPF': ['introduction'],
        'HDSAS': None,
        'SPF': ['introduction'],
        'HSR': ['reading-2'],
        'SSR': ['reading-2'],
        'HFR': ['reading-1'],
        'SFR': ['reading-1'],
        'HRECM': ['withdrawal', 'referral-committee'],
        'SRECM': ['withdrawal', 'referral-committee'],
        'SW&C': ['withdrawal', 'referral-committee'],
        'HW&C': ['withdrawal', 'referral-committee'],
        'HRA': ['passage'],
        'SRA': ['passage'],
        'HPA': ['passage'],
        'HRECO': None,
        'SPA': ['passage'],
        'HTABL': None,  # 'House Tabled' - what is this?
        'SDHAS': None,
        'HCFR': ['committee-passage-favorable'],
        'SCFR': ['committee-passage-favorable'],
        'HRAR': ['referral-committee'],
        'SRAR': ['referral-committee'],
        'STR': ['reading-3'],
        'SAHAS': None,
        'SE': ['passage'],
        'SR': ['referral-committee'],
        'HTRL': ['reading-3', 'failure'],
        'HTR': ['reading-3'],
        'S3RLT': ['reading-3', 'failure'],
        'HASAS': None,
        'S3RPP': None,
        'STAB': None,
        'SRECO': None,
        'SAPPT': None,
        'HCA': None,
        'HNOM': None,
        'HTT': None,
        'STT': None,
        'SRECP': None,
        'SCRA': None,
        'SNOM': None,
        'S2R': ['reading-2'],
        'H2R': ['reading-2'],
        'SENG': ['passage'],
        'HENG': ['passage'],
        'HPOST': None,
        'HCAP': None,
        'SDSG': ['executive-signature'],
        'SSG': ['executive-receipt'],
        'Signed Gov': ['executive-signature'],
        'HDSG': ['executive-signature'],
        'HSG': ['executive-receipt'],
        'EFF': None,
        'HRP': None,
        'STH': None,
        'HTS': None,
    }

    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)
    sid = SESSION_SITE_IDS[session]

    legislation = backoff(
        self.lservice.GetLegislationForSession, sid)['LegislationIndex']

    for leg in legislation:
        lid = leg['Id']
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = list(instrument['StatusHistory'][0])

        # The status history is reversed before being recorded as actions.
        actions = reversed([{
            'code': x['Code'],
            'action': x['Description'],
            '_guid': x['Id'],
            'date': x['Date']
        } for x in history])

        guid = instrument['Id']

        # A little bit hacky: the first letter of the document type selects
        # the chamber and the remainder selects the bill type.
        bill_prefix = instrument['DocumentType']
        bill_chamber = chamber_map[bill_prefix[0]]
        bill_type = bill_type_map[bill_prefix[1:]]

        bill_id = '%s %s' % (bill_prefix, instrument['Number'])
        if instrument['Suffix']:
            bill_id += instrument['Suffix']

        title = instrument['Caption']
        description = instrument['Summary']

        if title is None:
            continue

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=bill_chamber,
                    title=title,
                    classification=bill_type)
        bill.add_abstract(description, note='description')
        bill.extras = {'guid': guid}

        if instrument['Votes']:
            for _, vote_refs in instrument['Votes']:
                vote_ = backoff(self.vservice.GetVote,
                                vote_refs[0]['VoteId'])

                vote = VoteEvent(
                    start_date=vote_['Date'].strftime('%Y-%m-%d'),
                    motion_text=vote_['Caption'] or 'Vote on Bill',
                    chamber=branch_map[vote_['Branch']],
                    result='pass'
                    if vote_['Yeas'] > vote_['Nays'] else 'fail',
                    classification='passage',
                    bill=bill,
                )
                vote.set_count('yes', vote_['Yeas'])
                vote.set_count('no', vote_['Nays'])
                vote.set_count('other',
                               vote_['Excused'] + vote_['NotVoting'])

                vote.add_source(self.vsource)

                # Anything other than an explicit Yea/Nay counts as 'other'.
                methods = {'Yea': 'yes', 'Nay': 'no'}

                for vdetail in vote_['Votes'][0]:
                    whom = vdetail['Member']
                    how = vdetail['MemberVoted']
                    vote.vote(methods.get(how, 'other'), whom['Name'])

                yield vote

        # Committee names grouped by chamber, used to tag committee actions.
        ccommittees = defaultdict(list)
        committees = instrument['Committees']
        if committees:
            for committee in committees[0]:
                ccommittees[branch_map[committee['Type']]].append(
                    committee['Name'])

        for action in actions:
            action_chamber = chamber_map[action['code'][0]]

            try:
                action_types = action_code_map[action['code']]
            except KeyError:
                error_msg = 'Code {code} for action {action} not recognized.'.format(
                    code=action['code'], action=action['action'])
                self.logger.warning(error_msg)
                action_types = None

            committees = []
            if action_types and any('committee' in x for x in action_types):
                committees = [
                    str(x) for x in ccommittees.get(action_chamber, [])
                ]

            act = bill.add_action(action['action'],
                                  action['date'].strftime('%Y-%m-%d'),
                                  classification=action_types,
                                  chamber=action_chamber)
            for committee in committees:
                act.add_related_entity(committee, 'organization')
            act.extras = {
                'code': action['code'],
                'guid': action['_guid'],
            }

        # Copy the author list so that appending the sponsors below does not
        # extend the list stored on the instrument itself.
        sponsors = []
        if instrument['Authors']:
            sponsors = list(instrument['Authors']['Sponsorship'])
        if 'Sponsors' in instrument and instrument['Sponsors']:
            sponsors.extend(instrument['Sponsors']['Sponsorship'])

        sponsors = [(x['Type'], self.get_member(x['MemberId']))
                    for x in sponsors]

        for typ, sponsor in sponsors:
            name = '{First} {Last}'.format(**dict(sponsor['Name']))
            is_author = 'Author' in typ
            bill.add_sponsorship(
                name,
                entity_type='person',
                classification='primary' if is_author else 'secondary',
                primary=is_author,
            )

        for version in instrument['Versions']['DocumentDescription']:
            name, url, doc_id, version_id = [
                version[x] for x in ['Description', 'Url', 'Id', 'Version']
            ]
            link = bill.add_version_link(name,
                                         url,
                                         media_type='application/pdf')
            link['extras'] = {
                '_internal_document_id': doc_id,
                '_version_id': version_id
            }

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(SOURCE_URL.format(session=session, bid=guid))

        yield bill
def scrape(self, window=3):
    """Yield vote events and bills for matters returned by ``self.matters``.

    Args:
        window: Number of days to look back. The cutoff handed to
            ``self.matters`` is the current local time minus this many days.
            Defaults to 3.

    Yields:
        A ``VoteEvent`` for every action that carries a vote result, then
        the ``Bill`` itself once all of its details have been attached.
    """
    # NOTE(review): this cutoff uses local time (now()); confirm whether
    # self.matters expects UTC before switching to utcnow().
    cutoff = datetime.datetime.now() - datetime.timedelta(float(window))

    for matter in self.matters(cutoff):
        matter_id = matter['MatterId']
        intro_date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        file_number = matter['MatterFile']

        # A matter missing any of these cannot be turned into a bill.
        if not all((intro_date, title, file_number)):
            continue

        bill_session = self.session(self.toTime(intro_date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # Identifiers with a leading 'S' are stored without it; the original
        # spelling is kept as an alternate identifier.
        if file_number.startswith('S'):
            alternate_identifiers = [file_number]
            bill_identifier = file_number[1:]
        else:
            alternate_identifiers = []
            bill_identifier = file_number

        bill = Bill(identifier=bill_identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        legistar_web = self.legislation_detail_url(matter_id)
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for alternate in alternate_identifiers:
            bill.add_identifier(alternate)

        for action, vote_info in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, ballots = vote_info
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for ballot in ballots:
                    raw_option = ballot['VoteValueName'].lower()
                    # Fall back to the raw value when no mapping is defined.
                    clean_option = self.VOTE_OPTIONS.get(
                        raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    ballot['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                # NUL characters are removed from the RTF before storing it.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape_chamber(self, chamber, session):
    """Yield bills, and whatever ``scrape_html`` yields, for one chamber.

    The full bill list is fetched from ``ksapi.url + 'bill_status/'`` and
    filtered down to bills whose number starts with the chamber's letter.

    Args:
        chamber: ``'upper'`` selects Senate bills; any other value selects
            House bills.
        session: Session passed through to ``Bill`` and ``scrape_html``.

    Yields:
        The items produced by ``self.scrape_html(bill, session)`` followed
        by the ``Bill`` itself, for each matching bill. When fetching the
        HTML raises ``scrapelib.HTTPError`` a warning is logged and the bill
        is still yielded.
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]

    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + 'bill_status/').text
    bills = json.loads(bill_request)['content']

    for bill_data in bills:
        bill_id = bill_data['BILLNO']

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        # 'CR' must be tested before 'R' because it contains it.
        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'
        else:
            # Without this branch the type would be undefined for the first
            # bill or silently inherited from the previous one.
            self.warning(
                'unknown bill type for %s, classifying as bill' % bill_id)
            btype = 'bill'

        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}

        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

        if (bill_data['LONGTITLE'] and
                bill_data['LONGTITLE'] != bill.title):
            bill.add_title(bill_data['LONGTITLE'])

        # A lone sponsor is the primary sponsor; otherwise every listed name
        # is recorded as a cosponsor.
        sponsor_names = bill_data['SPONSOR_NAMES']
        stype = 'primary' if len(sponsor_names) == 1 else 'cosponsor'
        for sponsor in sponsor_names:
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    primary=stype == 'primary',
                    classification=stype,
                )

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = 'upper' if event['chamber'] == 'Senate' else 'lower'

            date = datetime.datetime.strptime(event['occurred_datetime'],
                                              "%Y-%m-%dT%H:%M:%S")

            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']

            if event['action_code'] not in ksapi.action_codes:
                self.warning('unknown action code on %s: %s %s' %
                             (bill_id, event['action_code'],
                              event['status']))
                atype = None
            else:
                atype = ksapi.action_codes[event['action_code']]

            bill.add_action(
                action,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=atype)

        try:
            yield from self.scrape_html(bill, session)
        except scrapelib.HTTPError:
            # Report the id taken from the API payload rather than
            # subscripting the bill object, which is only used through
            # attributes and methods in this function.
            self.warning(
                'unable to fetch HTML for bill {0}'.format(bill_id))

        yield bill
def scrape(self, session=None, chamber=None):
    """Yield vote events and bills for every instrument in a session.

    Args:
        session: Session key looked up in ``SESSION_SITE_IDS``. When falsy,
            ``self.latest_session()`` is used instead.
        chamber: Accepted but not used by this method.

    Yields:
        A ``VoteEvent`` for each vote listed on an instrument, then the
        ``Bill`` built from that instrument. Instruments whose caption is
        ``None`` are skipped. No version links are attached to the bill.
    """
    bill_type_map = {
        'B': 'bill',
        'R': 'resolution',
        'JR': 'joint resolution',
        'CR': 'concurrent resolution',
    }

    chamber_map = {
        'H': 'lower',
        'S': 'upper',
        'J': 'joint',
        'E': 'legislature',  # Effective date
    }

    branch_map = {'House': 'lower', 'Senate': 'upper'}

    action_code_map = {
        'HI': None,
        'SI': None,
        'HH': None,
        'SH': None,
        'HPF': ['introduction'],
        'HDSAS': None,
        'SPF': ['introduction'],
        'HSR': ['reading-2'],
        'SSR': ['reading-2'],
        'HFR': ['reading-1'],
        'SFR': ['reading-1'],
        'HRECM': ['withdrawal', 'referral-committee'],
        'SRECM': ['withdrawal', 'referral-committee'],
        'SW&C': ['withdrawal', 'referral-committee'],
        'HW&C': ['withdrawal', 'referral-committee'],
        'HRA': ['passage'],
        'SRA': ['passage'],
        'HPA': ['passage'],
        'HRECO': None,
        'SPA': ['passage'],
        'HTABL': None,  # 'House Tabled' - what is this?
        'SDHAS': None,
        'HCFR': ['committee-passage-favorable'],
        'SCFR': ['committee-passage-favorable'],
        'HRAR': ['referral-committee'],
        'SRAR': ['referral-committee'],
        'STR': ['reading-3'],
        'SAHAS': None,
        'SE': ['passage'],
        'SR': ['referral-committee'],
        'HTRL': ['reading-3', 'failure'],
        'HTR': ['reading-3'],
        'S3RLT': ['reading-3', 'failure'],
        'HASAS': None,
        'S3RPP': None,
        'STAB': None,
        'SRECO': None,
        'SAPPT': None,
        'HCA': None,
        'HNOM': None,
        'HTT': None,
        'STT': None,
        'SRECP': None,
        'SCRA': None,
        'SNOM': None,
        'S2R': ['reading-2'],
        'H2R': ['reading-2'],
        'SENG': ['passage'],
        'HENG': ['passage'],
        'HPOST': None,
        'HCAP': None,
        'SDSG': ['executive-signature'],
        'SSG': ['executive-receipt'],
        'Signed Gov': ['executive-signature'],
        'HDSG': ['executive-signature'],
        'HSG': ['executive-receipt'],
        'EFF': None,
        'HRP': None,
        'STH': None,
        'HTS': None,
    }

    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)
    sid = SESSION_SITE_IDS[session]

    legislation = backoff(
        self.lservice.GetLegislationForSession,
        sid
    )['LegislationIndex']

    for leg in legislation:
        lid = leg['Id']
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = list(instrument['StatusHistory'][0])

        # The status history is reversed before being recorded as actions.
        actions = reversed([{
            'code': x['Code'],
            'action': x['Description'],
            '_guid': x['Id'],
            'date': x['Date']
        } for x in history])

        guid = instrument['Id']

        # A little bit hacky: the first letter of the document type selects
        # the chamber and the remainder selects the bill type.
        bill_prefix = instrument['DocumentType']
        bill_chamber = chamber_map[bill_prefix[0]]
        bill_type = bill_type_map[bill_prefix[1:]]

        bill_id = '%s %s' % (bill_prefix, instrument['Number'])
        if instrument['Suffix']:
            bill_id += instrument['Suffix']

        title = instrument['Caption']
        description = instrument['Summary']

        if title is None:
            continue

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title,
            classification=bill_type)
        bill.add_abstract(description, note='description')
        bill.extras = {'guid': guid}

        if instrument['Votes']:
            for _, vote_refs in instrument['Votes']:
                vote_ = backoff(self.vservice.GetVote,
                                vote_refs[0]['VoteId'])

                vote = VoteEvent(
                    start_date=vote_['Date'].strftime('%Y-%m-%d'),
                    motion_text=vote_['Caption'] or 'Vote on Bill',
                    chamber=branch_map[vote_['Branch']],
                    result='pass'
                    if vote_['Yeas'] > vote_['Nays'] else 'fail',
                    classification='passage',
                    bill=bill,
                )
                vote.set_count('yes', vote_['Yeas'])
                vote.set_count('no', vote_['Nays'])
                vote.set_count('other',
                               vote_['Excused'] + vote_['NotVoting'])

                vote.add_source(self.vsource)

                # Anything other than an explicit Yea/Nay counts as 'other'.
                methods = {'Yea': 'yes', 'Nay': 'no'}

                for vdetail in vote_['Votes'][0]:
                    whom = vdetail['Member']
                    how = vdetail['MemberVoted']
                    vote.vote(methods.get(how, 'other'), whom['Name'])

                yield vote

        # Committee names grouped by chamber, used to tag committee actions.
        ccommittees = defaultdict(list)
        committees = instrument['Committees']
        if committees:
            for committee in committees[0]:
                ccommittees[branch_map[committee['Type']]].append(
                    committee['Name'])

        for action in actions:
            action_chamber = chamber_map[action['code'][0]]

            try:
                action_types = action_code_map[action['code']]
            except KeyError:
                error_msg = 'Code {code} for action {action} not recognized.'.format(
                    code=action['code'], action=action['action'])
                self.logger.warning(error_msg)
                action_types = None

            committees = []
            if action_types and any('committee' in x for x in action_types):
                committees = [str(x) for x in ccommittees.get(
                    action_chamber, [])]

            act = bill.add_action(
                action['action'], action['date'].strftime('%Y-%m-%d'),
                classification=action_types,
                chamber=action_chamber)
            for committee in committees:
                act.add_related_entity(committee, 'organization')
            act.extras = {
                'code': action['code'],
                'guid': action['_guid'],
            }

        # Copy the author list so that appending the sponsors below does not
        # extend the list stored on the instrument itself.
        sponsors = []
        if instrument['Authors']:
            sponsors = list(instrument['Authors']['Sponsorship'])
        if 'Sponsors' in instrument and instrument['Sponsors']:
            sponsors.extend(instrument['Sponsors']['Sponsorship'])

        sponsors = [
            (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
        ]

        for typ, sponsor in sponsors:
            name = '{First} {Last}'.format(**dict(sponsor['Name']))
            is_author = 'Author' in typ
            bill.add_sponsorship(
                name,
                entity_type='person',
                classification='primary' if is_author else 'secondary',
                primary=is_author,
            )

        # NOTE(review): version links are deliberately not attached. The
        # call to bill.add_version_link for each entry of
        # instrument['Versions']['DocumentDescription'] (storing its 'Id'
        # and 'Version' as link extras) was disabled; the reason is not
        # recorded here, so confirm it before re-enabling.

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(SOURCE_URL.format(session=session, bid=guid))

        yield bill
def scrape(self):
    """Yield vote events and bills for legislation created after 2015-05-17.

    Summaries come from ``self.legislation``; details and history for each
    one are fetched with ``self.legDetails`` and ``self.history``.

    Yields:
        A ``VoteEvent`` for each action whose detail page yields both a
        result and votes, then the ``Bill``. When ``self.legDetails`` raises
        ``IndexError`` the bill is yielded with only its source attached and
        its URL is collected; the collected URLs are printed at the end.
    """
    unreachable_urls = []

    for leg_summary in self.legislation(
            created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()

        # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
        # doesn't have an intro date
        if not title or not leg_summary['Intro\xa0Date']:
            continue

        bill_type = BILL_TYPES[leg_summary['Type']]
        bill_session = self.session(
            self.toTime(leg_summary['Intro\xa0Date']))

        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        bill.add_source(leg_summary['url'])

        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue

        # Related bills are only recorded when the title suggests an omnibus
        # relationship; for now all other related files are skipped. The
        # comparison uses the lower-cased title so capitalised titles match.
        lower_title = title.lower()
        if "sundry" in lower_title or "miscellaneous" in lower_title:
            for related_bill in leg_details.get('Related files', []):
                bill.add_related_bill(
                    identifier=related_bill['label'],
                    legislative_session=bill.legislative_session,
                    relation_type='replaces')

        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            # The first listed sponsor is treated as the primary sponsor.
            primary = i == 0
            sponsorship_type = "Primary" if primary else "Regular"

            sponsor_name = sponsor['label']

            # Does the Mayor/Clerk introduce legislation as individual role
            # holders or as the Office of the City Clerk and the Office of
            # the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm',)):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'

            if not sponsor_name.startswith(('Misc. Transmittal',
                                            'No Sponsor',
                                            'Dept./Agency')):
                bill.add_sponsorship(
                    sponsor_name,
                    sponsorship_type,
                    entity_type,
                    primary,
                    entity_id=_make_pseudo_id(name=sponsor_name))

        if 'Topic' in leg_details:
            for subject in leg_details[u'Topic'].split(','):
                # Splitting on ',' leaves surrounding whitespace behind.
                subject = subject.strip()
                if subject:
                    bill.add_subject(subject)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")

        for action in self.history(leg_summary['url']):
            action_description = action['Action']

            try:
                action_date = self.toTime(action['Date']).date().isoformat()
            except AttributeError:
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue

            # Everything below needs a description (motion text, action
            # classification), so rows without one are skipped entirely.
            if not action_description:
                continue

            # 'Action By' is either a mapping with a 'label' or a bare value.
            try:
                responsible_org = action['Action\xa0By']['label']
            except TypeError:
                responsible_org = action['Action\xa0By']
            if responsible_org == 'City Council':
                responsible_org = 'Chicago City Council'

            act = bill.add_action(
                action_description,
                action_date,
                organization={'name': responsible_org},
                classification=ACTION_CLASSIFICATION[action_description])

            if action_description == 'Referred':
                # The controlling body is either a single entry or a
                # sequence of entries; normalise to a sequence.
                try:
                    leg_details['Current Controlling Legislative Body']['label']
                    controlling_bodies = [
                        leg_details['Current Controlling Legislative Body']]
                except TypeError:
                    controlling_bodies = leg_details[
                        'Current Controlling Legislative Body']

                if controlling_bodies:
                    for controlling_body in controlling_bodies:
                        body_name = controlling_body['label']
                        if body_name.startswith("Joint Committee"):
                            act.add_related_entity(body_name, 'organization')
                        else:
                            act.add_related_entity(
                                body_name,
                                'organization',
                                entity_id=_make_pseudo_id(name=body_name))

            if 'url' in action['Action\xa0Details']:
                action_detail_url = action['Action\xa0Details']['url']
                result, votes = self.extractVotes(action_detail_url)

                if votes and result:
                    # see https://github.com/datamade/municipal-scrapers-us/issues/15
                    action_vote = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action_description,
                        organization={'name': responsible_org},
                        classification=None,
                        start_date=action_date,
                        result=result,
                        bill=bill)
                    action_vote.add_source(action_detail_url)

                    for option, voter in votes:
                        action_vote.vote(option, voter)

                    yield action_vote

        bill.extras = {'local_classification': leg_summary['Type']}

        yield bill

    print(unreachable_urls)
def scrape(self):
    """Yield vote events and bills for legislation created after 2014-01-01.

    Summaries come from ``self.legislation``; details, history and full
    text for each one are fetched with ``self.legDetails``,
    ``self.history`` and ``self.text``.

    Yields:
        A ``VoteEvent`` for each action whose detail page yields votes, then
        the ``Bill`` itself.
    """
    start = datetime.datetime(2014, 1, 1)

    for leg_summary in self.legislation(created_after=start):
        leg_type = BILL_TYPES[leg_summary['Type']]

        # The session is not known yet; it is filled in below from the
        # earliest action in the bill's history.
        bill = Bill(
            identifier=leg_summary['File\xa0#'],
            title=leg_summary['Title'],
            legislative_session=None,
            classification=leg_type,
            from_organization={"name": "New York City Council"},
        )
        bill.add_source(leg_summary['url'])

        leg_details = self.legDetails(leg_summary['url'])
        history_source = self.history(leg_summary['url'])

        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')

        if 'Summary' in leg_details:
            bill.add_abstract(leg_details['Summary'], note='')

        law_number = leg_details['Law number']
        if law_number:
            bill.add_identifier(law_number, note='law number')

        sponsors = self._sponsors(leg_details.get('Sponsors', []))
        for sponsor, sponsorship_type, primary in sponsors:
            bill.add_sponsorship(
                sponsor,
                sponsorship_type,
                'person',
                primary,
                entity_id=make_pseudo_id(name=sponsor),
            )

        for attachment in leg_details.get('Attachments', []):
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")

        # Materialise the history: it is walked twice below.
        history = list(history_source)

        if history:
            earliest_action = min(
                self.toTime(entry['Date']) for entry in history)
            bill.legislative_session = self.sessions(earliest_action)
        else:
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for action in history:
            action_description = action['Action']
            if not action_description:
                continue

            action_class = ACTION_CLASSIFICATION[action_description]
            action_date = self.toDate(action['Date'])

            responsible_org = action['Action\xa0By']
            if responsible_org == 'City Council':
                responsible_org = 'New York City Council'
            elif responsible_org == 'Administration':
                responsible_org = 'Mayor'

            # Town hall meetings are not recorded as bill actions.
            if responsible_org == 'Town Hall Meeting':
                continue

            act = bill.add_action(
                action_description,
                action_date,
                organization={'name': responsible_org},
                classification=action_class,
            )

            if 'url' not in action['Action\xa0Details']:
                continue

            action_detail_url = action['Action\xa0Details']['url']

            if action_class == 'committee-referral':
                action_details = self.actionDetails(action_detail_url)
                # The committee name is whatever follows the last
                # ' to the ' in the action text.
                action_text = action_details['Action text']
                referred_committee = action_text.rsplit(' to the ', 1)[-1]
                act.add_related_entity(
                    referred_committee,
                    'organization',
                    entity_id=make_pseudo_id(name=referred_committee),
                )

            result, votes = self.extractVotes(action_detail_url)
            if not votes:
                continue

            action_vote = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action_description,
                organization={'name': responsible_org},
                classification=action_class,
                start_date=action_date,
                result=result,
                bill=bill,
            )
            action_vote.add_source(action_detail_url)

            for option, voter in votes:
                action_vote.vote(option, voter)

            yield action_vote

        text = self.text(leg_summary['url'])

        extras = {'local_classification': leg_summary['Type']}
        if text:
            extras['full_text'] = text
        bill.extras = extras

        yield bill
def scrape(self, window=30):
    """Yield bills and their vote events for recently changed Pittsburgh matters.

    ``self.matters`` is queried with a cutoff ``window`` days before the
    current UTC time.  Matters lacking an intro date, title or file number
    are skipped.  For each remaining matter a ``Bill`` is built with its
    sources, actions, sponsorships, subjects, attachments and text; every
    ``VoteEvent`` for a bill is yielded before the bill itself.

    :window (numeric) - number of days to look back; fractions are accepted
    because the value is passed through ``float``.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    self.retry_wait_seconds = 20

    # If a bill has a duplicate action item that's causing the entire scrape
    # to fail, add its identifier to this list to skip it.
    # For the time being...nothing to skip!
    problem_bills = []

    for matter in self.matters(cutoff):
        matter_id = matter["MatterId"]
        intro_date = matter["MatterIntroDate"]
        title = matter["MatterTitle"]
        file_number = matter["MatterFile"]

        if file_number in problem_bills:
            continue
        if not (intro_date and title and file_number):
            continue

        session = self.session(self.toTime(intro_date))

        # Unknown local types are kept, but without an OCD classification.
        type_name = matter["MatterTypeName"]
        ocd_bill_type = BILL_TYPES[type_name] if type_name in BILL_TYPES else None

        # A leading "S" is dropped from the primary identifier; the original
        # spelling is kept as an alternate identifier.
        aliases = []
        if file_number.startswith("S"):
            aliases.append(file_number)
            file_number = file_number[1:]

        bill = Bill(identifier=file_number,
                    legislative_session=session,
                    title=title,
                    classification=ocd_bill_type,
                    from_organization={"name": "Pittsburgh City Council"})

        web_url = matter["legistar_url"]
        api_url = "http://webapi.legistar.com/v1/pittsburgh/matters/{0}".format(matter_id)
        bill.add_source(web_url, note="web")
        bill.add_source(api_url, note="api")

        for alias in aliases:
            bill.add_identifier(alias)

        for action, outcome in self.actions(matter_id):
            # "responsible person" is not an add_action argument, so it is
            # removed before the dict is splatted.
            person = action.pop("responsible person")
            act = bill.add_action(**action)

            if person:
                act.add_related_entity(person,
                                       "person",
                                       entity_id=_make_pseudo_id(name=person))

            if action["description"] == "Referred":
                referred_to = matter["MatterBodyName"]
                if referred_to != "City Council":
                    act.add_related_entity(referred_to,
                                           "organization",
                                           entity_id=_make_pseudo_id(name=referred_to))

            result, ballots = outcome
            if not result:
                continue

            vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                   motion_text=action["description"],
                                   organization=action["organization"],
                                   classification=None,
                                   start_date=action["date"],
                                   result=result,
                                   bill=bill)
            vote_event.add_source(web_url)
            vote_event.add_source(api_url + "/histories")

            for ballot in ballots:
                raw_option = ballot["VoteValueName"].lower()
                # Unmapped option names pass through unchanged.
                option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                vote_event.vote(option, ballot["VotePersonName"].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic["MatterIndexName"].strip())

        for attachment in self.attachments(matter_id):
            label = attachment["MatterAttachmentName"]
            if label:
                bill.add_version_link(label,
                                      attachment["MatterAttachmentHyperlink"],
                                      media_type="application/pdf")

        bill.extras = {"local_classification": matter["MatterTypeName"]}

        text = self.text(matter_id)
        if text:
            plain, rtf = text["MatterTextPlain"], text["MatterTextRtf"]
            if plain:
                bill.extras["plain_text"] = plain
            if rtf:
                # NUL characters are stripped from the RTF before storing it.
                bill.extras["rtf_text"] = rtf.replace(u"\u0000", "")

        yield bill
def scrape(self, window=3):
    """Yield bills and their vote events for recently changed Chicago matters.

    ``self.matters`` is queried with a cutoff ``window`` days before the
    current time.  Matters lacking an intro date, title or file number are
    skipped.  For each remaining matter a ``Bill`` is built with its
    sources, actions, sponsorships, subjects, attachments and text; every
    ``VoteEvent`` for a bill is yielded before the bill itself.

    :window (numeric) - number of days to look back.  Defaults to 3, the
    value that used to be hard-coded; fractions are accepted because the
    value is passed through ``float``.
    """
    # NOTE(review): this is naive local time, not UTC - confirm which one
    # self.matters expects.
    n_days_ago = datetime.datetime.now() - datetime.timedelta(float(window))

    for matter in self.matters(n_days_ago):
        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # A leading "S" is dropped from the primary identifier; the original
        # spelling is kept as an alternate identifier.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        legistar_web = self.legislation_detail_url(matter_id)
        legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        # Distinct loop names: the original reused `identifier` and `vote`
        # here, clobbering the values bound above.
        for alternate_identifier in alternate_identifiers:
            bill.add_identifier(alternate_identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                if body_name != 'City Council':
                    act.add_related_entity(body_name,
                                           'organization',
                                           entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                       motion_text=action['description'],
                                       organization=action['organization'],
                                       classification=None,
                                       start_date=action['date'],
                                       result=result,
                                       bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for member_vote in votes:
                    raw_option = member_vote['VoteValueName'].lower()
                    # Unmapped option names pass through unchanged.
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    member_vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_version_link(attachment['MatterAttachmentName'],
                                      attachment['MatterAttachmentHyperlink'],
                                      media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                # NUL characters are stripped from the RTF before storing it.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        yield bill
def scrape(self, window=3):
    """Yield bills and their vote events for recently changed Chicago matters.

    ``self.matters`` is queried with a cutoff ``window`` days before the
    current UTC time.  Matters lacking an intro date, title or file number
    are skipped.  For each remaining matter a ``Bill`` is built with its
    sources, actions, sponsorships, subjects, attachments and text; every
    ``VoteEvent`` for a bill is yielded before the bill itself.

    :window (numeric) - number of days to look back; fractions are accepted
    because the value is passed through ``float``.
    """
    lookback = datetime.timedelta(float(window))
    since = datetime.datetime.utcnow() - lookback

    # There are currently no bills with duplicate action items! If a
    # bill has a duplicate action item that's causing the entire scrape
    # to fail, add its identifier to this list to skip it.
    problem_bills = []

    for matter in self.matters(since):
        matter_id = matter['MatterId']
        introduced = matter['MatterIntroDate']
        title = matter['MatterTitle']
        file_id = matter['MatterFile']

        if file_id in problem_bills:
            continue
        if not (introduced and title and file_id):
            continue

        session = self.session(self.toTime(introduced))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # A leading "S" is dropped from the primary identifier; the original
        # spelling is kept as an alternate identifier.
        other_ids = [file_id] if file_id.startswith('S') else []
        if other_ids:
            file_id = file_id[1:]

        bill = Bill(identifier=file_id,
                    legislative_session=session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        web_source = matter['legistar_url']
        api_source = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
            matter_id)
        bill.add_source(web_source, note='web')
        bill.add_source(api_source, note='api')

        for other_id in other_ids:
            bill.add_identifier(other_id)

        for action, outcome in self.actions(matter_id):
            # "responsible person" is not an add_action argument, so it is
            # removed before the dict is splatted.
            responsible = action.pop('responsible person')
            recorded = bill.add_action(**action)

            if responsible:
                recorded.add_related_entity(
                    responsible,
                    'person',
                    entity_id=_make_pseudo_id(name=responsible))

            if action['description'] == 'Referred':
                body = matter['MatterBodyName']
                if body != 'City Council':
                    recorded.add_related_entity(
                        body,
                        'organization',
                        entity_id=_make_pseudo_id(name=body))

            result, ballots = outcome
            if not result:
                continue

            vote_event = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action['description'],
                organization=action['organization'],
                classification=None,
                start_date=action['date'],
                result=result,
                bill=bill)
            vote_event.add_source(web_source)
            vote_event.add_source(api_source + '/histories')

            for ballot in ballots:
                raw = ballot['VoteValueName'].lower()
                # Unmapped option names pass through unchanged.
                vote_event.vote(self.VOTE_OPTIONS.get(raw, raw),
                                ballot['VotePersonName'].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_version_link(
                    attachment_name,
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                # NUL characters are stripped from the RTF before storing it.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape_events_range(self, start_date, end_date):
    """Yield events, and the bills on their agendas, for each day in a range.

    Days are walked from ``start_date`` up to, but not including,
    ``end_date``.  Each entry returned by ``self.extract_events_by_day``
    becomes an ``Event`` localized to America/Toronto.  When the entry's
    publishing status shows an agenda or minutes are published, its agenda
    items are attached to the event, and a ``Bill`` is yielded for every
    agenda item whose full identifier is not yet in
    ``self.seen_agenda_items``.  Bills are yielded before their event.

    Parameters:
        start_date: first day to scrape; must support ``.replace(hour=...)``
            and subtraction, as a ``datetime`` does.
        end_date: day on which to stop (exclusive).

    Raises:
        ValueError: if an agenda item's identifier does not correspond to
            exactly one of the meeting's full identifiers.
    """
    tz = pytz.timezone("America/Toronto")
    # Full identifiers are "<4 digits>.<2 capitals><digits>.<digits>"; the
    # captured group is the short form carried by agenda items.
    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
    agenda_statuses = ('Agenda Published', 'Minutes Published')

    def daterange(first_day, stop_day):
        # Yields first_day, first_day + 1 day, ... and stops before stop_day.
        for offset in range(int((stop_day - first_day).days)):
            yield first_day + dt.timedelta(offset)

    def normalize_wards(raw):
        # Empty or 'All' collapses to the string 'all'; anything else is
        # split into a list on ', '.
        if not raw or raw == 'All':
            return 'all'
        return raw.split(', ')

    def index_by_short_identifier(full_identifiers):
        # Map short identifier -> list of full identifiers.  Identifiers
        # that do not match the pattern are left out rather than crashing
        # with AttributeError on a failed match.
        index = {}
        for full in full_identifiers:
            matched = identifier_regex.match(full)
            if matched:
                index.setdefault(matched.group(1), []).append(full)
        return index

    for date in daterange(start_date, end_date):
        for event in self.extract_events_by_day(date):
            clock = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=clock.hour,
                                             minute=clock.minute,
                                             second=0,
                                             microsecond=0))
            source_url = CALENDAR_DAY_TEMPLATE.format(start.year, start.month, start.day)
            org_name = event['meeting']

            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status']),
            )
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name=org_name,
                type='organization',
            )

            if event['publishing_status'] in agenda_statuses:
                # Full-council meetings use a different agenda template.
                is_council = event['meeting'] == self.jurisdiction.name
                template = (AGENDA_FULL_COUNCIL_TEMPLATE if is_council
                            else AGENDA_FULL_STANDARD_TEMPLATE)
                agenda_url = template.format(event['meeting_id'])
                # Built once per meeting instead of rescanning every full
                # identifier for every agenda item.
                by_short_identifier = index_by_short_identifier(
                    self.full_identifiers(event['meeting_id'], is_council))

                e.add_source(agenda_url)
                for i, item in enumerate(self.agenda_from_url(agenda_url)):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    wards = normalize_wards(item['wards'])

                    candidates = by_short_identifier.get(item['identifier'], [])
                    if len(candidates) != 1:
                        raise ValueError(
                            'expected exactly one full identifier for agenda '
                            'item {!r}, found {}'.format(item['identifier'],
                                                         len(candidates)))
                    full_identifier = candidates[0]

                    a.add_bill(full_identifier)
                    if full_identifier not in self.seen_agenda_items:
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session='2014-2018',
                            identifier=full_identifier,
                            title=item['title'],
                            from_organization={'name': self.jurisdiction.name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(note='canonical',
                                            media_type='text/html',
                                            url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                        b.extras = {
                            'wards': wards,
                        }

                        self.seen_agenda_items.append(full_identifier)

                        yield b

            yield e
def scrape(self, session=None, chamber=None):
    """Yield bills and their vote events for one legislative session.

    The legislation index for the session is fetched through
    ``self.lservice``; each entry's detail record is turned into a ``Bill``
    with an abstract, actions, sponsorships, version links and sources.
    Every ``VoteEvent`` for a bill is yielded before the bill itself.
    Instruments without a caption are skipped.

    Parameters:
        session: session name used to look up ``SESSION_SITE_IDS``; when
            falsy, ``self.latest_session()`` is used and logged.
        chamber: accepted but not used by this method.
    """
    bill_type_map = {
        "B": "bill",
        "R": "resolution",
        "JR": "joint resolution",
        "CR": "concurrent resolution",
    }

    chamber_map = {
        "H": "lower",
        "S": "upper",
        "J": "joint",
        "E": "legislature",  # Effective date
    }

    # Branch/committee-type names as they appear in the service responses.
    branch_map = {"House": "lower", "Senate": "upper"}

    # How an individual member's recorded vote maps to an option; anything
    # not listed is counted as "other".
    vote_options = {"Yea": "yes", "Nay": "no"}

    action_code_map = {
        "HI": None,
        "SI": None,
        "HH": None,
        "SH": None,
        "HPF": ["introduction"],
        "HDSAS": None,
        "SPF": ["introduction"],
        "HSR": ["reading-2"],
        "SSR": ["reading-2"],
        "HFR": ["reading-1"],
        "SFR": ["reading-1"],
        "HRECM": ["withdrawal", "referral-committee"],
        "SRECM": ["withdrawal", "referral-committee"],
        "SW&C": ["withdrawal", "referral-committee"],
        "HW&C": ["withdrawal", "referral-committee"],
        "HRA": ["passage"],
        "SRA": ["passage"],
        "HPA": ["passage"],
        "HRECO": None,
        "SPA": ["passage"],
        "HTABL": None,  # 'House Tabled' - what is this?
        "SDHAS": None,
        "HCFR": ["committee-passage-favorable"],
        "SCFR": ["committee-passage-favorable"],
        "HRAR": ["referral-committee"],
        "SRAR": ["referral-committee"],
        "STR": ["reading-3"],
        "SAHAS": None,
        "SE": ["passage"],
        "SR": ["referral-committee"],
        "HTRL": ["reading-3", "failure"],
        "HTR": ["reading-3"],
        "S3RLT": ["reading-3", "failure"],
        "HASAS": None,
        "S3RPP": None,
        "STAB": None,
        "SRECO": None,
        "SAPPT": None,
        "HCA": None,
        "HNOM": None,
        "HTT": None,
        "STT": None,
        "SRECP": None,
        "SCRA": None,
        "SNOM": None,
        "S2R": ["reading-2"],
        "H2R": ["reading-2"],
        "SENG": ["passage"],
        "HENG": ["passage"],
        "HPOST": None,
        "HCAP": None,
        "SDSG": ["executive-signature"],
        "SSG": ["executive-receipt"],
        "Signed Gov": ["executive-signature"],
        "HDSG": ["executive-signature"],
        "HSG": ["executive-receipt"],
        "EFF": None,
        "HRP": None,
        "STH": None,
        "HTS": None,
    }

    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)
    sid = SESSION_SITE_IDS[session]

    legislation = backoff(self.lservice.GetLegislationForSession,
                          sid)["LegislationIndex"]

    for leg in legislation:
        instrument = backoff(self.lservice.GetLegislationDetail, leg["Id"])

        # Skip caption-less instruments before doing any parsing that
        # would be thrown away (and that could fail on a partial record).
        title = instrument["Caption"]
        if title is None:
            continue

        # The status history is reversed before being recorded as actions.
        actions = [
            {
                "code": x["Code"],
                "action": x["Description"],
                "_guid": x["Id"],
                "date": x["Date"],
            }
            for x in instrument["StatusHistory"][0]
        ]
        actions.reverse()

        guid = instrument["Id"]

        # A little bit hacky: the first letter of the document type is the
        # chamber, the remainder is the bill type.
        bill_prefix = instrument["DocumentType"]
        bill_chamber = chamber_map[bill_prefix[0]]
        bill_type = bill_type_map[bill_prefix[1:]]

        bill_id = "%s %s" % (bill_prefix, instrument["Number"])
        if instrument["Suffix"]:
            bill_id += instrument["Suffix"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_abstract(instrument["Summary"], note="description")
        bill.extras = {"guid": guid}

        if instrument["Votes"]:
            for _, vote_refs in instrument["Votes"]:
                tally = backoff(self.vservice.GetVote, vote_refs[0]["VoteId"])

                vote = VoteEvent(
                    start_date=tally["Date"].strftime("%Y-%m-%d"),
                    motion_text=tally["Caption"] or "Vote on Bill",
                    chamber=branch_map[tally["Branch"]],
                    # A tie counts as a failure.
                    result="pass" if tally["Yeas"] > tally["Nays"] else "fail",
                    classification="passage",
                    bill=bill,
                )
                vote.set_count("yes", tally["Yeas"])
                vote.set_count("no", tally["Nays"])
                vote.set_count("other", tally["Excused"] + tally["NotVoting"])

                vote.add_source(self.vsource)

                for vdetail in tally["Votes"][0]:
                    whom = vdetail["Member"]
                    how = vdetail["MemberVoted"]
                    if whom["Name"] == "VACANT":
                        continue
                    name, district = vote_name_pattern.search(
                        whom["Name"]).groups()
                    vote.vote(vote_options.get(how, "other"), name, note=district)

                yield vote

        # Committee names grouped by chamber, used to tag committee actions.
        ccommittees = defaultdict(list)
        committees = instrument["Committees"]
        if committees:
            for committee in committees[0]:
                ccommittees[branch_map[committee["Type"]]].append(
                    committee["Name"])

        for action in actions:
            action_chamber = chamber_map[action["code"][0]]

            try:
                action_types = action_code_map[action["code"]]
            except KeyError:
                error_msg = "Code {code} for action {action} not recognized.".format(
                    code=action["code"], action=action["action"])
                self.logger.warning(error_msg)
                action_types = None

            # Only committee-related classifications get the chamber's
            # committees attached as related entities.
            committees = []
            if action_types and any("committee" in x for x in action_types):
                committees = [
                    str(x) for x in ccommittees.get(action_chamber, [])
                ]

            act = bill.add_action(
                action["action"],
                action["date"].strftime("%Y-%m-%d"),
                classification=action_types,
                chamber=action_chamber,
            )
            for committee in committees:
                act.add_related_entity(committee, "organization")
            act.extras = {"code": action["code"], "guid": action["_guid"]}

        # Copy before extending so the response's own author list is not
        # mutated in place.
        sponsors = []
        if instrument["Authors"]:
            sponsors = list(instrument["Authors"]["Sponsorship"])
            if "Sponsors" in instrument and instrument["Sponsors"]:
                sponsors.extend(instrument["Sponsors"]["Sponsorship"])

        sponsors = [(x["Type"], self.get_member(x["MemberId"]))
                    for x in sponsors]

        for typ, sponsor in sponsors:
            name = "{First} {Last}".format(**dict(sponsor["Name"]))
            is_author = "Author" in typ
            bill.add_sponsorship(
                name,
                entity_type="person",
                classification="primary" if is_author else "secondary",
                primary=is_author,
            )

        for version in instrument["Versions"]["DocumentDescription"]:
            name, url, doc_id, version_id = [
                version[x] for x in ["Description", "Url", "Id", "Version"]
            ]
            link = bill.add_version_link(name, url, media_type="application/pdf")
            link["extras"] = {
                "_internal_document_id": doc_id,
                "_version_id": version_id,
            }

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(
            SOURCE_URL.format(**{
                "session": session,
                "bid": guid
            }))

        yield bill
def scrape_chamber(self, chamber, session):
    """Yield the bills of one chamber from the ``bill_status/`` API listing.

    The full listing is downloaded once and filtered to bill numbers that
    start with the chamber's letter ('S' when ``chamber`` is 'upper',
    otherwise 'H').  Each bill receives its titles, sponsorships and
    actions; whatever ``self.scrape_html`` yields for the bill is passed
    through before the bill itself is yielded.

    Parameters:
        chamber: 'upper' selects Senate bills; any other value selects
            House bills.
        session: passed to ``Bill`` and to ``self.scrape_html``.
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + 'bill_status/').text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json['content']
    for bill_data in bills:

        bill_id = bill_data['BILLNO']

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        # 'CR' must be tested before 'R', which it contains.
        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'
        else:
            # Previously btype was left unbound here (or kept the previous
            # bill's value); make the fallback explicit and visible.
            self.warning('unrecognized bill type for %s; treating as bill'
                         % bill_id)
            btype = 'bill'

        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}

        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

        # Record the long title as an extra title only when it differs from
        # the one the bill was created with.
        if (bill_data['LONGTITLE'] and
                bill_data['LONGTITLE'] != bill.title):
            bill.add_title(bill_data['LONGTITLE'])

        # An "original sponsor" is the API's expression of "primary sponsor"
        original_sponsors = bill_data['ORIGINAL_SPONSOR']
        for primary_sponsor in original_sponsors:
            bill.add_sponsorship(
                name=primary_sponsor,
                # Names containing "committee" are treated as organizations.
                entity_type='organization' if "committee" in primary_sponsor.lower()
                else 'person',
                primary=True,
                classification="original sponsor"
            )
        for sponsor in bill_data['SPONSOR_NAMES']:
            if sponsor in original_sponsors:
                continue
            bill.add_sponsorship(
                name=sponsor,
                entity_type='organization' if "committee" in sponsor.lower()
                else 'person',
                primary=False,
                classification='cosponsor',
            )

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = ('upper' if event['chamber'] == 'Senate'
                     else 'lower')

            date = event['session_date']
            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']

            if event['action_code'] not in ksapi.action_codes:
                self.warning('unknown action code on %s: %s %s' %
                             (bill_id, event['action_code'],
                              event['status']))
                atype = None
            else:
                atype = ksapi.action_codes[event['action_code']]
            bill.add_action(
                action, date, chamber=actor, classification=atype)

        # Versions are exposed in `bill_data['versions'],
        # but lack any descriptive text or identifiers;
        # continue to scrape these from the HTML
        yield from self.scrape_html(bill, session)

        yield bill
def scrape(self, window=28, matter_ids=None):
    '''By default, scrape board reports updated in the last 28 days.
    Optionally specify a larger or smaller window of time from which to
    scrape updates, or specific matters to scrape.
    Note that passing a value for :matter_ids supersedes the value of
    :window, such that the given matters will be scraped regardless of
    when they were updated.

    Restricted ("private") matters introduced on or after
    START_DATE_PRIVATE_SCRAPE are yielded as stripped-down bills titled
    'Restricted View'; older restricted matters are skipped.  Vote events
    are yielded before the bill they belong to.

    Optional parameters
    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape
    '''

    # `window` is only interpreted when no matter_ids are given, so an
    # unusable window cannot break an explicit matter_ids run.
    if matter_ids:
        matters = [
            self.matter(matter_id) for matter_id in matter_ids.split(',')
        ]
        matters = filter(
            None, matters)  # Skip matters that are not yet in Legistar
    elif float(window):  # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()

    for matter in matters:
        # Skip this bill, until Metro cleans up duplicate in Legistar API
        if matter['MatterFile'] == '2017-0447':
            continue

        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        # Evaluated once per matter and reused below.
        restricted = self._is_restricted(matter)

        # Do not scrape private bills introduced before this timestamp.
        if restricted and (date < self.START_DATE_PRIVATE_SCRAPE):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # A leading "S" is dropped from the primary identifier; the original
        # spelling is kept as an alternate identifier.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        # The Metro scraper scrapes private bills.
        # However, we do not want to capture significant data about private bills,
        # other than the value of the helper function `_is_restricted` and a last modified timestamp.
        # We yield private bills early, wipe data from previously imported once-public bills,
        # and include only data *required* by the pupa schema.
        # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
        bill.extras = {'restrict_view': restricted}

        # Add API source early.
        # Private bills should have this url for debugging.
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
        bill.add_source(legistar_api, note='api')

        if restricted:
            # required fields
            bill.title = 'Restricted View'

            # wipe old data
            bill.extras['plain_text'] = ''
            bill.extras['rtf_text'] = ''
            bill.sponsorships = []
            bill.related_bills = []
            bill.versions = []
            bill.documents = []
            bill.actions = []

            yield bill
            continue

        legistar_web = matter['legistar_url']
        bill.add_source(legistar_web, note='web')

        for alternate_identifier in alternate_identifiers:
            bill.add_identifier(alternate_identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for member_vote in votes:
                    # A missing vote value (no .lower()) is recorded as None.
                    try:
                        raw_option = member_vote['VoteValueName'].lower()
                    except AttributeError:
                        raw_option = None
                    # Unmapped option names pass through unchanged.
                    clean_option = self.VOTE_OPTIONS.get(
                        raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    member_vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill.
                # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                related_date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(related_date))
                related_identifier = related_bill['MatterFile']
                bill.add_related_bill(
                    identifier=related_identifier,
                    legislative_session=related_bill_session,
                    relation_type='companion')
                # Currently, the relation type for bills can be one of a few possibilities: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'].strip(),
                    media_type="application/pdf")

        bill.extras['local_classification'] = matter['MatterTypeName']

        matter_version_value = matter['MatterVersion']
        text = self.text(matter_id, matter_version_value)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                # NUL characters are stripped from the RTF before storing it.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape(self, window=28, matter_ids=None):
    '''By default, scrape board reports updated in the last 28 days.
    Optionally specify a larger or smaller window of time from which to
    scrape updates, or specific matters to scrape.
    Note that passing a value for :matter_ids supersedes the value of
    :window, such that the given matters will be scraped regardless of
    when they were updated.

    Matters flagged MatterRestrictViewViaWeb are skipped entirely.  Vote
    events are yielded before the bill they belong to.

    Optional parameters
    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape
    '''

    # `window` is only interpreted when no matter_ids are given, so an
    # unusable window cannot break an explicit matter_ids run.
    if matter_ids:
        requested = (self.matter(matter_id)
                     for matter_id in matter_ids.split(','))
        # Skip matters that are not yet in Legistar
        matters = [found for found in requested if found]
    elif float(window):  # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()

    for matter in matters:
        # If this Boolean field is True, then do not scrape the Bill.
        # This issue explains why a restricted Bill might appear (unwelcome) in the Legistar API:
        # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826
        if matter['MatterRestrictViewViaWeb']:
            continue

        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # A leading "S" is dropped from the primary identifier; the original
        # spelling is kept as an alternate identifier.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for alternate_identifier in alternate_identifiers:
            bill.add_identifier(alternate_identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for member_vote in votes:
                    raw_option = member_vote['VoteValueName'].lower()
                    # Unmapped option names pass through unchanged.
                    clean_option = self.VOTE_OPTIONS.get(
                        raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    member_vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill.
                # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                related_date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(related_date))
                related_identifier = related_bill['MatterFile']
                bill.add_related_bill(
                    identifier=related_identifier,
                    legislative_session=related_bill_session,
                    relation_type='companion')
                # Currently, the relation type for bills can be one of a few possibilities: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                # NUL characters are stripped from the RTF before storing it.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape_chamber(self, chamber, session):
    """Yield the bills of one chamber from the ``bill_status/`` API listing.

    The listing is fetched once, then narrowed to bill numbers beginning
    with the chamber's letter ('S' for 'upper', 'H' for anything else).
    For each bill the titles, sponsorships and actions are recorded; the
    items produced by ``self.scrape_html`` are yielded first, then the
    bill.

    Parameters:
        chamber: 'upper' selects Senate bills; any other value selects
            House bills.
        session: passed to ``Bill`` and to ``self.scrape_html``.
    """
    def classify(bill_id):
        # 'CR' must be tested before 'R', which it contains.  Returns None
        # when the id carries none of the known markers.
        for marker, classification in (('CR', 'concurrent resolution'),
                                       ('R', 'resolution'),
                                       ('B', 'bill')):
            if marker in bill_id:
                return classification
        return None

    def sponsor_entity_type(name):
        # Names containing "committee" are treated as organizations.
        return 'organization' if "committee" in name.lower() else 'person'

    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    listing = json.loads(self.get(ksapi.url + 'bill_status/').text)

    for bill_data in listing['content']:
        bill_id = bill_data['BILLNO']

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        btype = classify(bill_id)
        if btype is None:
            # The if/elif chain used to leave btype unbound here (or reuse
            # the previous bill's value); fall back explicitly and say so.
            self.warning('unrecognized bill type for %s; treating as bill'
                         % bill_id)
            btype = 'bill'

        long_title = bill_data['LONGTITLE']
        title = bill_data['SHORTTITLE'] or long_title

        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}

        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

        # Record the long title as an extra title only when it differs from
        # the one the bill was created with.
        if long_title and long_title != bill.title:
            bill.add_title(long_title)

        # An "original sponsor" is the API's expression of "primary sponsor"
        original_sponsors = bill_data['ORIGINAL_SPONSOR']
        for primary_sponsor in original_sponsors:
            bill.add_sponsorship(name=primary_sponsor,
                                 entity_type=sponsor_entity_type(primary_sponsor),
                                 primary=True,
                                 classification="original sponsor")
        for sponsor in bill_data['SPONSOR_NAMES']:
            if sponsor in original_sponsors:
                continue
            bill.add_sponsorship(
                name=sponsor,
                entity_type=sponsor_entity_type(sponsor),
                primary=False,
                classification='cosponsor',
            )

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = 'upper' if event['chamber'] == 'Senate' else 'lower'
            date = event['session_date']

            # append committee names if present
            action = event['status']
            if 'committee_names' in event:
                action = action + ' ' + ' and '.join(event['committee_names'])

            code = event['action_code']
            if code in ksapi.action_codes:
                atype = ksapi.action_codes[code]
            else:
                self.warning(
                    'unknown action code on %s: %s %s' %
                    (bill_id, code, event['status']))
                atype = None

            bill.add_action(action,
                            date,
                            chamber=actor,
                            classification=atype)

        # Versions are exposed in `bill_data['versions'],
        # but lack any descriptive text or identifiers;
        # continue to scrape these from the HTML
        yield from self.scrape_html(bill, session)

        yield bill
def scrape(self):
    """Yield a ``Bill`` (and any ``VoteEvent``) per legislation summary.

    Iterates ``self.legislation(...)`` for records created after
    2015-05-17, skipping summaries without a title or an intro date.
    For each remaining summary a ``Bill`` is built and enriched with
    related bills, sponsors, subjects, attachments and actions taken from
    ``self.legDetails`` and ``self.history``.  When ``legDetails`` raises
    ``IndexError`` the bare bill is yielded and its URL is remembered; the
    collected URLs are printed once the scrape finishes.
    """
    unreachable_urls = []

    for leg_summary in self.legislation(
            created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()

        # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
        # doesn't have an intro date
        if not title or not leg_summary['Intro\xa0Date']:
            continue

        bill_type = BILL_TYPES[leg_summary['Type']]

        bill_session = self.session(
            self.toTime(leg_summary['Intro\xa0Date']))
        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        bill.add_source(leg_summary['url'])

        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            # Detail page could not be parsed: emit what we have.
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue

        # The keyword test must be case-insensitive, so it is run against
        # the lower-cased title (the original-case title never matched
        # "Sundry ..." or "Miscellaneous ...").
        lower_title = title.lower()
        is_omnibus = ("sundry" in lower_title or
                      "miscellaneous" in lower_title)
        if is_omnibus:
            for related_bill in leg_details.get('Related files', []):
                bill.add_related_bill(
                    identifier=related_bill['label'],
                    legislative_session=bill.legislative_session,
                    relation_type='replaces')
        # for now we're skipping related bills if they
        # don't contain words that make us think they're
        # in an omnibus relationship with each other

        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            # The first listed sponsor is treated as the primary one.
            if i == 0:
                primary = True
                sponsorship_type = "Primary"
            else:
                primary = False
                sponsorship_type = "Regular"

            sponsor_name = sponsor['label']

            # Does the Mayor/Clerk introduce legislation as
            # individual role holders or as the Office of the City
            # Clerk and the Office of the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk',
                                        'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm', )):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'

            # Placeholder "sponsors" are not recorded at all.
            if not sponsor_name.startswith(
                    ('Misc. Transmittal', 'No Sponsor', 'Dept./Agency')):
                bill.add_sponsorship(
                    sponsor_name,
                    sponsorship_type,
                    entity_type,
                    primary,
                    entity_id=_make_pseudo_id(name=sponsor_name))

        if 'Topic' in leg_details:
            for subject in leg_details[u'Topic'].split(','):
                bill.add_subject(subject)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")

        for action in self.history(leg_summary['url']):
            action_description = action['Action']
            try:
                action_date = self.toTime(
                    action['Date']).date().isoformat()
            except AttributeError:
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue

            if action_description:
                # 'Action By' is either a mapping carrying a 'label' or a
                # value that cannot be indexed with one (TypeError).
                try:
                    responsible_org = action['Action\xa0By']['label']
                except TypeError:
                    responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'Chicago City Council'

                act = bill.add_action(
                    action_description,
                    action_date,
                    organization={'name': responsible_org},
                    classification=ACTION_CLASSIFICATION[
                        action_description])

                if action_description == 'Referred':
                    # Probe for a single body (indexable by 'label');
                    # on TypeError the value is used as the collection
                    # of bodies directly.
                    try:
                        leg_details[
                            'Current Controlling Legislative Body'][
                                'label']
                        controlling_bodies = [
                            leg_details[
                                'Current Controlling Legislative Body']
                        ]
                    except TypeError:
                        controlling_bodies = leg_details[
                            'Current Controlling Legislative Body']
                    if controlling_bodies:
                        for controlling_body in controlling_bodies:
                            body_name = controlling_body['label']
                            if body_name.startswith("Joint Committee"):
                                act.add_related_entity(
                                    body_name, 'organization')
                            else:
                                act.add_related_entity(
                                    body_name,
                                    'organization',
                                    entity_id=_make_pseudo_id(
                                        name=body_name))

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    result, votes = self.extractVotes(action_detail_url)

                    if votes and result:
                        # see https://github.com/datamade/municipal-scrapers-us/issues/15
                        action_vote = VoteEvent(
                            legislative_session=bill.legislative_session,
                            motion_text=action_description,
                            organization={'name': responsible_org},
                            classification=None,
                            start_date=action_date,
                            result=result,
                            bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

        bill.extras = {'local_classification': leg_summary['Type']}

        yield bill

    print(unreachable_urls)
def scrape_chamber(self, chamber, session):
    """Yield the bills of one chamber, built from the API's bill listing.

    Fetches ``bill_status/`` from ``ksapi.url``, keeps the entries whose
    ``BILLNO`` starts with the chamber's letter, and builds a ``Bill`` with
    titles, sponsorships and actions for each of them.  Whatever
    ``self.scrape_html(bill, session)`` yields is passed through just
    before the bill itself.

    :param chamber: ``"upper"`` selects bill numbers starting with ``S``;
        any other value selects those starting with ``H``.
    :param session: handed unchanged to ``Bill`` and ``scrape_html``.
    """
    chamber_name = "Senate" if chamber == "upper" else "House"
    chamber_letter = chamber_name[0]

    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + "bill_status/").text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json["content"]

    for bill_data in bills:
        bill_id = bill_data["BILLNO"]

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        # Order matters: "CR" has to be tested before the bare "R".
        if "CR" in bill_id:
            btype = "concurrent resolution"
        elif "R" in bill_id:
            btype = "resolution"
        elif "B" in bill_id:
            btype = "bill"
        else:
            # Without this branch ``btype`` is either unbound (first bill)
            # or silently inherited from the previously processed bill.
            self.warning("unknown bill type on %s, skipping" % bill_id)
            continue

        title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

        # main
        bill = Bill(bill_id, session, title, chamber=chamber,
                    classification=btype)
        bill.extras = {"status": bill_data["STATUS"]}

        bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

        # Keep the long title as an alternate when it differs from the
        # title the bill was created with.
        if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill.title:
            bill.add_title(bill_data["LONGTITLE"])

        # An "original sponsor" is the API's expression of "primary sponsor"
        for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
            bill.add_sponsorship(
                name=primary_sponsor,
                entity_type="organization"
                if "committee" in primary_sponsor.lower()
                else "person",
                primary=True,
                classification="original sponsor",
            )

        for sponsor in bill_data["SPONSOR_NAMES"]:
            # Already recorded above as an original sponsor.
            if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                continue
            bill.add_sponsorship(
                name=sponsor,
                entity_type="organization"
                if "committee" in sponsor.lower()
                else "person",
                primary=False,
                classification="cosponsor",
            )

        # history is backwards
        for event in reversed(bill_data["HISTORY"]):
            actor = "upper" if event["chamber"] == "Senate" else "lower"

            date = event["session_date"]

            # append committee names if present
            if "committee_names" in event:
                action = (event["status"] + " " +
                          " and ".join(event["committee_names"]))
            else:
                action = event["status"]

            if event["action_code"] not in ksapi.action_codes:
                self.warning(
                    "unknown action code on %s: %s %s"
                    % (bill_id, event["action_code"], event["status"]))
                atype = None
            else:
                atype = ksapi.action_codes[event["action_code"]]

            bill.add_action(
                action, date, chamber=actor, classification=atype)

        # Versions are exposed in `bill_data['versions'],
        # but lack any descriptive text or identifiers;
        # continue to scrape these from the HTML
        yield from self.scrape_html(bill, session)

        yield bill
def scrape(self):
    """Yield bills, and their vote events, for recently updated matters.

    Asks ``self.matters`` for everything since three days before now and
    turns each matter that has both an intro date and a title into a
    ``Bill`` carrying sources, actions, sponsorships, subjects, version
    links and text extras.  A ``VoteEvent`` is yielded for every action
    whose vote has a result, ahead of the bill it belongs to.
    """
    cutoff = datetime.datetime.now() - datetime.timedelta(days=3)

    for matter in self.matters(cutoff):
        matter_id = matter['MatterId']
        intro_date = matter['MatterIntroDate']
        matter_title = matter['MatterTitle']

        # Both an intro date and a title are required.
        if not (intro_date and matter_title):
            continue

        session_name = self.session(self.toTime(intro_date))
        classification = BILL_TYPES[matter['MatterTypeName']]

        bill = Bill(
            identifier=matter['MatterFile'],
            legislative_session=session_name,
            title=matter_title,
            classification=classification,
            from_organization={"name": "Chicago City Council"},
        )

        web_url = self.legislation_detail_url(matter_id)
        api_url = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
            matter_id)
        bill.add_source(web_url, note='web')
        bill.add_source(api_url, note='api')

        for action, vote in self.actions(matter_id):
            recorded_action = bill.add_action(**action)

            if action['description'] == 'Referred':
                referred_to = matter['MatterBodyName']
                # The council itself is not attached as a related entity.
                if referred_to != 'City Council':
                    recorded_action.add_related_entity(
                        referred_to,
                        'organization',
                        entity_id=_make_pseudo_id(name=referred_to))

            result, ballots = vote
            if not result:
                continue

            vote_event = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action['description'],
                organization=action['organization'],
                classification=None,
                start_date=action['date'],
                result=result,
                bill=bill,
            )
            vote_event.add_source(web_url)
            vote_event.add_source(api_url + '/histories')

            for ballot in ballots:
                raw_option = ballot['VoteValueName'].lower()
                # Unmapped vote values are passed through unchanged.
                option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                vote_event.vote(option, ballot['VotePersonName'].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_version_link(
                    attachment_name,
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        matter_text = self.text(matter_id)
        if matter_text:
            if matter_text['MatterTextPlain']:
                bill.extras['plain_text'] = matter_text['MatterTextPlain']

            if matter_text['MatterTextRtf']:
                # NUL characters are removed from the RTF before storing.
                bill.extras['rtf_text'] = matter_text[
                    'MatterTextRtf'].replace(u'\u0000', '')

        yield bill
def scrape(self):
    """Yield a ``Bill`` per legislation summary, preceded by its votes.

    Covers summaries created after 2014-01-01.  The bill's legislative
    session is derived from its earliest action, falling back to the
    first entry of ``self.SESSION_STARTS`` when there is no history.
    Actions by 'Town Hall Meeting' are ignored; committee referrals get
    the referred committee attached as a related entity; a ``VoteEvent``
    is yielded for every action whose detail page lists votes.
    """
    since = datetime.datetime(2014, 1, 1)

    for leg_summary in self.legislation(created_after=since):
        summary_url = leg_summary['url']
        leg_type = BILL_TYPES[leg_summary['Type']]

        bill = Bill(
            identifier=leg_summary['File\xa0#'],
            title=leg_summary['Title'],
            legislative_session=None,  # filled in once history is known
            classification=leg_type,
            from_organization={"name": "New York City Council"},
        )
        bill.add_source(summary_url)

        leg_details = self.legDetails(summary_url)
        history = self.history(summary_url)

        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')

        if 'Summary' in leg_details:
            bill.add_abstract(leg_details['Summary'], note='')

        if leg_details['Law number']:
            bill.add_identifier(leg_details['Law number'],
                                note='law number')

        sponsors = self._sponsors(leg_details.get('Sponsors', []))
        for sponsor, sponsorship_type, primary in sponsors:
            bill.add_sponsorship(
                sponsor,
                sponsorship_type,
                'person',
                primary,
                entity_id=_make_pseudo_id(name=sponsor))

        for attachment in leg_details.get('Attachments', []):
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")

        # Materialise the history: it is walked twice below.
        history = list(history)

        if history:
            earliest_action = min(
                self.toTime(entry['Date']) for entry in history)
            bill.legislative_session = self.sessions(earliest_action)
        else:
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for entry in history:
            description = entry['Action']
            if not description:
                continue

            action_class = ACTION_CLASSIFICATION[description]
            action_date = self.toDate(entry['Date'])

            actor = entry['Action\xa0By']
            if actor == 'City Council':
                actor = 'New York City Council'
            elif actor == 'Administration':
                actor = 'Mayor'

            if actor == 'Town Hall Meeting':
                continue

            act = bill.add_action(
                description,
                action_date,
                organization={'name': actor},
                classification=action_class)

            details_cell = entry['Action\xa0Details']
            if 'url' not in details_cell:
                continue

            detail_url = details_cell['url']

            if action_class == 'committee-referral':
                action_details = self.actionDetails(detail_url)
                # The committee is whatever follows the last ' to the '.
                committee = action_details['Action text'].rsplit(
                    ' to the ', 1)[-1]
                act.add_related_entity(
                    committee,
                    'organization',
                    entity_id=_make_pseudo_id(name=committee))

            result, votes = self.extractVotes(detail_url)
            if votes:
                action_vote = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=description,
                    organization={'name': actor},
                    classification=action_class,
                    start_date=action_date,
                    result=result,
                    bill=bill)
                action_vote.add_source(detail_url)

                for option, voter in votes:
                    action_vote.vote(option, voter)

                yield action_vote

        full_text = self.text(summary_url)

        extras = {'local_classification': leg_summary['Type']}
        if full_text:
            extras['full_text'] = full_text
        bill.extras = extras

        yield bill
def scrape_events_range(self, start_date, end_date):
    """Yield an ``Event`` per meeting and a ``Bill`` per introduced item.

    Visits each day from ``start_date`` up to, but not including,
    ``end_date`` and builds an ``Event`` from every entry returned by
    ``self.extract_events_by_day``.  When a meeting's agenda or minutes
    are published, its agenda items are attached to the event; every
    ``ACTION`` item is linked to its full identifier and, if the item
    originates in the committee that is meeting, a ``Bill`` is yielded
    for it ahead of the event.

    :param start_date: first day to scrape (a datetime; it is combined
        with each event's parsed time via ``replace``).
    :param end_date: day at which to stop, exclusive.
    :raises ValueError: when an ``ACTION`` item does not correspond to
        exactly one of the meeting's full identifiers.
    """
    # Full identifiers look like '<4 digits>.<short identifier>'; the
    # group captures the short form used by agenda items.
    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
    tz = pytz.timezone("America/Toronto")

    def daterange(start_date, end_date):
        # Yield each day in [start_date, end_date).
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    def is_agenda_available(event):
        # An agenda can be fetched once the agenda or minutes are out.
        return event['publishing_status'] in [
            'Agenda Published', 'Minutes Published'
        ]

    def is_council(event):
        # Full-council meetings carry the jurisdiction's own name.
        return event['meeting'] == self.jurisdiction.name

    def is_vote_event(item):
        return item['type'] == 'ACTION'

    def normalize_wards(raw):
        # Missing wards and 'All' both become 'all'; anything else is
        # split into a list on ', '.
        if not raw or raw == 'All':
            return 'all'
        return raw.split(', ')

    def is_being_introduced(item, event):
        # True when the item's two-letter code prefix matches the code
        # of the committee holding this meeting.
        org_name = event['meeting']
        identifier = item['identifier']

        # `org_code` is two-letter code for committee
        # NOTE(review): assumes every meeting name is present in
        # committees_by_name -- confirm.
        current_org_code = self.committees_by_name.get(
            org_name)[0]['code']
        originating_org_code = re.search(
            r'([A-Z]{2})[0-9]+\.[0-9]+', identifier).group(1)

        return current_org_code == originating_org_code

    def matching_identifiers(full_identifiers, item):
        # Full identifiers whose short form equals the item's identifier.
        # Candidates that do not fit the pattern are skipped instead of
        # raising AttributeError on a failed match.
        matches = []
        for candidate in full_identifiers:
            match = identifier_regex.match(candidate)
            if match and match.group(1) == item['identifier']:
                matches.append(candidate)
        return matches

    for date in daterange(start_date, end_date):
        events = self.extract_events_by_day(date)
        for event in events:
            parsed_time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(
                date.replace(hour=parsed_time.hour,
                             minute=parsed_time.minute,
                             second=0,
                             microsecond=0))
            source_url = CALENDAR_DAY_TEMPLATE.format(
                start.year, start.month, start.day)
            org_name = event['meeting']

            e = Event(name=org_name,
                      start_time=start,
                      timezone=tz.zone,
                      location_name=event['location'],
                      status=STATUS_DICT.get(event['meeting_status']))
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name=org_name,
                type='organization',
            )

            if is_agenda_available(event):
                council = is_council(event)
                template = (AGENDA_FULL_COUNCIL_TEMPLATE if council
                            else AGENDA_FULL_STANDARD_TEMPLATE)
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(
                    self.full_identifiers(event['meeting_id'], council))

                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    if not is_vote_event(item):
                        continue

                    wards = normalize_wards(item['wards'])
                    # Exactly one full identifier is expected per item;
                    # the unpacking raises ValueError otherwise.
                    [full_identifier] = matching_identifiers(
                        full_identifiers, item)
                    a.add_bill(full_identifier)

                    if is_being_introduced(item, event):
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session='2014-2018',
                            identifier=full_identifier,
                            title=item['title'],
                            from_organization={'name': org_name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(
                            note='canonical',
                            media_type='text/html',
                            url=AGENDA_ITEM_TEMPLATE.format(
                                full_identifier))
                        b.extras = {
                            'wards': wards,
                        }

                        yield b

            yield e