def prepare_for_db(self, data):
    """Replace the scraped references in a vote event dict with database ids.

    ``data`` is modified in place and returned.  The 'legislative_session',
    'organization', 'bill' and 'bill_action' keys are popped and replaced by
    'legislative_session_id', 'organization_id', 'bill_id' and, when exactly
    one unclaimed ``BillAction`` matches, 'bill_action_id'.  Every entry of
    ``data['votes']`` has its 'voter_id' resolved with ``allow_no_match=True``.
    Failed or ambiguous action matches are reported through ``self.warning``.
    """
    session_ref = data.pop('legislative_session')
    data['legislative_session_id'] = self.get_session_id(session_ref)

    org_ref = data.pop('organization')
    data['organization_id'] = self.org_importer.resolve_json_id(org_ref)

    bill_ref = data.pop('bill')
    if bill_ref and bill_ref.startswith('~'):
        # unpack the pseudo id and run the bill transformers over it in case
        # any of them alter it, then pack it up again
        bill_spec = get_pseudo_id(bill_ref)
        self.bill_importer.apply_transformers(bill_spec)
        bill_ref = _make_pseudo_id(**bill_spec)
    data['bill_id'] = self.bill_importer.resolve_json_id(bill_ref)

    action_text = data.pop('bill_action')
    if action_text:
        try:
            matched = BillAction.objects.get(
                bill_id=data['bill_id'],
                description=action_text,
                date=data['start_date'],
                organization_id=data['organization_id'],
            )
            # seen_action_ids tracks actions claimed during this import;
            # matched.vote is already set if it was claimed on a prior import
            claimed = (matched.id in self.seen_action_ids
                       or hasattr(matched, 'vote'))
            if not claimed:
                data['bill_action_id'] = matched.id
                self.seen_action_ids.add(matched.id)
            else:
                self.warning('can not match two VoteEvents to %s: %s',
                             matched.id, action_text)
        except BillAction.DoesNotExist:
            self.warning('could not match VoteEvent to %s %s %s',
                         bill_ref, action_text, data['start_date'])
        except BillAction.MultipleObjectsReturned as e:
            self.warning('could not match VoteEvent to %s %s %s: %s',
                         bill_ref, action_text, data['start_date'], e)

    for ballot in data['votes']:
        ballot['voter_id'] = self.person_importer.resolve_json_id(
            ballot['voter_id'], allow_no_match=True)

    return data
def prepare_for_db(self, data):
    """Resolve the references inside a scraped event dict, in place.

    Sets 'jurisdiction_id', replaces 'location' with the result of
    ``self.get_location`` and defaults 'end_date' to an empty string.  Each
    participant and each agenda item's related entity has the first id key
    it carries (checked in the order person, organization, bill, vote event;
    participants only person and organization) resolved through the matching
    importer with ``allow_no_match=True``.  Returns the same ``data`` dict.
    """
    def resolve(importer, record, key):
        # swap the scraped reference stored under ``key`` for its resolved id
        record[key] = importer.resolve_json_id(record[key],
                                               allow_no_match=True)

    data['jurisdiction_id'] = self.jurisdiction_id
    data['location'] = self.get_location(data['location'])
    # no-op re-assignment kept on purpose: it still raises KeyError when
    # 'start_date' is missing
    data['start_date'] = data['start_date']
    data['end_date'] = data.get('end_date', "")

    for participant in data['participants']:
        if 'person_id' in participant:
            resolve(self.person_importer, participant, 'person_id')
        elif 'organization_id' in participant:
            resolve(self.org_importer, participant, 'organization_id')

    for agenda_item in data['agenda']:
        for related in agenda_item['related_entities']:
            if 'person_id' in related:
                resolve(self.person_importer, related, 'person_id')
            elif 'organization_id' in related:
                resolve(self.org_importer, related, 'organization_id')
            elif 'bill_id' in related:
                # unpack and repack the bill pseudo id in case the
                # transformers alter it
                bill_spec = get_pseudo_id(related['bill_id'])
                self.bill_importer.apply_transformers(bill_spec)
                repacked = _make_pseudo_id(**bill_spec)
                related['bill_id'] = self.bill_importer.resolve_json_id(
                    repacked, allow_no_match=True)
            elif 'vote_event_id' in related:
                resolve(self.vote_event_importer, related, 'vote_event_id')

    return data
def prepare_for_db(self, data):
    """Resolve the references inside a scraped event dict, in place.

    Sets 'jurisdiction_id', converts a truthy 'location' through
    ``self.get_location`` (a falsy one is left untouched) and defaults
    'end_date' to an empty string.  For every participant and every agenda
    item's related entity, the first id key present is resolved through the
    matching importer with ``allow_no_match=True``.  Returns ``data``.
    """
    participant_keys = ('person_id', 'organization_id')
    entity_keys = participant_keys + ('bill_id', 'vote_event_id')
    importer_attrs = {
        'person_id': 'person_importer',
        'organization_id': 'org_importer',
        'vote_event_id': 'vote_event_importer',
    }

    def resolve_first(record, candidate_keys):
        # only the first matching key is handled, mirroring an if/elif chain
        key = next((k for k in candidate_keys if k in record), None)
        if key is None:
            return
        if key == 'bill_id':
            # unpack and repack the bill pseudo id in case the transformers
            # alter it
            bill_spec = get_pseudo_id(record['bill_id'])
            self.bill_importer.apply_transformers(bill_spec)
            repacked = _make_pseudo_id(**bill_spec)
            record['bill_id'] = self.bill_importer.resolve_json_id(
                repacked, allow_no_match=True)
            return
        importer = getattr(self, importer_attrs[key])
        record[key] = importer.resolve_json_id(record[key],
                                               allow_no_match=True)

    data['jurisdiction_id'] = self.jurisdiction_id
    if data['location']:
        data['location'] = self.get_location(data['location'])
    # no-op re-assignment kept on purpose: it still raises KeyError when
    # 'start_date' is missing
    data['start_date'] = data['start_date']
    data['end_date'] = data.get('end_date', "")

    for participant in data['participants']:
        resolve_first(participant, participant_keys)

    for agenda_item in data['agenda']:
        for related in agenda_item['related_entities']:
            resolve_first(related, entity_keys)

    return data
def prepare_for_db(self, data):
    """Turn a scraped vote event dict into one keyed by database ids.

    Works on ``data`` in place and returns it.  Session, organization and
    bill references are popped and resolved to '*_id' keys.  When a
    'bill_action' description is given, the single ``BillAction`` matching
    bill, description, start date and organization is attached as
    'bill_action_id' unless another vote event already claimed it; every
    failure to attach is reported with ``self.warning``.  Voter references
    are resolved with ``allow_no_match=True``.
    """
    data['legislative_session_id'] = self.get_session_id(
        data.pop('legislative_session'))
    data['organization_id'] = self.org_importer.resolve_json_id(
        data.pop('organization'))

    bill = data.pop('bill')
    is_pseudo_id = bool(bill) and bill.startswith('~')
    if is_pseudo_id:
        # unpack the pseudo id and apply the transformers in case any of
        # them alter it
        unpacked = get_pseudo_id(bill)
        self.bill_importer.apply_transformers(unpacked)
        bill = _make_pseudo_id(**unpacked)
    data['bill_id'] = self.bill_importer.resolve_json_id(bill)

    description = data.pop('bill_action')
    if description:
        try:
            lookup = {
                'bill_id': data['bill_id'],
                'description': description,
                'date': data['start_date'],
                'organization_id': data['organization_id'],
            }
            action = BillAction.objects.get(**lookup)
            # seen_action_ids is for actions being added in this import;
            # action.vote is already set if it was set on a prior import
            if action.id in self.seen_action_ids:
                available = False
            else:
                available = not hasattr(action, 'vote')
            if available:
                data['bill_action_id'] = action.id
                self.seen_action_ids.add(action.id)
            else:
                self.warning('can not match two VoteEvents to %s: %s',
                             action.id, description)
        except BillAction.DoesNotExist:
            self.warning('could not match VoteEvent to %s %s %s',
                         bill, description, data['start_date'])
        except BillAction.MultipleObjectsReturned as e:
            self.warning('could not match VoteEvent to %s %s %s: %s',
                         bill, description, data['start_date'], e)

    resolve_voter = self.person_importer.resolve_json_id
    for vote in data['votes']:
        vote['voter_id'] = resolve_voter(vote['voter_id'],
                                         allow_no_match=True)
    return data
def prepare_for_db(self, data):
    """Replace the scraped references in a vote event dict with database ids.

    ``data`` is modified in place and returned.  The 'legislative_session',
    'organization', 'bill' and 'bill_action' keys are popped and replaced by
    'legislative_session_id', 'organization_id', 'bill_id' and, when exactly
    one unclaimed ``BillAction`` matches, 'bill_action_id'.  A bill given as
    a pseudo id (leading '~') has its identifier normalized with
    ``fix_bill_id`` before it is resolved.  Every vote's 'voter_id' is
    resolved with ``allow_no_match=True``.

    An action counts as claimed when it was matched earlier in this import
    (``self.seen_action_ids``) or when it already carries a ``vote`` from a
    prior import; in both cases a warning is emitted instead of attaching a
    second vote event to it.
    """
    data['legislative_session_id'] = self.get_session_id(
        data.pop('legislative_session'))
    data['organization_id'] = self.org_importer.resolve_json_id(
        data.pop('organization'))

    bill = data.pop('bill')
    if bill and bill.startswith('~'):
        # unpack the pseudo id, normalize the identifier, and repack it
        bill = get_pseudo_id(bill)
        bill['identifier'] = fix_bill_id(bill['identifier'])
        bill = _make_pseudo_id(**bill)
    data['bill_id'] = self.bill_importer.resolve_json_id(bill)

    bill_action = data.pop('bill_action')
    if bill_action:
        try:
            action = BillAction.objects.get(
                bill_id=data['bill_id'],
                description=bill_action,
                date=data['start_date'],
                organization_id=data['organization_id'],
            )
            # seen_action_ids is for actions being added in this import;
            # action.vote is already set if the action was claimed on a
            # prior import, so it must not be matched a second time
            if action.id in self.seen_action_ids or hasattr(action, 'vote'):
                self.warning('can not match two VoteEvents to %s: %s',
                             action.id, bill_action)
            else:
                data['bill_action_id'] = action.id
                self.seen_action_ids.add(action.id)
        except BillAction.DoesNotExist:
            self.warning('could not match VoteEvent to %s %s %s',
                         bill, bill_action, data['start_date'])
        except BillAction.MultipleObjectsReturned as e:
            self.warning('could not match VoteEvent to %s %s %s: %s',
                         bill, bill_action, data['start_date'], e)

    for vote in data['votes']:
        vote['voter_id'] = self.person_importer.resolve_json_id(
            vote['voter_id'], allow_no_match=True)
    return data
def prepare_for_db(self, data):
    """Resolve the references inside a scraped event dict, in place.

    Sets 'jurisdiction_id', replaces 'location' with the result of
    ``self.get_location`` and defaults 'end_date' to an empty string.  Each
    participant and each agenda item's related entity has the first id key
    it carries resolved through the matching importer with
    ``allow_no_match=True``; bill pseudo ids get their identifier normalized
    with ``fix_bill_id`` first.  Returns the same ``data`` dict.
    """
    data['jurisdiction_id'] = self.jurisdiction_id
    data['location'] = self.get_location(data['location'])
    # no-op re-assignment kept on purpose: it still raises KeyError when
    # 'start_date' is missing
    data['start_date'] = data['start_date']
    data['end_date'] = data.get('end_date', "")

    for attendee in data['participants']:
        if 'person_id' in attendee:
            attendee['person_id'] = self.person_importer.resolve_json_id(
                attendee['person_id'], allow_no_match=True)
        elif 'organization_id' in attendee:
            attendee['organization_id'] = self.org_importer.resolve_json_id(
                attendee['organization_id'], allow_no_match=True)

    # walk the related entities of all agenda items as one lazy stream
    related_entities = (related
                        for agenda_item in data['agenda']
                        for related in agenda_item['related_entities'])
    for related in related_entities:
        if 'person_id' in related:
            related['person_id'] = self.person_importer.resolve_json_id(
                related['person_id'], allow_no_match=True)
        elif 'organization_id' in related:
            related['organization_id'] = self.org_importer.resolve_json_id(
                related['organization_id'], allow_no_match=True)
        elif 'bill_id' in related:
            # unpack the pseudo id, normalize its identifier, and repack it
            bill_spec = get_pseudo_id(related['bill_id'])
            bill_spec['identifier'] = fix_bill_id(bill_spec['identifier'])
            repacked = _make_pseudo_id(**bill_spec)
            related['bill_id'] = self.bill_importer.resolve_json_id(
                repacked, allow_no_match=True)
        elif 'vote_event_id' in related:
            related['vote_event_id'] = self.vote_event_importer.resolve_json_id(
                related['vote_event_id'], allow_no_match=True)

    return data
def scrape(self, start_time=None):
    """Yield an Event for each finalized congressional hearing package.

    :param start_time: passed to ``self.congressional_hearings``; defaults
        to 2017-01-01 00:00 UTC.

    For every hearing the MODS document is downloaded and parsed, errata
    granules and packages without a meeting type or a title are skipped,
    and committee hosts, links and sources are attached to a new Event.
    Events are de-duplicated through ``self._unique_event`` and enriched by
    ``self._house_docs`` before being yielded.  Package ids that do not end
    in a single run of digits are collected and written to ``bad_ids.txt``
    in the working directory once all events have been yielded.
    """
    if start_time is None:
        start_time = datetime.datetime(2017, 1, 1, 0, 0, tzinfo=pytz.utc)

    dupes = {}
    uniq = {}
    bad_ids = []

    for hearing in self.congressional_hearings(start_time):
        package_id = hearing['packageId']

        try:
            # raw string: '\d' is an invalid escape in a plain literal
            package_num, = re.findall(r'\d+$', package_id)
        except ValueError:
            bad_ids.append(package_id)
            continue

        # For appropriations hearings, the committees tend to
        # publish portions of the hearings as they are completed,
        # and then the final hearing are usually compiled,
        # printed, and added to the repository at the request of
        # the Committee.
        #
        # packages with 8 digits after hrg are the in-process
        # version
        #
        # There could be some time between the in-process and
        # final packages. Publication of hearings is the purview
        # of the committee.
        #
        # https://github.com/usgpo/api/issues/21#issuecomment-435926223
        if len(package_num) == 8:
            continue

        mods_link = hearing['download']['modsLink']
        response = self.get(mods_link)
        mods = xmltodict.parse(response.content)
        extension = collections.ChainMap(*mods['mods']['extension'])

        granule_class = extension.get('granuleClass', 'boo')
        if granule_class == 'ERRATA':
            continue

        meeting_type = self._meeting_type(extension)
        if meeting_type is None:
            continue

        # a hearing held over several days lists every date; use the first
        held_date = extension['heldDate']
        if isinstance(held_date, list):
            start_date = min(held_date)
        else:
            start_date = held_date

        event = Event(name=self._title(mods),
                      start_date=start_date,
                      classification=meeting_type,
                      location_name='unknown')
        if not event.name:
            continue

        if 'number' in extension:
            hearing_number = '{docClass} {congress}-{number}'.format(
                **extension)
            event.extras['hearing_number'] = hearing_number

        for committee_d in self._unique(extension.get('congCommittee', [])):
            names = committee_d['name']
            committee_name = self._name_type(names, 'authority-standard')
            if committee_name is None:
                committee_name = self._name_type(names, 'authority-short')

            if committee_d['@chamber'] == 'H':
                committee_name = 'House ' + committee_name
            elif committee_d['@chamber'] == 'S':
                committee_name = 'Senate ' + committee_name

            try:
                thomas_id = committee_d['@authorityId'].upper()
            except KeyError:
                thomas_id = None

            sub_committees = self._subcommittees(committee_d)
            if sub_committees:
                # subcommittees host instead of their parent committee
                for sub_committee_d in sub_committees:
                    sub_committee_name = sub_committee_d['name']['#text']
                    sub_committee_name = sub_committee_name.strip(
                        string.punctuation)
                    sub_committee_id = _make_pseudo_id(
                        name=sub_committee_name,
                        parent__identifiers__identifier=thomas_id)
                    event.participants.append({
                        "name": sub_committee_name,
                        "entity_type": 'organization',
                        "note": 'host',
                        "organization_id": sub_committee_id,
                    })
            elif thomas_id:
                event.participants.append({
                    "name": committee_name,
                    "entity_type": 'organization',
                    "note": 'host',
                    "organization_id": _make_pseudo_id(
                        identifiers__identifier=thomas_id),
                })
            else:
                event.add_committee(committee_name, note='host')

        links = mods['mods']['location']['url']
        for link in self._unique(links):
            label = link['@displayLabel']
            if label == 'Content Detail':
                event.add_source(link['#text'], note='web')
            elif label == 'HTML rendition':
                event.add_document('transcript', link['#text'],
                                   media_type='text/html')
            elif label == 'PDF rendition':
                event.add_document('transcript', link['#text'],
                                   media_type='application/pdf')

        event.add_source(mods_link, note='API')
        self._unique_event(uniq, event, dupes)

    self._house_docs(uniq)

    yield from uniq.values()

    with open('bad_ids.txt', 'w') as f:
        f.writelines(bad_id + '\n' for bad_id in bad_ids)
def _house_docs(self, uniq):
    """Merge hearing records from the ``HouseCommittee`` scraper into ``uniq``.

    :param uniq: dict of events keyed by ``(upper-cased title, start date)``;
        it is updated in place.

    A record whose key already exists updates that event: its location is
    replaced, the record link is added as a source and
    ``self._add_house_docs`` attaches the documents.  Any other record
    becomes a new Event, hosted by the subcommittees and committees named
    in the XML, and is stored under its key.
    """
    house_scraper = HouseCommittee(
        cache_storage=self.cache_storage,
        requests_per_minute=self.requests_per_minute)

    for link, hearing_xml in house_scraper.scrape():
        meeting_title, = hearing_xml.xpath('//meeting-title/text()')
        start_date, = hearing_xml.xpath(
            '//meeting-date/calendar-date/text()')

        try:
            room, = hearing_xml.xpath('//room/text()')
        except ValueError:
            # zero or several rooms listed
            location = 'unknown'
        else:
            location = '{} {}'.format(
                hearing_xml.xpath('//building/text()')[0], room)

        meeting_title = meeting_title.upper()
        key = (meeting_title, start_date)

        event = uniq.get(key)
        if event is not None:
            event.location = {
                "name": location,
                "note": "",
                "coordinates": None,
            }
            event.add_source(link, note='docs.house.gov XML')
            self._add_house_docs(event, hearing_xml)
            continue

        # the event name is capped at 1000 characters; the key keeps the
        # full title
        event = Event(name=meeting_title[:1000],
                      start_date=start_date,
                      location_name=location)
        event.add_source(link, note='docs.house.gov XML')

        for sub_committee in hearing_xml.xpath(
                '//subcommittees/committee-name'):
            name, = sub_committee.xpath('.//text()')
            thomas_id = sub_committee.attrib['parent-id']
            event.participants.append({
                "name": name,
                "entity_type": 'organization',
                "note": 'host',
                "organization_id": _make_pseudo_id(
                    name=name,
                    parent__identifiers__identifier=thomas_id),
            })

        for committee in hearing_xml.xpath('//committees/committee-name'):
            name, = committee.xpath('.//text()')
            thomas_id = committee.attrib['id']
            event.participants.append({
                "name": name,
                "entity_type": 'organization',
                "note": 'host',
                "organization_id": _make_pseudo_id(
                    identifiers__identifier=thomas_id),
            })

        self._add_house_docs(event, hearing_xml)
        uniq[key] = event
def scrape(self, window=28, matter_ids=None):
    """Yield a Bill, and a VoteEvent per voted action, for board reports.

    By default, scrape board reports updated in the last 28 days. Optionally
    specify a larger or smaller window of time from which to scrape updates,
    or specific matters to scrape. Note that passing a value for
    :matter_ids supersedes the value of :window, such that the given matters
    will be scraped regardless of when they were updated.

    Optional parameters
    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape

    Matters flagged 'MatterRestrictViewViaWeb', or lacking an intro date,
    title or file number, are skipped.
    """
    if matter_ids:
        fetched = [self.matter(matter_id)
                   for matter_id in matter_ids.split(',')]
        # Skip matters that are not yet in Legistar
        matters = [matter for matter in fetched if matter]
    elif float(window):
        # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()

    for matter in matters:
        # If this Boolean field is True, then do not scrape the Bill.
        # This issue explains why a restricted Bill might appear (unwelcome)
        # in the Legistar API:
        # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826
        if matter['MatterRestrictViewViaWeb']:
            continue

        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # an identifier starting with 'S' is stored without that prefix; the
        # original form is kept as an alternate identifier
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for alternate_identifier in alternate_identifiers:
            bill.add_identifier(alternate_identifier)

        for action, vote_info in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote_info
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for ballot in votes:
                    raw_option = ballot['VoteValueName'].lower()
                    # fall back to the raw value for unmapped options
                    clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                         raw_option)
                    vote_event.vote(clean_option,
                                    ballot['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill. Then, we can
                # find the 'MatterFile' (i.e., identifier) and the
                # 'MatterIntroDate' (i.e., to determine its legislative
                # session). Sometimes, the related bill does not yet exist:
                # in this case the request fails and the relation is
                # skipped.
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue

            related_date = related_bill['MatterIntroDate']
            related_bill_session = self.session(self.toTime(related_date))
            related_identifier = related_bill['MatterFile']
            # Currently, the relation type for bills can be one of a few
            # possibilities:
            # https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
            # Metro simply understands these as related files, suggesting
            # that they receive a relation of 'companion'.
            bill.add_related_bill(
                identifier=related_identifier,
                legislative_session=related_bill_session,
                relation_type='companion')

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                # strip NUL characters from the RTF text
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape(self, window=3):
    """Yield a Bill, and a VoteEvent per voted action, for recent matters.

    :param window: number of days (fractions allowed) to look back; the
        cutoff handed to ``self.matters`` is that long before now.  Defaults
        to 3, the value that used to be hard-coded.

    Matters lacking an intro date, title or file number are skipped.
    """
    # NOTE(review): the cutoff uses local time (datetime.now) whereas the
    # other scrapers in this file use utcnow -- confirm which one the
    # Legistar API expects before changing it.
    cutoff = datetime.datetime.now() - datetime.timedelta(float(window))

    for matter in self.matters(cutoff):
        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # an identifier starting with 'S' is stored without that prefix; the
        # original form is kept as an alternate identifier
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        legistar_web = self.legislation_detail_url(matter_id)
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for alternate_identifier in alternate_identifiers:
            bill.add_identifier(alternate_identifier)

        for action, vote_info in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote_info
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for ballot in votes:
                    raw_option = ballot['VoteValueName'].lower()
                    # fall back to the raw value for unmapped options
                    clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                         raw_option)
                    vote_event.vote(clean_option,
                                    ballot['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                # strip NUL characters from the RTF text
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape(self, window=28):
    """Yield a Bill, and a VoteEvent per voted action, for recent matters.

    :param window: number of days (fractions allowed) to look back from the
        current UTC time; the resulting cutoff is handed to
        ``self.matters``.

    Matters lacking an intro date, title or file number are skipped.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    for matter in self.matters(cutoff):
        matter_id = matter['MatterId']
        intro_date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        matter_file = matter['MatterFile']

        if not (intro_date and title and matter_file):
            continue

        session = self.session(self.toTime(intro_date))
        classification = BILL_TYPES[matter['MatterTypeName']]

        # an identifier starting with 'S' is stored without that prefix; the
        # original form is kept as an alternate identifier
        alternates = []
        bill_identifier = matter_file
        if matter_file.startswith('S'):
            alternates.append(matter_file)
            bill_identifier = matter_file[1:]

        bill = Bill(identifier=bill_identifier,
                    legislative_session=session,
                    title=title,
                    classification=classification,
                    from_organization={"name": "Board of Directors"})

        web_url = matter['legistar_url']
        api_url = self.BASE_URL + '/matters/{0}'.format(matter_id)
        bill.add_source(web_url, note='web')
        bill.add_source(api_url, note='api')

        for alternate in alternates:
            bill.add_identifier(alternate)

        for action, vote_info in self.actions(matter_id):
            recorded = bill.add_action(**action)

            if action['description'] == 'Referred':
                referred_to = matter['MatterBodyName']
                recorded.add_related_entity(
                    referred_to,
                    'organization',
                    entity_id=_make_pseudo_id(name=referred_to))

            result, ballots = vote_info
            if not result:
                continue

            vote_event = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action['description'],
                organization=action['organization'],
                classification=None,
                start_date=action['date'],
                result=result,
                bill=bill)
            vote_event.add_source(web_url)
            vote_event.add_source(api_url + '/histories')

            for ballot in ballots:
                raw_option = ballot['VoteValueName'].lower()
                # fall back to the raw value for unmapped options
                option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                vote_event.vote(option, ballot['VotePersonName'].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Fetch the related matter so its 'MatterFile' (identifier)
                # and 'MatterIntroDate' (legislative session) can be read.
                # Sometimes the related bill does not exist yet: the request
                # then fails and the relation is skipped.
                related = self.endpoint('/matters/{0}',
                                        relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue

            related_session = self.session(
                self.toTime(related['MatterIntroDate']))
            # The relation type for bills can be one of a few possibilities:
            # https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
            # Metro simply understands these as related files, suggesting
            # that they receive a relation of 'companion'.
            bill.add_related_bill(identifier=related['MatterFile'],
                                  legislative_session=related_session,
                                  relation_type='companion')

        report_url = 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id)
        bill.add_version_link('Board Report',
                              report_url,
                              media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if not attachment['MatterAttachmentName']:
                continue
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                # strip NUL characters from the RTF text
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape(self, window=3):
    """Yield a Bill, and a VoteEvent per voted action, for recent matters.

    :param window: number of days (fractions allowed) to look back from the
        current UTC time; the resulting cutoff is handed to
        ``self.matters``.

    Matters listed in ``problem_bills`` and matters lacking an intro date,
    title or file number are skipped.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    # There are currently no bills with duplicate action items! If a
    # bill has a duplicate action item that's causing the entire scrape
    # to fail, add it to the `problem_bills` array to skip it.
    problem_bills = []

    for matter in self.matters(cutoff):
        matter_id = matter['MatterId']
        intro_date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        matter_file = matter['MatterFile']

        if matter_file in problem_bills:
            continue
        if not (intro_date and title and matter_file):
            continue

        session = self.session(self.toTime(intro_date))
        classification = BILL_TYPES[matter['MatterTypeName']]

        # an identifier starting with 'S' is stored without that prefix; the
        # original form is kept as an alternate identifier
        alternates = []
        bill_identifier = matter_file
        if matter_file.startswith('S'):
            alternates.append(matter_file)
            bill_identifier = matter_file[1:]

        bill = Bill(identifier=bill_identifier,
                    legislative_session=session,
                    title=title,
                    classification=classification,
                    from_organization={"name": "Chicago City Council"})

        web_url = matter['legistar_url']
        api_url = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
            matter_id)
        bill.add_source(web_url, note='web')
        bill.add_source(api_url, note='api')

        for alternate in alternates:
            bill.add_identifier(alternate)

        for action, vote_info in self.actions(matter_id):
            # 'responsible person' is not an add_action argument, so it is
            # removed before the action dict is splatted
            responsible = action.pop('responsible person')
            recorded = bill.add_action(**action)

            if responsible:
                recorded.add_related_entity(
                    responsible,
                    'person',
                    entity_id=_make_pseudo_id(name=responsible))

            if action['description'] == 'Referred':
                referred_to = matter['MatterBodyName']
                if referred_to != 'City Council':
                    recorded.add_related_entity(
                        referred_to,
                        'organization',
                        entity_id=_make_pseudo_id(name=referred_to))

            result, ballots = vote_info
            if not result:
                continue

            vote_event = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action['description'],
                organization=action['organization'],
                classification=None,
                start_date=action['date'],
                result=result,
                bill=bill)
            vote_event.add_source(web_url)
            vote_event.add_source(api_url + '/histories')

            for ballot in ballots:
                raw_option = ballot['VoteValueName'].lower()
                # fall back to the raw value for unmapped options
                option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                vote_event.vote(option, ballot['VotePersonName'].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            if not attachment['MatterAttachmentName']:
                continue
            bill.add_version_link(attachment['MatterAttachmentName'],
                                  attachment['MatterAttachmentHyperlink'],
                                  media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                # strip NUL characters from the RTF text
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape(self, window=3):
    """Yield a Bill, and a VoteEvent per voted action, for recent matters.

    :param window: number of days (fractions allowed) to look back from the
        current UTC time; the resulting cutoff is handed to
        ``self.matters``.

    Matters listed in ``problem_bills`` and matters lacking an intro date,
    title or file number are skipped.
    """
    # Temporarily, we should not scrape or import these bills:
    # https://chicago.legistar.com/LegislationDetail.aspx?ID=3291304&GUID=72ACF5FE-0803-46E8-90B4-604119803293
    # They have duplicate action items, which cause the entire scrape
    # to fail. The Chicago clerk's office should fix it in the near
    # future, after which we can remove this code.
    problem_bills = ['CL2017-1281']

    since = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    for matter in self.matters(since):
        matter_id = matter['MatterId']
        introduced = matter['MatterIntroDate']
        title = matter['MatterTitle']
        file_number = matter['MatterFile']

        skip = file_number in problem_bills
        if skip or not all((introduced, title, file_number)):
            continue

        bill_session = self.session(self.toTime(introduced))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # an identifier starting with 'S' is stored without that prefix; the
        # original form is kept as an alternate identifier
        has_prefix = file_number.startswith('S')
        extra_identifiers = [file_number] if has_prefix else []
        primary_identifier = file_number[1:] if has_prefix else file_number

        bill = Bill(identifier=primary_identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        legistar_web = matter['legistar_url']
        legistar_api = (
            'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
                matter_id))

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for extra_identifier in extra_identifiers:
            bill.add_identifier(extra_identifier)

        for action, outcome in self.actions(matter_id):
            # 'responsible person' is not an add_action argument, so it is
            # removed before the action dict is splatted
            person_name = action.pop('responsible person')
            bill_action = bill.add_action(**action)

            if person_name:
                bill_action.add_related_entity(
                    person_name,
                    'person',
                    entity_id=_make_pseudo_id(name=person_name))

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                if body_name != 'City Council':
                    bill_action.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

            result, member_votes = outcome
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for member_vote in member_votes:
                    raw_option = member_vote['VoteValueName'].lower()
                    # fall back to the raw value for unmapped options
                    vote_event.vote(
                        self.VOTE_OPTIONS.get(raw_option, raw_option),
                        member_vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_version_link(
                    attachment_name,
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                # strip NUL characters from the RTF text
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def extract_actions(self, bill, doc, current_chamber):
    """
    Extract the actions taken on a bill and add them to ``bill``.

    A bill can have actions taken from either chamber. The current
    chamber's actions will be the first table of actions. The other
    chamber's actions will be in the second table.

    ``doc`` must expose ``xpath``; every ``//table[@class="actions"]`` is
    scanned row by row.  ``current_chamber`` is ``'upper'`` or ``'lower'``
    and is toggled after each table.  Rows whose date cannot be parsed are
    reported through ``self.warning`` and skipped.  Returns ``bill``.
    """
    # Labels that appear in the action column but are not actions
    # (they don't have a date).
    non_actions = ('Chapter number', 'See also', 'See',
                   'Effective date', 'Secretary of State')

    bill_actions = []
    for cur_table in doc.xpath('//table[@class="actions"]'):
        for row in cur_table.xpath('.//tr'):
            bill_action = {}

            # Split up columns
            date_col, the_rest = row.xpath('td')

            # The second column can hold a link to full text
            # and pages (what should be in another column),
            # but also links to committee elements or other spanned
            # content.
            raw_date = date_col.text_content().strip()
            # ``.text`` is None when the cell starts with a child element;
            # treat that as empty text instead of raising AttributeError.
            action_text = (the_rest.text or '').strip()
            committee = the_rest.xpath(
                "a[contains(@href,'committee')]/text()")
            extra = ''.join(
                the_rest.xpath('span[not(@style)]/text() | a/text()'))

            if action_text in non_actions:
                continue

            # Dates are really inconsistent here: try the date column
            # first, then the extra text with 2- and 4-digit years.
            action_date = None
            for candidate, fmt in ((raw_date, '%m/%d/%Y'),
                                   (extra, '%m/%d/%y'),
                                   (extra, '%m/%d/%Y')):
                try:
                    action_date = datetime.datetime.strptime(
                        candidate, fmt).date()
                except ValueError:
                    continue
                break
            if action_date is None:
                self.warning('ACTION without date: %s' % action_text)
                continue

            # Categorize the action: the first matching pattern wins.
            action_type = None
            for pattern, atype in self._categorizers:
                if re.match(pattern, action_text):
                    action_type = atype
                    # Guard against a categorizer that maps to None.
                    if (atype is not None
                            and 'referral-committee' in atype
                            and committee):
                        bill_action['committees'] = committee[0]
                    break

            if extra:
                # When the cell had no leading text, the extra text is the
                # whole description (avoids a stray leading space).
                action_text = (action_text + ' ' + extra
                               if action_text else extra)
            bill_action['action_text'] = action_text

            # Any 'governor...' classification marks an executive action.
            if isinstance(action_type, list):
                candidates = action_type
            else:
                candidates = [action_type]
            is_executive = any(
                atype is not None and atype.startswith('governor')
                for atype in candidates)
            bill_action['action_chamber'] = (
                'executive' if is_executive else current_chamber)

            bill_action['action_date'] = action_date
            bill_action['action_type'] = action_type
            bill_actions.append(bill_action)

        # if there's a second table, toggle the current chamber
        current_chamber = 'lower' if current_chamber == 'upper' else 'upper'

    # Add actions to bill
    for action in bill_actions:
        act = bill.add_action(action['action_text'],
                              action['action_date'],
                              chamber=action['action_chamber'],
                              classification=action['action_type'])
        if 'committees' in action:
            committee = action['committees']
            act.add_related_entity(
                committee,
                'organization',
                entity_id=_make_pseudo_id(name=committee))

    return bill
def scrape(self):
    """Yield vote events and bills scraped from the Chicago Legistar site.

    Iterates ``self.legislation`` for records created after 2015-05-17.
    Records without a title or intro date are skipped.  When the detail
    page cannot be parsed (``self.legDetails`` raises ``IndexError``) the
    bare bill is yielded and its URL is remembered; the collected URLs are
    printed once the scrape finishes.  Otherwise each ``VoteEvent`` found
    in the history is yielded, then the fully populated ``Bill``.
    """
    unreachable_urls = []

    for leg_summary in self.legislation(
            created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()

        # Some records have no intro date, e.g.
        # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
        if not title or not leg_summary['Intro\xa0Date']:
            continue

        bill_type = BILL_TYPES[leg_summary['Type']]
        bill_session = self.session(
            self.toTime(leg_summary['Intro\xa0Date']))

        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})
        bill.add_source(leg_summary['url'])

        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue

        # For now we only record related bills when the title contains
        # words that make us think it is an omnibus.  The check is done on
        # the lower-cased title so "Sundry"/"Miscellaneous" match too.
        lower_title = title.lower()
        if "sundry" in lower_title or "miscellaneous" in lower_title:
            for related_bill in leg_details.get('Related files', []):
                bill.add_related_bill(
                    identifier=related_bill['label'],
                    legislative_session=bill.legislative_session,
                    relation_type='replaces')

        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            # The first listed sponsor is treated as the primary one.
            primary = (i == 0)
            sponsorship_type = "Primary" if primary else "Regular"

            sponsor_name = sponsor['label']

            # Does the Mayor/Clerk introduce legislation as
            # individual role holders or as the Office of City
            # Clerk and the Office of the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm', )):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'

            if not sponsor_name.startswith(
                    ('Misc. Transmittal', 'No Sponsor', 'Dept./Agency')):
                bill.add_sponsorship(
                    sponsor_name,
                    sponsorship_type,
                    entity_type,
                    primary,
                    entity_id=_make_pseudo_id(name=sponsor_name))

        if 'Topic' in leg_details:
            for subject in leg_details[u'Topic'].split(','):
                # Splitting on ',' leaves padding around each topic.
                subject = subject.strip()
                if subject:
                    bill.add_subject(subject)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")

        for action in self.history(leg_summary['url']):
            action_description = action['Action']
            try:
                action_date = self.toTime(
                    action['Date']).date().isoformat()
            except AttributeError:
                # Some history rows have no usable date, e.g.
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue

            if not action_description:
                continue

            # 'Action By' is either a mapping with a 'label' or a plain
            # value; indexing the latter raises TypeError.
            try:
                responsible_org = action['Action\xa0By']['label']
            except TypeError:
                responsible_org = action['Action\xa0By']
            if responsible_org == 'City Council':
                responsible_org = 'Chicago City Council'

            act = bill.add_action(
                action_description,
                action_date,
                organization={'name': responsible_org},
                classification=ACTION_CLASSIFICATION[action_description])

            if action_description == 'Referred':
                controlling = leg_details[
                    'Current Controlling Legislative Body']
                # A single body supports ['label']; anything else raises
                # TypeError and is used as the collection itself.
                try:
                    controlling['label']
                    controlling_bodies = [controlling]
                except TypeError:
                    controlling_bodies = controlling

                if controlling_bodies:
                    for controlling_body in controlling_bodies:
                        body_name = controlling_body['label']
                        if body_name.startswith("Joint Committee"):
                            act.add_related_entity(body_name,
                                                   'organization')
                        else:
                            act.add_related_entity(
                                body_name,
                                'organization',
                                entity_id=_make_pseudo_id(name=body_name))

            if 'url' in action['Action\xa0Details']:
                action_detail_url = action['Action\xa0Details']['url']
                result, votes = self.extractVotes(action_detail_url)

                # see https://github.com/datamade/municipal-scrapers-us/issues/15
                if votes and result:
                    action_vote = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action_description,
                        organization={'name': responsible_org},
                        classification=None,
                        start_date=action_date,
                        result=result,
                        bill=bill)
                    action_vote.add_source(action_detail_url)

                    for option, voter in votes:
                        action_vote.vote(option, voter)

                    yield action_vote

        bill.extras = {'local_classification': leg_summary['Type']}

        yield bill

    print(unreachable_urls)
def scrape(self):
    """Yield vote events and bills for Chicago matters from the last 3 days.

    The cutoff is the current local time minus three days and is passed to
    ``self.matters``.  Matters without an intro date or title are skipped.
    Vote events found in a matter's actions are yielded before its bill.
    """
    cutoff = datetime.datetime.now() - datetime.timedelta(3)

    for matter in self.matters(cutoff):
        matter_id = matter['MatterId']
        intro_date = matter['MatterIntroDate']
        title = matter['MatterTitle']

        if not (intro_date and title):
            continue

        session = self.session(self.toTime(intro_date))
        classification = BILL_TYPES[matter['MatterTypeName']]

        bill = Bill(
            identifier=matter['MatterFile'],
            legislative_session=session,
            title=title,
            classification=classification,
            from_organization={"name": "Chicago City Council"},
        )

        web_url = self.legislation_detail_url(matter_id)
        api_url = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
            matter_id)
        bill.add_source(web_url, note='web')
        bill.add_source(api_url, note='api')

        for action, vote_data in self.actions(matter_id):
            recorded = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                if body_name != 'City Council':
                    recorded.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

            result, ballots = vote_data
            if not result:
                continue

            vote_event = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action['description'],
                organization=action['organization'],
                classification=None,
                start_date=action['date'],
                result=result,
                bill=bill,
            )
            vote_event.add_source(web_url)
            vote_event.add_source(api_url + '/histories')

            for ballot in ballots:
                raw_option = ballot['VoteValueName'].lower()
                # Unknown vote values fall through unchanged.
                option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                vote_event.vote(option, ballot['VotePersonName'].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_version_link(
                    attachment_name,
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            plain_text = text['MatterTextPlain']
            if plain_text:
                bill.extras['plain_text'] = plain_text
            rtf_text = text['MatterTextRtf']
            if rtf_text:
                # NUL characters are stripped from the RTF payload.
                bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

        yield bill
def scrape(self):
    """Yield vote events and bills scraped from the NYC Legistar site.

    Iterates ``self.legislation`` for records created after 2014-01-01.
    The legislative session is derived from the earliest history entry, or
    from ``self.SESSION_STARTS[0]`` when there is no history.  Vote events
    are yielded as they are found; the bill is yielded last.
    """
    for summary in self.legislation(
            created_after=datetime.datetime(2014, 1, 1)):
        page_url = summary['url']
        classification = BILL_TYPES[summary['Type']]

        # The session is unknown until the history has been read.
        bill = Bill(
            identifier=summary['File\xa0#'],
            title=summary['Title'],
            legislative_session=None,
            classification=classification,
            from_organization={"name": "New York City Council"},
        )
        bill.add_source(page_url, note='web')

        details = self.legDetails(page_url)
        history_rows = self.history(page_url)

        bill.add_title(details['Name'],
                       note='created by administrative staff')

        if 'Summary' in details:
            bill.add_abstract(details['Summary'], note='')

        law_number = details['Law number']
        if law_number:
            bill.add_identifier(law_number, note='law number')

        for sponsor, sponsorship_type, primary in self._sponsors(
                details.get('Sponsors', [])):
            bill.add_sponsorship(sponsor, sponsorship_type,
                                 'person', primary)

        for attachment in details.get('Attachments', []):
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")

        # Materialise the history: it is traversed twice below.
        history_rows = list(history_rows)

        if history_rows:
            first_action_time = min(
                self.toTime(row['Date']) for row in history_rows)
            bill.legislative_session = self.sessions(first_action_time)
        else:
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for row in history_rows:
            description = row['Action']
            if not description:
                continue

            action_class = ACTION_CLASSIFICATION[description]
            action_date = self.toDate(row['Date'])

            actor = row['Action\xa0By']
            if actor == 'City Council':
                actor = 'New York City Council'
            elif actor == 'Administration':
                actor = 'Mayor'

            # Town hall meetings are not recorded as actions.
            if actor == 'Town Hall Meeting':
                continue

            recorded = bill.add_action(description,
                                       action_date,
                                       organization={'name': actor},
                                       classification=action_class)

            if 'url' not in row['Action\xa0Details']:
                continue
            detail_url = row['Action\xa0Details']['url']

            if action_class == 'committee-referral':
                detail = self.actionDetails(detail_url)
                # The committee name is whatever follows the last
                # ' to the ' in the action text.
                committee = detail['Action text'].rsplit(' to the ', 1)[-1]
                recorded.add_related_entity(
                    committee,
                    'organization',
                    entity_id=_make_pseudo_id(name=committee))

            result, ballots = self.extractVotes(detail_url)
            if result and ballots:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=description,
                    organization={'name': actor},
                    classification=action_class,
                    start_date=action_date,
                    result=result,
                    bill=bill,
                )
                vote_event.add_source(detail_url, note='web')

                for option, voter in ballots:
                    vote_event.vote(option, voter)

                yield vote_event

        full_text = self.text(page_url)
        extras = {'local_classification': summary['Type']}
        if full_text:
            extras['full_text'] = full_text
        bill.extras = extras

        yield bill
def scrape(self, window=28, matter_ids=None):
    '''By default, scrape board reports updated in the last 28 days.
    Optionally specify a larger or smaller window of time from which to
    scrape updates, or specific matters to scrape.

    Note that passing a value for :matter_ids supersedes the value of
    :window, such that the given matters will be scraped regardless of
    when they were updated.

    Optional parameters
    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape

    Yields each ``VoteEvent`` found in a matter's actions, followed by the
    matter's ``Bill``.  Restricted matters are yielded early as a
    stripped-down bill.
    '''
    if matter_ids:
        matters = [
            self.matter(matter_id) for matter_id in matter_ids.split(',')
        ]
        # Skip matters that are not yet in Legistar
        matters = filter(None, matters)
    elif float(window):
        # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()

    for matter in matters:
        # Skip this bill, until Metro cleans up duplicate in Legistar API
        if matter['MatterFile'] == '2017-0447':
            continue

        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        # Evaluated once per matter and reused below.
        is_restricted = self._is_restricted(matter)

        # Do not scrape private bills introduced before this timestamp.
        if is_restricted and date < self.START_DATE_PRIVATE_SCRAPE:
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # File numbers starting with 'S' are stored without the prefix;
        # the prefixed form is kept as an additional identifier.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        # The Metro scraper scrapes private bills.
        # However, we do not want to capture significant data about
        # private bills, other than the value of the helper function
        # `_is_restricted` and a last modified timestamp.
        # We yield private bills early, wipe data from previously imported
        # once-public bills, and include only data *required* by the pupa
        # schema.
        # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
        bill.extras = {'restrict_view': is_restricted}

        # Add API source early.
        # Private bills should have this url for debugging.
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
        bill.add_source(legistar_api, note='api')

        if is_restricted:
            # required fields
            bill.title = 'Restricted View'

            # wipe old data
            bill.extras['plain_text'] = ''
            bill.extras['rtf_text'] = ''
            bill.sponsorships = []
            bill.related_bills = []
            bill.versions = []
            bill.documents = []
            bill.actions = []

            yield bill
            continue

        legistar_web = matter['legistar_url']
        bill.add_source(legistar_web, note='web')

        for alternate_identifier in alternate_identifiers:
            bill.add_identifier(alternate_identifier)

        for action, vote_data in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote_data
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    # A missing vote value has no .lower(); record None.
                    try:
                        raw_option = vote['VoteValueName'].lower()
                    except AttributeError:
                        raw_option = None
                    clean_option = self.VOTE_OPTIONS.get(
                        raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill.
                # Then, we can find the 'MatterFile' (i.e., identifier)
                # and the 'MatterIntroDate' (i.e., to determine its
                # legislative session).
                # Sometimes, the related bill does not yet exist: in this
                # case the request fails and the relation is skipped.
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                related_date = related_bill['MatterIntroDate']
                related_bill_session = self.session(
                    self.toTime(related_date))
                related_identifier = related_bill['MatterFile']
                # Currently, the relation type for bills can be one of a
                # few possibilities:
                # https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files,
                # suggesting that they receive a relation of 'companion'.
                bill.add_related_bill(
                    identifier=related_identifier,
                    legislative_session=related_bill_session,
                    relation_type='companion')

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'].strip(),
                    media_type="application/pdf")

        bill.extras['local_classification'] = matter['MatterTypeName']

        matter_version_value = matter['MatterVersion']
        text = self.text(matter_id, matter_version_value)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                # NUL characters are stripped from the RTF payload.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape(self):
    """Yield vote events and bills for Chicago matters from the last 3 days.

    The cutoff is the current local time minus three days and is passed to
    ``self.matters``.  Matters without an intro date, title or file number
    are skipped.  Vote events found in a matter's actions are yielded
    before its bill.
    """
    cutoff = datetime.datetime.now() - datetime.timedelta(3)

    for matter in self.matters(cutoff):
        matter_id = matter['MatterId']
        intro_date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        file_number = matter['MatterFile']

        if not (intro_date and title and file_number):
            continue

        session = self.session(self.toTime(intro_date))
        classification = BILL_TYPES[matter['MatterTypeName']]

        # File numbers starting with 'S' are stored without the prefix;
        # the prefixed form is kept as an additional identifier.
        alternates = []
        if file_number.startswith('S'):
            alternates.append(file_number)
            file_number = file_number[1:]

        bill = Bill(
            identifier=file_number,
            legislative_session=session,
            title=title,
            classification=classification,
            from_organization={"name": "Chicago City Council"},
        )

        web_url = self.legislation_detail_url(matter_id)
        api_url = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
            matter_id)
        bill.add_source(web_url, note='web')
        bill.add_source(api_url, note='api')

        for alternate in alternates:
            bill.add_identifier(alternate)

        for action, vote_data in self.actions(matter_id):
            recorded = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                if body_name != 'City Council':
                    recorded.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

            result, ballots = vote_data
            if not result:
                continue

            vote_event = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action['description'],
                organization=action['organization'],
                classification=None,
                start_date=action['date'],
                result=result,
                bill=bill,
            )
            vote_event.add_source(web_url)
            vote_event.add_source(api_url + '/histories')

            for ballot in ballots:
                raw_option = ballot['VoteValueName'].lower()
                # Unknown vote values fall through unchanged.
                option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                vote_event.vote(option, ballot['VotePersonName'].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_version_link(
                    attachment_name,
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            plain_text = text['MatterTextPlain']
            if plain_text:
                bill.extras['plain_text'] = plain_text
            rtf_text = text['MatterTextRtf']
            if rtf_text:
                # NUL characters are stripped from the RTF payload.
                bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

        yield bill
def scrape(self):
    """Yield vote events and bills scraped from the Chicago Legistar site.

    Iterates ``self.legislation`` for records created after 2015-05-17.
    Records without a title or intro date are skipped.  When the detail
    page cannot be parsed (``self.legDetails`` raises ``IndexError``) the
    bare bill is yielded and its URL is remembered; the collected URLs are
    printed once the scrape finishes.  Otherwise each ``VoteEvent`` found
    in the history is yielded, then the fully populated ``Bill``.
    """
    unreachable_urls = []

    for leg_summary in self.legislation(
            created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()

        # Some records have no intro date, e.g.
        # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
        if not title or not leg_summary['Intro\xa0Date']:
            continue

        bill_type = BILL_TYPES[leg_summary['Type']]
        bill_session = self.session(
            self.toTime(leg_summary['Intro\xa0Date']))

        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})
        bill.add_source(leg_summary['url'])

        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue

        # For now we only record related bills when the title contains
        # words that make us think it is an omnibus.  The check is done on
        # the lower-cased title so "Sundry"/"Miscellaneous" match too.
        lower_title = title.lower()
        if "sundry" in lower_title or "miscellaneous" in lower_title:
            for related_bill in leg_details.get('Related files', []):
                bill.add_related_bill(
                    identifier=related_bill['label'],
                    legislative_session=bill.legislative_session,
                    relation_type='replaces')

        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            # The first listed sponsor is treated as the primary one.
            primary = (i == 0)
            sponsorship_type = "Primary" if primary else "Regular"

            sponsor_name = sponsor['label']

            # Does the Mayor/Clerk introduce legislation as
            # individual role holders or as the Office of City
            # Clerk and the Office of the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm', )):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'

            if not sponsor_name.startswith(
                    ('Misc. Transmittal', 'No Sponsor', 'Dept./Agency')):
                bill.add_sponsorship(
                    sponsor_name,
                    sponsorship_type,
                    entity_type,
                    primary,
                    entity_id=_make_pseudo_id(name=sponsor_name))

        if 'Topic' in leg_details:
            for subject in leg_details[u'Topic'].split(','):
                # Splitting on ',' leaves padding around each topic.
                subject = subject.strip()
                if subject:
                    bill.add_subject(subject)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")

        for action in self.history(leg_summary['url']):
            action_description = action['Action']
            try:
                action_date = self.toTime(
                    action['Date']).date().isoformat()
            except AttributeError:
                # Some history rows have no usable date, e.g.
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue

            if not action_description:
                continue

            # 'Action By' is either a mapping with a 'label' or a plain
            # value; indexing the latter raises TypeError.
            try:
                responsible_org = action['Action\xa0By']['label']
            except TypeError:
                responsible_org = action['Action\xa0By']
            if responsible_org == 'City Council':
                responsible_org = 'Chicago City Council'

            act = bill.add_action(
                action_description,
                action_date,
                organization={'name': responsible_org},
                classification=ACTION_CLASSIFICATION[action_description])

            if action_description == 'Referred':
                controlling = leg_details[
                    'Current Controlling Legislative Body']
                # A single body supports ['label']; anything else raises
                # TypeError and is used as the collection itself.
                try:
                    controlling['label']
                    controlling_bodies = [controlling]
                except TypeError:
                    controlling_bodies = controlling

                if controlling_bodies:
                    for controlling_body in controlling_bodies:
                        body_name = controlling_body['label']
                        if body_name.startswith("Joint Committee"):
                            act.add_related_entity(body_name,
                                                   'organization')
                        else:
                            act.add_related_entity(
                                body_name,
                                'organization',
                                entity_id=_make_pseudo_id(name=body_name))

            if 'url' in action['Action\xa0Details']:
                action_detail_url = action['Action\xa0Details']['url']
                result, votes = self.extractVotes(action_detail_url)

                # see https://github.com/datamade/municipal-scrapers-us/issues/15
                if votes and result:
                    action_vote = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action_description,
                        organization={'name': responsible_org},
                        classification=None,
                        start_date=action_date,
                        result=result,
                        bill=bill)
                    action_vote.add_source(action_detail_url)

                    for option, voter in votes:
                        action_vote.vote(option, voter)

                    yield action_vote

        bill.extras = {'local_classification': leg_summary['Type']}

        yield bill

    print(unreachable_urls)
def scrape(self, window=30):
    """Yield vote events and bills for recently updated Pittsburgh matters.

    ``window`` is the look-back period in days.  It is converted with
    ``float`` and subtracted from the current UTC time; the resulting
    timestamp is handed to ``self.matters``.  ``self.retry_wait_seconds``
    is set to 20 before any matter is requested.

    Matters lacking an intro date, a title or a file number are skipped.
    Vote events found in a matter's actions are yielded before its bill.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    self.retry_wait_seconds = 20

    # If a bill has a duplicate action item that's causing the entire
    # scrape to fail, add it to this list to skip it.
    # For the time being...nothing to skip!
    skipped_files = []

    for matter in self.matters(cutoff):
        matter_id = matter["MatterId"]
        intro_date = matter["MatterIntroDate"]
        title = matter["MatterTitle"]
        file_number = matter["MatterFile"]

        if file_number in skipped_files:
            continue
        if not (intro_date and title and file_number):
            continue

        session = self.session(self.toTime(intro_date))

        # Matter types without a mapping are classified as None.
        classification = BILL_TYPES.get(matter["MatterTypeName"])

        # File numbers starting with 'S' are stored without the prefix;
        # the prefixed form is kept as an additional identifier.
        alternates = []
        if file_number.startswith("S"):
            alternates.append(file_number)
            file_number = file_number[1:]

        bill = Bill(
            identifier=file_number,
            legislative_session=session,
            title=title,
            classification=classification,
            from_organization={"name": "Pittsburgh City Council"},
        )

        web_url = matter["legistar_url"]
        api_url = "http://webapi.legistar.com/v1/pittsburgh/matters/{0}".format(
            matter_id)
        bill.add_source(web_url, note="web")
        bill.add_source(api_url, note="api")

        for alternate in alternates:
            bill.add_identifier(alternate)

        for action, vote_data in self.actions(matter_id):
            # 'responsible person' is removed from the dict before it is
            # expanded into add_action's keyword arguments.
            person = action.pop("responsible person")
            recorded = bill.add_action(**action)

            if person:
                recorded.add_related_entity(
                    person,
                    "person",
                    entity_id=_make_pseudo_id(name=person))

            if action["description"] == "Referred":
                body_name = matter["MatterBodyName"]
                if body_name != "City Council":
                    recorded.add_related_entity(
                        body_name,
                        "organization",
                        entity_id=_make_pseudo_id(name=body_name))

            result, ballots = vote_data
            if not result:
                continue

            vote_event = VoteEvent(
                legislative_session=bill.legislative_session,
                motion_text=action["description"],
                organization=action["organization"],
                classification=None,
                start_date=action["date"],
                result=result,
                bill=bill,
            )
            vote_event.add_source(web_url)
            vote_event.add_source(api_url + "/histories")

            for ballot in ballots:
                raw_option = ballot["VoteValueName"].lower()
                # Unknown vote values fall through unchanged.
                option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                vote_event.vote(option, ballot["VotePersonName"].strip())

            yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic["MatterIndexName"].strip())

        for attachment in self.attachments(matter_id):
            attachment_name = attachment["MatterAttachmentName"]
            if attachment_name:
                bill.add_version_link(
                    attachment_name,
                    attachment["MatterAttachmentHyperlink"],
                    media_type="application/pdf")

        bill.extras = {"local_classification": matter["MatterTypeName"]}

        text = self.text(matter_id)
        if text:
            plain_text = text["MatterTextPlain"]
            if plain_text:
                bill.extras["plain_text"] = plain_text
            rtf_text = text["MatterTextRtf"]
            if rtf_text:
                # NUL characters are stripped from the RTF payload.
                bill.extras["rtf_text"] = rtf_text.replace(u"\u0000", "")

        yield bill
def scrape(self):
    """Yield vote events and bills scraped from the NYC Legistar site.

    Iterates ``self.legislation`` for records created after 2014-01-01.
    The legislative session is derived from the earliest history entry, or
    from ``self.SESSION_STARTS[0]`` when there is no history.  A vote event
    is yielded for every action whose detail page lists votes; the bill is
    yielded last.
    """
    for summary in self.legislation(
            created_after=datetime.datetime(2014, 1, 1)):
        page_url = summary['url']
        classification = BILL_TYPES[summary['Type']]

        # The session is unknown until the history has been read.
        bill = Bill(
            identifier=summary['File\xa0#'],
            title=summary['Title'],
            legislative_session=None,
            classification=classification,
            from_organization={"name": "New York City Council"},
        )
        bill.add_source(page_url)

        details = self.legDetails(page_url)
        history_rows = self.history(page_url)

        bill.add_title(details['Name'],
                       note='created by administrative staff')

        if 'Summary' in details:
            bill.add_abstract(details['Summary'], note='')

        law_number = details['Law number']
        if law_number:
            bill.add_identifier(law_number, note='law number')

        for sponsor, sponsorship_type, primary in self._sponsors(
                details.get('Sponsors', [])):
            bill.add_sponsorship(sponsor,
                                 sponsorship_type,
                                 'person',
                                 primary,
                                 entity_id=_make_pseudo_id(name=sponsor))

        for attachment in details.get('Attachments', []):
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")

        # Materialise the history: it is traversed twice below.
        history_rows = list(history_rows)

        if history_rows:
            first_action_time = min(
                self.toTime(row['Date']) for row in history_rows)
            bill.legislative_session = self.sessions(first_action_time)
        else:
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for row in history_rows:
            description = row['Action']
            if not description:
                continue

            action_class = ACTION_CLASSIFICATION[description]
            action_date = self.toDate(row['Date'])

            actor = row['Action\xa0By']
            if actor == 'City Council':
                actor = 'New York City Council'
            elif actor == 'Administration':
                actor = 'Mayor'

            # Town hall meetings are not recorded as actions.
            if actor == 'Town Hall Meeting':
                continue

            recorded = bill.add_action(description,
                                       action_date,
                                       organization={'name': actor},
                                       classification=action_class)

            if 'url' not in row['Action\xa0Details']:
                continue
            detail_url = row['Action\xa0Details']['url']

            if action_class == 'committee-referral':
                detail = self.actionDetails(detail_url)
                # The committee name is whatever follows the last
                # ' to the ' in the action text.
                committee = detail['Action text'].rsplit(' to the ', 1)[-1]
                recorded.add_related_entity(
                    committee,
                    'organization',
                    entity_id=_make_pseudo_id(name=committee))

            result, ballots = self.extractVotes(detail_url)
            if ballots:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=description,
                    organization={'name': actor},
                    classification=action_class,
                    start_date=action_date,
                    result=result,
                    bill=bill,
                )
                vote_event.add_source(detail_url)

                for option, voter in ballots:
                    vote_event.vote(option, voter)

                yield vote_event

        full_text = self.text(page_url)
        extras = {'local_classification': summary['Type']}
        if full_text:
            extras['full_text'] = full_text
        bill.extras = extras

        yield bill
def extract_actions(self, bill, doc, current_chamber):
    """
    Extract the actions taken on a bill and add them to ``bill``.

    A bill can have actions taken from either chamber.  The current
    chamber's actions will be the first table of actions.  The other
    chamber's actions will be in the second table.

    Args:
        bill: object exposing ``add_action``; every parsed action is
            added to it.
        doc: parsed page exposing ``xpath``; it is searched for
            ``table`` elements whose class is ``actions``.
        current_chamber: chamber credited with the first table
            (``'upper'`` or ``'lower'``); it is flipped after each
            table.

    Returns:
        The ``bill`` that was passed in, with the actions added.
    """
    # Row labels that are bill metadata rather than actions.
    non_actions = ('Chapter number', 'See also', 'See',
                   'Effective date', 'Secretary of State')

    bill_actions = []

    for cur_table in doc.xpath('//table[@class="actions"]'):
        for row in cur_table.xpath('.//tr'):
            bill_action = {}

            # Split up columns
            date_col, the_rest = row.xpath('td')

            # The second column can hold a link to full text
            # and pages (what should be in another column),
            # but also links to committee elements or other spanned
            # content.
            raw_date = date_col.text_content().strip()
            # ``.text`` is None when the cell starts with a child
            # element instead of text, so fall back to ''.
            action_text = (the_rest.text or '').strip()
            committee = the_rest.xpath(
                "a[contains(@href,'committee')]/text()")
            extra = ''.join(
                the_rest.xpath('span[not(@style)]/text() | a/text()'))

            # skip non-actions (don't have date)
            if action_text in non_actions:
                continue

            # Dates are really inconsistent here: try the date column
            # first, then the extra text with a 2- and 4-digit year.
            action_date = None
            attempts = (
                (raw_date, '%m/%d/%Y'),
                (extra, '%m/%d/%y'),
                (extra, '%m/%d/%Y'),
            )
            for candidate, date_format in attempts:
                try:
                    action_date = datetime.datetime.strptime(
                        candidate, date_format).date()
                except ValueError:
                    continue
                break
            if action_date is None:
                self.warning('ACTION without date: %s' % action_text)
                continue

            # Categorize the action: the first matching pattern wins.
            action_type = None
            for pattern, atype in self._categorizers:
                if re.match(pattern, action_text):
                    action_type = atype
                    if (action_type is not None
                            and 'referral-committee' in action_type
                            and committee):
                        bill_action['committees'] = committee[0]
                    break

            if extra:
                # Avoid a leading space when the cell had no text of
                # its own.
                action_text = (
                    (action_text + ' ' + extra) if action_text else extra)
            bill_action['action_text'] = action_text

            # Governor actions belong to the executive, whether the
            # classification is a single value or a list of them.
            if isinstance(action_type, list):
                candidates = action_type
            else:
                candidates = [action_type]
            is_executive = any(
                atype is not None and atype.startswith('governor')
                for atype in candidates)
            bill_action['action_chamber'] = (
                'executive' if is_executive else current_chamber)

            bill_action['action_date'] = action_date
            bill_action['action_type'] = action_type
            bill_actions.append(bill_action)

            # NOTE: vote extraction from action rows
            # (extract_vote_from_action) is disabled here.

        # if there's a second table, toggle the current chamber
        current_chamber = 'lower' if current_chamber == 'upper' else 'upper'

    # Add actions to bill
    for action in bill_actions:
        act = bill.add_action(action['action_text'],
                              action['action_date'],
                              chamber=action['action_chamber'],
                              classification=action['action_type'])

        if 'committees' in action:
            committee = action['committees']
            act.add_related_entity(
                committee, 'organization',
                entity_id=_make_pseudo_id(name=committee))

    return bill