def scrape(self):
    """Scrape Chicago City Council legislation into pupa objects.

    Yields a Bill per piece of legislation introduced after 2015-05-17,
    plus a VoteEvent for each action whose detail page carries votes.
    Detail URLs that cannot be fetched are collected and reported once
    the scrape finishes.
    """
    unreachable_urls = []
    for leg_summary in self.legislation(
            created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()
        if not title or not leg_summary['Intro\xa0Date']:
            continue
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
            # doesn't have an intro date

        bill_type = BILL_TYPES[leg_summary['Type']]
        bill_session = self.session(
            self.toTime(leg_summary['Intro\xa0Date']))
        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        bill.add_source(leg_summary['url'])

        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            # Detail page missing/malformed: emit the bare bill anyway.
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue

        # Loop-invariant, hoisted out of the related-files loop.
        lower_title = title.lower()
        for related_bill in leg_details.get('Related files', []):
            # BUG FIX: the lowercased title was computed but the check was
            # previously done against the original-case title, so capitalized
            # "Sundry"/"Miscellaneous" omnibus bills were never matched.
            if "sundry" in lower_title or "miscellaneous" in lower_title:
                # these are omnibus
                bill.add_related_bill(
                    identifier=related_bill['label'],
                    legislative_session=bill.legislative_session,
                    relation_type='replaces')
            # for now we're skipping related bills if they
            # don't contain words that make us think they're
            # in an omnibus relationship with each other

        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            # First listed sponsor is treated as the primary sponsor.
            if i == 0:
                primary = True
                sponsorship_type = "Primary"
            else:
                primary = False
                sponsorship_type = "Regular"

            sponsor_name = sponsor['label']

            # Does the Mayor/Clerk introduce legislation as
            # individual role holders or as the Office of City
            # Clerk and the Office of the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk',
                                        'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm',)):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'

            if not sponsor_name.startswith(('Misc. Transmittal',
                                            'No Sponsor',
                                            'Dept./Agency')):
                bill.add_sponsorship(
                    sponsor_name,
                    sponsorship_type,
                    entity_type,
                    primary,
                    entity_id=make_pseudo_id(name=sponsor_name))

        if 'Topic' in leg_details:
            for subject in leg_details[u'Topic'].split(','):
                bill.add_subject(subject)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")

        for action in self.history(leg_summary['url']):
            action_description = action['Action']
            try:
                action_date = self.toTime(
                    action['Date']).date().isoformat()
            except AttributeError:
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue

            if action_description:
                # 'Action By' is sometimes a dict with a label, sometimes a
                # bare string.
                try:
                    responsible_org = action['Action\xa0By']['label']
                except TypeError:
                    responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'Chicago City Council'

                act = bill.add_action(
                    action_description,
                    action_date,
                    organization={'name': responsible_org},
                    classification=ACTION_CLASSIFICATION[action_description])

                if action_description == 'Referred':
                    # The controlling body may be a single dict or a list of
                    # dicts; normalize to a list.
                    try:
                        leg_details[
                            'Current Controlling Legislative Body']['label']
                        controlling_bodies = [
                            leg_details['Current Controlling Legislative Body']
                        ]
                    except TypeError:
                        controlling_bodies = leg_details[
                            'Current Controlling Legislative Body']
                    if controlling_bodies:
                        for controlling_body in controlling_bodies:
                            body_name = controlling_body['label']
                            if body_name.startswith("Joint Committee"):
                                act.add_related_entity(
                                    body_name, 'organization')
                            else:
                                act.add_related_entity(
                                    body_name,
                                    'organization',
                                    entity_id=make_pseudo_id(name=body_name))

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    result, votes = self.extractVotes(action_detail_url)

                    if votes and result:
                        # see https://github.com/datamade/municipal-scrapers-us/issues/15
                        action_vote = VoteEvent(
                            legislative_session=bill.legislative_session,
                            motion_text=action_description,
                            organization={'name': responsible_org},
                            classification=None,
                            start_date=action_date,
                            result=result,
                            bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

        yield bill

    # Log (instead of print) so failures show up in scraper output.
    self.warning("Unreachable urls: {}".format(unreachable_urls))
def scrape_chamber(self, chamber, session):
    """Scrape Kansas bills for one chamber of the given session.

    Pulls the full bill_status listing from the KS API and yields a Bill
    for each entry whose identifier belongs to *chamber* ('upper'/'lower').
    Also yields whatever scrape_html produces for each bill.
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + 'bill_status/').text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json['content']

    for bill_data in bills:
        bill_id = bill_data['BILLNO']

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'
        else:
            # BUG FIX: `btype` was previously left unset (or stale from the
            # prior iteration) for identifiers matching none of the above.
            btype = 'bill'

        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}
        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

        if (bill_data['LONGTITLE'] and
                bill_data['LONGTITLE'] != bill.title):
            bill.add_title(bill_data['LONGTITLE'])

        # An "original sponsor" is the API's expression of "primary sponsor"
        for primary_sponsor in bill_data['ORIGINAL_SPONSOR']:
            bill.add_sponsorship(
                name=primary_sponsor,
                entity_type='organization' if "committee" in primary_sponsor.lower() else 'person',
                primary=True,
                classification="original sponsor")
        for sponsor in bill_data['SPONSOR_NAMES']:
            if sponsor in bill_data['ORIGINAL_SPONSOR']:
                continue
            bill.add_sponsorship(
                name=sponsor,
                entity_type='organization' if "committee" in sponsor.lower() else 'person',
                primary=False,
                classification='cosponsor',
            )

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = ('upper' if event['chamber'] == 'Senate' else 'lower')
            date = event['session_date']
            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']
            if event['action_code'] not in ksapi.action_codes:
                self.warning('unknown action code on %s: %s %s' %
                             (bill_id, event['action_code'], event['status']))
                atype = None
            else:
                atype = ksapi.action_codes[event['action_code']]
            bill.add_action(action, date, chamber=actor,
                            classification=atype)

        # Versions are exposed in `bill_data['versions'],
        # but lack any descriptive text or identifiers;
        # continue to scrape these from the HTML
        try:
            yield from self.scrape_html(bill, session)
        except scrapelib.HTTPError:
            # BUG FIX: the old message used `bill['bill_id']`, but pupa Bill
            # objects are not subscriptable, so the error path itself raised
            # TypeError. Use the local identifier instead.
            self.warning('unable to fetch HTML for bill {0}'.format(bill_id))

        yield bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Michigan bill's status page.

    Tries both years of the session biennium before giving up. Yields the
    Bill plus a VoteEvent for each roll call whose journal page parses.
    """
    # try and get bill for the first year of the session biennium
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
        session[:4], bill_id.replace(' ', '-'))
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if ('Page Not Found' in html or
            'The bill you are looking for is not available yet' in html):
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[-4:], bill_id.replace(' ', '-'))
        html = self.get(url).text
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            self.warning("Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute('http://legislature.mi.gov')

    title = doc.xpath(
        '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u'\xa0', ' ')
        # sometimes district gets added as a link
        if name.isnumeric():
            continue

        if len(sponsors) > 1:
            classification = (
                'primary'
                if sponsor.tail and 'primary' in sponsor.tail
                else 'cosponsor'
            )
        else:
            classification = 'primary'
        bill.add_sponsorship(
            name=name,
            chamber=chamber,
            entity_type='person',
            primary=classification == 'primary',
            classification=classification,
        )

    bill.subject = doc.xpath(
        '//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath(
            '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(
            datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor,
                        classification=classification)

        # check if action mentions a sub
        submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)',
                             action, re.IGNORECASE)
        if submatch and tds[2].xpath('a'):
            version_url = tds[2].xpath('a/@href')[0]
            version_name = tds[2].xpath('a/text()')[0].strip()
            version_name = 'Substitute {}'.format(version_name)
            self.info("Found Substitute {}".format(version_url))
            if version_url.lower().endswith('.pdf'):
                mimetype = 'application/pdf'
            elif version_url.lower().endswith('.htm'):
                mimetype = 'text/html'
            else:
                # BUG FIX: `mimetype` was previously unbound on the first
                # unexpected extension (NameError) or stale from an earlier
                # iteration thereafter.
                mimetype = ''
            bill.add_version_link(version_name, version_url,
                                  media_type=mimetype)

        # check if action mentions a vote
        rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if journal_link:
                objectname = journal_link[0].rsplit('=', 1)[-1]
                chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                    session, chamber_name, objectname)
                results = self.parse_roll_call(vote_url, rc_num)

                if results is not None:
                    vote_passed = len(results['yes']) > len(results['no'])
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result='pass' if vote_passed else 'fail',
                        classification='passage',
                    )

                    # check the expected counts vs actual
                    count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['yes']):
                        self.warning('vote count mismatch for %s %s, %d != %d' %
                                     (bill_id, action, count, len(results['yes'])))
                    count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['no']):
                        self.warning('vote count mismatch for %s %s, %d != %d' %
                                     (bill_id, action, count, len(results['no'])))

                    vote.set_count('yes', len(results['yes']))
                    vote.set_count('no', len(results['no']))
                    vote.set_count('other', len(results['other']))
                    for name in results['yes']:
                        vote.yes(name)
                    for name in results['no']:
                        vote.no(name)
                    for name in results['other']:
                        vote.vote('other', name)
                    vote.add_source(vote_url)
                    yield vote
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))

    # versions
    for row in doc.xpath(
            '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            if url.endswith('.pdf'):
                mimetype = 'application/pdf'
            elif url.endswith('.htm'):
                mimetype = 'text/html'
            else:
                # BUG FIX: same unbound/stale-`mimetype` hazard as above.
                mimetype = ''
            bill.add_version_link(name, url, media_type=mimetype)

    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def parse_bill_status_page(self, status_url, bill_url, session, chamber):
    """Parse a Montana bill status page.

    Returns a (Bill, votes) tuple where votes is the list produced by
    self.add_votes.
    """
    status_page = lxml.html.fromstring(self.get(status_url).text)
    # see 2007 HB 2... weird.
    bill_re = r'.*?/([A-Z]+)0*(\d+)\.pdf'
    bill_xpath = '//a[contains(@href, ".pdf") and contains(@href, "billpdf")]/@href'
    bill_id = re.search(bill_re, status_page.xpath(bill_xpath)[0],
                        re.IGNORECASE).groups()
    bill_id = "{0} {1}".format(bill_id[0], int(bill_id[1]))

    try:
        xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
        title = status_page.xpath(xp).pop()
    except IndexError:
        title = status_page.xpath('//tr[1]/td[2]')[0].text_content()

    # Add bill type.
    _bill_id = bill_id.lower()
    if 'b' in _bill_id:
        classification = 'bill'
    elif 'j' in _bill_id or 'jr' in _bill_id:
        classification = 'joint resolution'
    elif 'cr' in _bill_id:
        classification = 'concurrent resolution'
    elif 'r' in _bill_id:
        classification = 'resolution'
    else:
        # BUG FIX: `classification` was previously unbound (NameError at the
        # Bill() call) when the identifier matched none of the patterns.
        classification = 'bill'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=classification)

    self.add_actions(bill, status_page)
    votes = self.add_votes(bill, status_page, status_url)

    tabledata = self._get_tabledata(status_page)

    # Add sponsor info.
    bill.add_sponsorship(tabledata['primary sponsor:'][0],
                         classification='primary',
                         entity_type='person', primary=True)

    # A various plus fields MT provides.
    plus_fields = [
        'requester',
        ('chapter number:', 'chapter'),
        'transmittal date:',
        'drafter',
        'fiscal note probable:',
        'bill draft number:',
        'preintroduction required:',
        'by request of',
        'category:']

    for x in plus_fields:
        # Entries are either a plain key, or a (table key, extras key) pair.
        if isinstance(x, tuple):
            _key, key = x
        else:
            _key = key = x
        key = key.replace(' ', '_')

        try:
            val = tabledata[_key]
        except KeyError:
            continue

        # Flatten single-element lists.
        if len(val) == 1:
            val = val[0]

        bill.extras[key] = val

    # Add bill subjects.
    xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
    subjects = []
    for tr in status_page.xpath(xp):
        try:
            subj = tr.xpath('td')[0].text_content()
        except IndexError:
            # Narrowed from a bare `except:`; rows without a <td> are skipped.
            continue
        subjects.append(subj)

    for s in subjects:
        bill.add_subject(s)

    self.add_fiscal_notes(status_page, bill)

    return bill, list(votes)
def parse_bill(self, chamber, session, bill_id, url):
    """Parse one Kentucky bill page, yielding a Bill.

    Returns early (yields nothing) for withdrawn bills.
    """
    page = self.lxmlize(url)

    last_action = self.parse_bill_field(
        page, 'Last Action').xpath('text()')[0]
    if 'WITHDRAWN' in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return

    version = self.parse_bill_field(page, 'Bill Documents')
    # BUG FIX: the None check used to come *after* `version.xpath(...)` had
    # already been called, so it could never trigger; check before use.
    if version is None:
        # Bill withdrawn
        self.logger.warning('Bill withdrawn.')
        return

    source_url = version.xpath('a[1]/@href')[0]
    version_title = version.xpath('a[1]/text()')[0].strip()

    if source_url.endswith('.doc'):
        mimetype = 'application/msword'
    elif source_url.endswith('.pdf'):
        mimetype = 'application/pdf'
    else:
        # BUG FIX: `mimetype` was previously unbound for other extensions.
        mimetype = ''

    title = self.parse_bill_field(page, 'Title').text_content()

    # actions = self.get_nodes(
    #     page,
    #     '//div[@class="StandardText leftDivMargin"]/'
    #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

    if 'CR' in bill_id:
        bill_type = 'concurrent resolution'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'
    else:
        bill_type = 'bill'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link(version_title, source_url, media_type=mimetype)

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib['href']
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'
        else:
            # BUG FIX: avoid carrying over a stale mimetype.
            mimetype = ''
        bill.add_document_link("Fiscal Note", source_url,
                               media_type=mimetype)

    for link in page.xpath(
            "//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(link.text.strip(),
                             classification='primary',
                             entity_type='person',
                             primary=True)

    bdr_no = self.parse_bill_field(page, 'Bill Request Number')
    if bdr_no.xpath('text()'):
        bdr = bdr_no.xpath('text()')[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
def old_scrape(self, session=None):
    """Scrape Ohio bills for *session* from the legislature's Excel status reports.

    Downloads each per-chamber Excel status report, reads one row per bill,
    and yields VoteEvents (via scrape_votes_old) followed by each Bill.
    Falls back to the latest session when none is given.
    """
    status_report_url = "https://www.legislature.ohio.gov/legislation/status-reports"

    # ssl verification off due Ohio not correctly implementing SSL
    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    doc = self.get(status_report_url).text
    doc = lxml.html.fromstring(doc)
    doc.make_links_absolute(status_report_url)

    # The report table for a session follows a div containing the session name.
    xpath = "//div[contains(text(),'{}')]/following-sibling::table"
    status_table = doc.xpath(xpath.format(session))[0]
    status_links = status_table.xpath(
        ".//a[contains(text(),'Excel')]/@href")

    for url in status_links:
        try:
            fname, resp = self.urlretrieve(url)
        except scrapelib.HTTPError as report:
            self.logger.warning("Missing report {}".format(report))
            continue

        sh = xlrd.open_workbook(fname).sheet_by_index(0)

        # once workbook is open, we can remove tempfile
        os.remove(fname)
        # Row 0 is the header row; each subsequent row is one bill.
        for rownum in range(1, sh.nrows):
            bill_id = sh.cell(rownum, 0).value
            bill_type = "resolution" if "R" in bill_id else "bill"
            chamber = "lower" if "H" in bill_id else "upper"
            bill_title = str(sh.cell(rownum, 3).value)

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=bill_title,
                        classification=bill_type)
            bill.add_source(url)
            # NOTE(review): `add_sponsor` and the positional
            # `add_action(actor, action, date, type=...)` below use the
            # legacy (pre-pupa) Bill API, while the constructor above uses
            # pupa keywords -- confirm the Bill class in this module still
            # provides these legacy methods.
            bill.add_sponsor('primary', str(sh.cell(rownum, 1).value))

            # add cosponsor
            if sh.cell(rownum, 2).value:
                bill.add_sponsor('cosponsor', str(sh.cell(rownum, 2).value))

            actor = ""

            # Actions start column after bill title
            for colnum in range(4, sh.ncols - 1):
                action = str(sh.cell(0, colnum).value)
                cell = sh.cell(rownum, colnum)
                date = cell.value

                # Infer the acting chamber/executive from the column header.
                if len(action) != 0:
                    if action.split()[0] == 'House':
                        actor = "lower"
                    elif action.split()[0] == 'Senate':
                        actor = "upper"
                    elif action.split()[-1] == 'Governor':
                        actor = "executive"
                    elif action.split()[0] == 'Gov.':
                        actor = "executive"
                    elif action.split()[-1] == 'Gov.':
                        actor = "executive"

                if action in ('House Intro. Date', 'Senate Intro. Date'):
                    atype = ['bill:introduced']
                    action = action.replace('Intro. Date', 'Introduced')
                elif action == '3rd Consideration':
                    atype = ['bill:reading:3', 'bill:passed']
                elif action == 'Sent to Gov.':
                    atype = ['governor:received']
                elif action == 'Signed By Governor':
                    atype = ['governor:signed']
                else:
                    atype = ['other']

                # Excel stores dates as floats; a non-float cell means the
                # action has not occurred, so no action is recorded.
                if type(date) == float:
                    date = str(xlrd.xldate_as_tuple(date, 0))
                    date = datetime.datetime.strptime(
                        date, "(%Y, %m, %d, %H, %M, %S)")
                    date = self._tz.localize(date)
                    date = "{:%Y-%m-%d}".format(date)
                    bill.add_action(actor, action, date, type=atype)

            # Build e.g. "HB_123" from "HB123" for the old vote/version URLs.
            # NOTE(review): assumes every bill_id contains a digit; otherwise
            # `underscore_bill` would be unbound below.
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                underscore_bill = bill_id[:idx] + "_" + bill_id[idx:]
                break

            yield from self.scrape_votes_old(bill, underscore_bill, session)
            self.scrape_versions_old(bill, underscore_bill, session)
            yield bill
def scrape_assem_bills(self, chamber, insert, session, year):
    """Scrape Nevada Assembly bills for *session*, one document type at a time.

    Yields VoteEvents (via scrape_votes) and a Bill per history page.
    """
    # Map Nevada DoctypeID values to OCD bill classifications.
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution', 9: 'petition'}
    for docnum, bill_type in doc_type.items():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/' \
            'Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        # (An unused `count` accumulator was removed here.)
        for link in links:
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                insert, link)
            page = self.get(page_path).text
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            root.make_links_absolute("http://www.leg.state.nv.us/")

            bill_id = root.xpath('string(/html/body/div[@id="content"]'
                                 '/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type)

            bill.subject = list(set(self.subject_mapping[bill_id]))

            # Bill text links live in the element following the
            # "Bill Text" label.
            billtext = root.xpath(
                "//b[text()='Bill Text']")[0].getparent().getnext()
            text_urls = billtext.xpath("./a")
            for text_url in text_urls:
                version_name = text_url.text.strip()
                version_url = text_url.attrib['href']
                bill.add_version_link(note=version_name,
                                      url=version_url,
                                      media_type='application/pdf')

            primary, secondary = self.scrape_sponsors(page)

            for leg in primary:
                bill.add_sponsorship(classification='primary',
                                     name=leg,
                                     entity_type='person',
                                     primary=True)
            for leg in secondary:
                bill.add_sponsorship(classification='cosponsor',
                                     name=leg,
                                     entity_type='person',
                                     primary=False)

            # Minutes rows start at tr[2]; track the row index so the date
            # cell can be looked up for each link.
            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                bill.add_document_link(note=minutes_date, url=minutes_url)
                minutes_count += 1

            self.scrape_actions(root, bill, "lower")
            yield from self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            yield bill
def scrape_matter(self, matter_link, sess):
    """Scrape one Legistar matter page into a Bill.

    Returns early (yields nothing) when the matter has no usable title.
    """
    # Map Legistar "File Type" strings to OCD bill classifications.
    matter_types = {
        "Additions": "other",
        "Administrative Order": "order",
        "Annual Evaluation": "other",
        "Bid Advertisement": "other",
        "Bid Awards": "other",
        "Bid Contract": "contract",
        "Bid Protest": "other",
        "Bid Rejection": "other",
        "Birthday Scroll": "commemoration",
        "Certificate of Appreciation": "commemoration",
        "Change Order": "order",
        "Citizen's Presentation": "other",
        "Commendation": "commemoration",
        "Conflict Waiver": "other",
        "Congratulatory Certificate": "commemoration",
        "Deferrals": "other",
        "Discussion Item": "other",
        "Distinguished Visitor": "other",
        "Joint Meeting/Workshop": "other",
        "Mayoral Veto": "other",
        "Miscellaneous": "other",
        "Nomination": "nomination",
        "Oath of Office": "other",
        "Omnibus Reserve": "bill",
        "Ordinance": "ordinance",
        "Plaque": "commemoration",
        "Presentation": "other",
        "Proclamation": "proclamation",
        "Professional Service Agreement": "contract",
        "Public Hearing": "other",
        "Report": "other",
        "Request for Proposals": "other",
        "Request for Qualifications": "other",
        "Request to Advertise": "other",
        "Resolution": "resolution",
        "Resolution of Sympathy": "resolution",
        "Service Awards": "commemoration",
        "Special Item": "other",
        "Special Presentation": "other",
        "Supplement": "other",
        "Swearing-In": "other",
        "Time Sensitive Items": "other",
        "Withdrawals": "other",
        "Workshop Item": "other",
        "Zoning": "other",
        "Zoning Resolution": "resolution"
    }
    matter_doc = self.lxmlize(matter_link)
    info_dict = self.matter_table_to_dict(matter_doc)

    # we're going to use the year of the intro date as the session
    # until/unless we come up with something better
    # (parsing also validates the date format, even though the parsed
    # value is currently unused)
    intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
    session = sess["identifier"]
    category = matter_types[info_dict["File Type"]]

    if 'File Name' in info_dict:
        title = info_dict["File Name"]
    elif "Title" in info_dict and info_dict["Title"].strip():
        title = info_dict["Title"].strip()
    else:
        self.warning("bill has no title")
        return

    # "other" is not a valid OCD classification, so omit it entirely.
    if category == 'other':
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title)
    else:
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title,
                    classification=category)

    for spons in info_dict["Sponsors"]:
        if spons == "NONE":
            continue
        try:
            name, spons_type = spons.rsplit(",", 1)
        except ValueError:
            name = spons
            spons_type = "Sponsor"
        primary = True if "Prime Sponsor" in spons_type else False
        entity = "person"
        if "committee" in name:
            # BUG FIX: previously `entity = committee` referenced an
            # undefined name (NameError); the intended value is the string.
            entity = "committee"
        bill.add_sponsorship(name, spons_type, entity, primary)

    if "Indexes" in info_dict:
        for subj in info_dict["Indexes"]:
            if subj.strip() and subj.strip() != "NONE":
                bill.add_subject(subj.strip())

    if "Title" in info_dict and info_dict["Title"].strip():
        # BUG FIX: removed a stray trailing apostrophe from the note text.
        note = "bill's long title"
        if ("Note" in info_dict and info_dict["Note"].strip()):
            note = info_dict["Note"]
        bill.add_abstract(abstract=info_dict["Title"], note=note)

    self.process_action_table(matter_doc, bill)

    bill.add_source(matter_link, note='web')

    yield bill
def scrape(self, window=30):
    """Scrape Pittsburgh Legistar matters introduced in the last *window* days.

    Yields a VoteEvent per recorded vote and a Bill per matter.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    self.retry_wait_seconds = 20
    for matter in self.matters(n_days_ago):
        matter_id = matter["MatterId"]

        date = matter["MatterIntroDate"]
        title = matter["MatterTitle"]
        identifier = matter["MatterFile"]

        # If a bill has a duplicate action item that"s causing the entire scrape
        # to fail, add it to the `problem_bills` array to skip it.
        # For the time being...nothing to skip!
        problem_bills = []

        if identifier in problem_bills:
            continue

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))

        if matter["MatterTypeName"] in BILL_TYPES:
            ocd_bill_type = BILL_TYPES[matter["MatterTypeName"]]
        else:
            ocd_bill_type = None

        # "S"-prefixed files keep the prefixed form as an alternate id.
        if identifier.startswith("S"):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=ocd_bill_type,
                    from_organization={"name": "Pittsburgh City Council"})

        legistar_web = matter["legistar_url"]
        legistar_api = "http://webapi.legistar.com/v1/pittsburgh/matters/{0}".format(matter_id)

        bill.add_source(legistar_web, note="web")
        bill.add_source(legistar_api, note="api")

        # BUG FIX: the loop variable previously shadowed `identifier` above.
        for alternate_identifier in alternate_identifiers:
            bill.add_identifier(alternate_identifier)

        for action, vote in self.actions(matter_id):
            responsible_person = action.pop("responsible person")
            act = bill.add_action(**action)

            if responsible_person:
                act.add_related_entity(
                    responsible_person,
                    "person",
                    entity_id=_make_pseudo_id(name=responsible_person))

            if action["description"] == "Referred":
                body_name = matter["MatterBodyName"]
                if body_name != "City Council":
                    act.add_related_entity(
                        body_name,
                        "organization",
                        entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action["description"],
                    organization=action["organization"],
                    classification=None,
                    start_date=action["date"],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + "/histories")

                # BUG FIX: the inner loop variable previously shadowed the
                # `vote` element unpacked from self.actions above.
                for vote_record in votes:
                    raw_option = vote_record["VoteValueName"].lower()
                    clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                         raw_option)
                    vote_event.vote(clean_option,
                                    vote_record["VotePersonName"].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic["MatterIndexName"].strip())

        for attachment in self.attachments(matter_id):
            if attachment["MatterAttachmentName"]:
                bill.add_version_link(attachment["MatterAttachmentName"],
                                      attachment["MatterAttachmentHyperlink"],
                                      media_type="application/pdf")

        bill.extras = {"local_classification": matter["MatterTypeName"]}

        text = self.text(matter_id)

        if text:
            if text["MatterTextPlain"]:
                bill.extras["plain_text"] = text["MatterTextPlain"]
            if text["MatterTextRtf"]:
                bill.extras["rtf_text"] = text["MatterTextRtf"].replace(u"\u0000", "")

        yield bill
def get_bill(self, matter):
    '''Make Bill object from given matter.

    Returns the populated Bill, or None when the matter should be skipped
    (known-duplicate actions, missing core fields, or an API error that
    gets queued in self.version_errors for a retry pass).
    '''
    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']

    # A bill needs all three of intro date, name, and file number.
    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]

    # NOTE(review): the related-bills loop below calls self.session(...),
    # singular -- confirm that `self.sessions` here is intentional and not
    # a typo for the same helper.
    bill_session = self.sessions(self.toTime(date))

    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])

    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        # Sponsor data incomplete: queue for retry and skip this bill.
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        # NOTE(review): hitting this one known-duplicate attachment drops
        # the *entire* bill, not just the attachment -- confirm intended.
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None

        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint(
                '/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            # Related matter is unfetchable: skip the whole bill.
            return None
        else:
            date = related_bill['MatterIntroDate']
            related_bill_session = self.session(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        # Text data incomplete: queue for retry and skip this bill.
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        # Strip NUL characters, which break downstream storage.
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(u'\u0000', '')

        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

    return bill
def scrape_bill(self, chamber, session, session_id, bill_id, url):
    """Scrape one Iowa bill: title, versions, sponsors, actions, amendments.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session name
    :param session_id: numeric general-assembly id used in legis.iowa.gov URLs
    :param bill_id: e.g. 'SF 139'
    :param url: bill detail page (sidebar source for version list)
    Yields the populated Bill, or returns early on unusable pages.
    """
    sidebar = lxml.html.fromstring(self.get(url).text)
    sidebar.make_links_absolute("https://www.legis.iowa.gov")
    hist_url = (f'https://www.legis.iowa.gov/legislation/billTracking/'
                f'billHistory?billName={bill_id}&ga={session_id}')
    # NOTE(review): req_session is only used by scrape_subjects below; this
    # request deliberately(?) bypasses it — confirm.
    req_session = requests.Session()
    req = requests.get(hist_url)
    if (req.status_code == 500):
        self.warning("500 error on {}, skipping".format(hist_url))
        return
    page = lxml.html.fromstring(req.text)
    page.make_links_absolute("https://www.legis.iowa.gov")
    title = page.xpath('string(//div[@id="content"]/div[@class='
                       '"divideVert"]/div/div[4]/div[2])').strip()
    if title == '':
        # Sometimes the title is moved, see
        # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
        title = page.xpath('string(//div[@id="content"]/div[@class='
                           '"divideVert"]/div[4]/div[2])').strip()
        if title == '':
            self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
            return
    if title.lower().startswith("in"):
        title = page.xpath("string(//table[2]/tr[3])").strip()
    # Classify by the letters in the bill id (order matters: check the
    # longer prefixes via substring membership).
    if 'HR' in bill_id or 'SR' in bill_id:
        bill_type = ['resolution']
    elif 'HJR' in bill_id or 'SJR' in bill_id:
        bill_type = ['joint resolution']
    elif 'HCR' in bill_id or 'SCR' in bill_id:
        bill_type = ['concurrent resolution']
    else:
        bill_type = ['bill']
    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type)
    bill.add_source(hist_url)
    # base url for text version (version_abbrev, session_id, bill_id)
    version_html_url_template = 'https://www.legis.iowa.gov/docs/'\
        'publications/LG{}/{}/attachments/{}.html'
    version_pdf_url_template = 'https://www.legis.iowa.gov/docs/'\
        'publications/LG{}/{}/{}.pdf'
    # get pieces of version_link
    vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
    if vpieces:
        for version in vpieces:
            version_name = version.text
            version_abbrev = version.xpath('string(@value)')
            # Get HTML document of bill version.
            version_html_url = version_html_url_template.format(
                version_abbrev.upper(), session_id, bill_id.replace(' ', ''))
            bill.add_version_link(note=version_name,
                                  url=version_html_url,
                                  media_type='text/html')
            # Get PDF document of bill version.
            version_pdf_url = version_pdf_url_template.format(
                version_abbrev.upper(), session_id, bill_id.replace(' ', ''))
            bill.add_version_link(note=version_name,
                                  url=version_pdf_url,
                                  media_type='application/pdf')
    sponsors_str = page.xpath('string(//div[@id="content"]/div[@class='
                              '"divideVert"]/div/div[4]/div[1])').strip()
    if re.search('^By ', sponsors_str):
        sponsors = re.split(',| and ', sponsors_str.split('By ')[1])
    # for some bills sponsors listed in different format
    else:
        sponsors = re.findall(r'[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)',
                              sponsors_str)
    for sponsor in sponsors:
        sponsor = sponsor.replace(' and', '').strip(' .,')
        # a few sponsors get mangled by our regex
        sponsor = {
            'Means': 'Ways & Means',
            'Iowa': 'Economic Growth/Rebuild Iowa',
            'Safety': 'Public Safety',
            'Resources': 'Human Resources',
            'Affairs': 'Veterans Affairs',
            'Protection': 'Environmental Protection',
            'Government': 'State Government',
            'Boef': 'De Boef'
        }.get(sponsor, sponsor)
        if sponsor[0].islower():
            # SSBs catch cruft in it ('charges', 'overpayments')
            # https://sunlight.atlassian.net/browse/DATA-286
            continue
        bill.add_sponsorship(name=sponsor,
                             classification='primary',
                             entity_type='person',
                             primary=True)
    for tr in page.xpath(
            "//table[contains(@class, 'billActionTable')][1]/tbody/tr"):
        date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
        if date.startswith("***"):
            continue
        elif "No history is recorded at this time." in date:
            return
        if date == "":
            continue
        date = datetime.datetime.strptime(date, "%B %d, %Y").date()
        action = tr.xpath("string(td[3])").strip()
        action = re.sub(r'\s+', ' ', action)
        # Capture any amendment links.  (Fixed: the old code wrapped the
        # per-version link lists in a no-op comprehension and then flattened
        # them by hand; one nested comprehension yields the same URL list.)
        version_urls = [link['url']
                        for version in bill.versions
                        for link in version['links']]
        if 'amendment' in action.lower():
            for anchor in tr.xpath('.//a[1]'):
                if '-' in anchor.text:
                    # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                    amd_pattern = 'https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf'
                    amd_id = anchor.text.replace('-', '').strip()
                    amd_url = amd_pattern.format(session_id, amd_id)
                    amd_name = 'Amendment {}'.format(anchor.text.strip())
                    if amd_url not in version_urls:
                        bill.add_version_link(note=amd_name,
                                              url=amd_url,
                                              media_type='application/pdf')
                        version_urls.append(amd_url)
                    else:
                        self.info(
                            "Already Added {}, skipping".format(amd_url))
        # Attribute the action to a chamber based on journal markers.
        if 'S.J.' in action or 'SCS' in action:
            actor = 'upper'
        elif 'H.J.' in action or 'HCS' in action:
            actor = 'lower'
        else:
            actor = "legislature"
        action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip()
        # Classify the action text into OCD action types.
        if action.startswith('Introduced'):
            atype = ['introduction']
            if ', referred to' in action:
                atype.append('referral-committee')
        elif action.startswith('Read first time'):
            atype = 'reading-1'
        elif action.startswith('Referred to'):
            atype = 'referral-committee'
        elif action.startswith('Sent to Governor'):
            atype = 'executive-receipt'
        elif action.startswith('Reported Signed by Governor'):
            atype = 'executive-signature'
        elif action.startswith('Signed by Governor'):
            atype = 'executive-signature'
        elif action.startswith('Vetoed by Governor'):
            atype = 'executive-veto'
        elif action.startswith('Item veto'):
            atype = 'executive-veto-line-item'
        elif re.match(r'Passed (House|Senate)', action):
            atype = 'passage'
        elif re.match(r'Amendment (S|H)-\d+ filed', action):
            atype = ['amendment-introduction']
            if ', adopted' in action:
                atype.append('amendment-passage')
        elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted', action):
            atype = 'amendment-passage'
        elif re.match(r'Amendment (S|N)-\d+ lost', action):
            # NOTE(review): '(S|N)' looks like a typo for '(S|H)' — confirm
            # before changing, it may match real data.
            atype = 'amendment-failure'
        elif action.startswith('Resolution filed'):
            atype = 'introduction'
        elif action.startswith('Resolution adopted'):
            atype = 'passage'
        elif (action.startswith('Committee report')
              and action.endswith('passage.')):
            atype = 'committee-passage'
        elif action.startswith('Withdrawn'):
            atype = 'withdrawal'
        else:
            atype = None
        if action.strip() == "":
            continue
        if re.search(r'END OF \d+ ACTIONS', action):
            continue
        if '$history' not in action:
            bill.add_action(description=action,
                            date=date,
                            chamber=actor,
                            classification=atype)
    self.scrape_subjects(bill, bill_id, session, req_session)
    yield bill
def scrape(self, window=28):
    '''Scrape Legistar matters introduced in the last `window` days,
    yielding a VoteEvent for each recorded vote and then the Bill itself.

    :param window: look-back period in days (default 28)
    '''
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))
    for matter in self.matters(n_days_ago):
        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']
        # A bill needs all three of date, title, and identifier.
        if not all((date, title, identifier)):
            continue
        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]
        # Files prefixed with 'S' are supplemental; keep the prefixed form
        # as an alternate identifier and use the bare number as primary.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []
        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})
        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
        # Record both the human-facing page and the API endpoint as sources.
        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')
        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)
        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)
            if action['description'] == 'Referred':
                # Attach the committee the matter was referred to.
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))
            result, votes = vote
            if result:
                # Build a VoteEvent from this action's recorded roll call.
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)
                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')
                for vote in votes:
                    raw_option = vote['VoteValueName'].lower()
                    # Normalize vendor-specific option names (e.g. 'aye').
                    clean_option = self.VOTE_OPTIONS.get(
                        raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    vote['VotePersonName'].strip())
                yield vote_event
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())
        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill.
                # Then, we can find the 'MatterFile' (i.e., identifier) and
                # the 'MatterIntroDate' (i.e., to determine its legislative
                # session).
                # Sometimes, the related bill does not yet exist: in this
                # case, an error is thrown, and we continue.
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(
                    identifier=identifier,
                    legislative_session=related_bill_session,
                    relation_type='companion')
                # Currently, the relation type for bills can be one of a few
                # possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting
                # that they receive a relation of 'companion'.
        # Every matter gets a generated Board Report version link.
        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")
        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")
        bill.extras = {'local_classification': matter['MatterTypeName']}
        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                # Strip NUL characters, which the data store rejects.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')
        yield bill
def scrape(self, chamber, session):
    """Scrape NH bills for one chamber/session from the state's zip dump.

    Downloads the zip, then reads pipe-delimited tables for bills
    (tbllsrs.txt), legislators (tbllegislators.txt), sponsors
    (tbllsrsponsors.txt) and actions (tbldocket.txt).  Yields votes
    (via scrape_votes) and then every assembled Bill.
    """
    zip_url = zip_urls[session]
    fname, resp = self.urlretrieve(zip_url)
    # Open in binary mode: zipfile.ZipFile requires a binary file object
    # (text mode would also corrupt the archive on Windows).
    self.zf = zipfile.ZipFile(open(fname, 'rb'))
    os.remove(fname)
    # bill basics
    self.bills = {}  # LSR->Bill
    self.bills_by_id = {}  # need a second table to attach votes
    last_line = []
    for line in self.zf.open('tbllsrs.txt').readlines():
        line = line.split('|')
        if len(line) < 36:
            if len(last_line + line[1:]) == 36:
                # combine two lines for processing
                # (skip an empty entry at beginning of second line)
                # Fixed: previously `last_line + line`, which produced a
                # 37-field record and shifted every index after the join
                # point; the guard above checks `line[1:]`, so use it here.
                line = last_line + line[1:]
                self.warning('used bad line')
            else:
                # skip this line, maybe we'll use it later
                self.warning('bad line: %s' % '|'.join(line))
                last_line = line
                continue
        session_yr = line[0]
        lsr = line[1]
        title = line[2]
        body = line[3]
        expanded_bill_id = line[9]
        bill_id = line[10]
        if body == body_code[chamber] and session_yr == session:
            # Classify by the expanded id prefix.
            if expanded_bill_id.startswith('CACR'):
                bill_type = 'constitutional amendment'
            elif expanded_bill_id.startswith('PET'):
                bill_type = 'petition'
            elif expanded_bill_id.startswith('AR') and bill_id.startswith(
                    'CACR'):
                bill_type = 'constitutional amendment'
            else:
                bill_type = bill_type_map[expanded_bill_id.split(' ')[0][1:]]
            # Strip a leading parenthesized prefix from the title.
            if title.startswith('('):
                title = title.split(')', 1)[1].strip()
            self.bills[lsr] = Bill(legislative_session=session,
                                   chamber=chamber,
                                   identifier=bill_id,
                                   title=title,
                                   classification=bill_type)
            version_url = VERSION_URL % (session,
                                         expanded_bill_id.replace(' ', ''))
            self.bills[lsr].add_version_link(note='latest version',
                                             url=version_url,
                                             media_type='text/html')
            self.bills_by_id[bill_id] = self.bills[lsr]
    # load legislators
    self.legislators = {}
    for line in self.zf.open('tbllegislators.txt').readlines():
        line = line.split('|')
        employee_num = line[0]
        # first, last, middle
        if line[3]:
            name = '%s %s %s' % (line[2], line[3], line[1])
        else:
            name = '%s %s' % (line[2], line[1])
        self.legislators[employee_num] = {'name': name, 'seat': line[5]}
        # body = line[4]
    # sponsors
    for line in self.zf.open('tbllsrsponsors.txt').readlines():
        session_yr, lsr, seq, employee, primary = line.strip().split('|')
        if session_yr == session and lsr in self.bills:
            sp_type = 'primary' if primary == '1' else 'cosponsor'
            try:
                self.bills[lsr].add_sponsorship(
                    classification=sp_type,
                    name=self.legislators[employee]['name'],
                    entity_type='person',
                    primary=True if sp_type == 'primary' else False)
                # NOTE(review): this overwrites extras on every sponsor row,
                # leaving only the last sponsor's seat code — confirm intent.
                self.bills[lsr].extras = {
                    '_code': self.legislators[employee]['seat']
                }
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
    # actions
    for line in self.zf.open('tbldocket.txt').readlines():
        # a few blank/irregular lines, irritating
        if '|' not in line:
            continue
        (session_yr, lsr, _, timestamp, bill_id, body, action,
         _) = line.split('|')
        if session_yr == session and lsr in self.bills:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp,
                                        '%m/%d/%Y %H:%M:%S %p')
            action = action.strip()
            atype = classify_action(action)
            self.bills[lsr].add_action(chamber=actor,
                                       description=action,
                                       date=time.strftime("%Y-%m-%d"),
                                       classification=atype)
            amendment_id = extract_amendment_id(action)
            if amendment_id:
                self.bills[lsr].add_document_link(
                    note='amendment %s' % amendment_id,
                    url=AMENDMENT_URL % amendment_id)
    yield from self.scrape_votes(session, zip_url)
    # save all bills
    for bill in self.bills.values():
        bill.add_source(zip_url)
        yield bill
def scrape(self, window=3):
    '''Yield VoteEvents and Bills for Chicago matters introduced within
    the last `window` days (default 3).
    '''
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    for matter in self.matters(cutoff):
        matter_key = matter['MatterId']
        intro_date = matter['MatterIntroDate']
        matter_title = matter['MatterTitle']
        file_number = matter['MatterFile']
        # Skip matters missing any required field.
        if not (intro_date and matter_title and file_number):
            continue
        leg_session = self.session(self.toTime(intro_date))
        classification = BILL_TYPES[matter['MatterTypeName']]
        # 'S'-prefixed files: keep the prefixed form as an alternate id
        # and use the bare number as the primary identifier.
        alternates = []
        if file_number.startswith('S'):
            alternates.append(file_number)
            file_number = file_number[1:]
        bill = Bill(identifier=file_number,
                    legislative_session=leg_session,
                    title=matter_title,
                    classification=classification,
                    from_organization={"name": "Chicago City Council"})
        web_source = self.legislation_detail_url(matter_key)
        api_source = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
            matter_key)
        bill.add_source(web_source, note='web')
        bill.add_source(api_source, note='api')
        for alt in alternates:
            bill.add_identifier(alt)
        for action, vote in self.actions(matter_key):
            # Pull the responsible person out before expanding the rest
            # of the action dict into add_action.
            responsible_person = action.pop('responsible person')
            act = bill.add_action(**action)
            if responsible_person:
                act.add_related_entity(
                    responsible_person,
                    'person',
                    entity_id=_make_pseudo_id(name=responsible_person))
            if action['description'] == 'Referred':
                committee = matter['MatterBodyName']
                if committee != 'City Council':
                    act.add_related_entity(
                        committee,
                        'organization',
                        entity_id=_make_pseudo_id(name=committee))
            outcome, tallies = vote
            if outcome:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=outcome,
                    bill=bill)
                vote_event.add_source(web_source)
                vote_event.add_source(api_source + '/histories')
                for ballot in tallies:
                    option = ballot['VoteValueName'].lower()
                    # Normalize vendor option names where a mapping exists.
                    option = self.VOTE_OPTIONS.get(option, option)
                    voter = ballot['VotePersonName'].strip()
                    vote_event.vote(option, voter)
                yield vote_event
        for sponsorship in self.sponsorships(matter_key):
            bill.add_sponsorship(**sponsorship)
        for topic in self.topics(matter_key):
            bill.add_subject(topic['MatterIndexName'].strip())
        for attachment in self.attachments(matter_key):
            if attachment['MatterAttachmentName']:
                bill.add_version_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")
        bill.extras = {'local_classification': matter['MatterTypeName']}
        full_text = self.text(matter_key)
        if full_text:
            if full_text['MatterTextPlain']:
                bill.extras['plain_text'] = full_text['MatterTextPlain']
            if full_text['MatterTextRtf']:
                # Strip NUL characters, which the data store rejects.
                bill.extras['rtf_text'] = full_text['MatterTextRtf'].replace(
                    u'\u0000', '')
        yield bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Michigan bill: sponsors, actions, roll-call votes,
    versions and documents.  Yields VoteEvents as they are found and the
    Bill last.

    :param chamber: 'upper' or 'lower'
    :param session: biennium string, e.g. '2017-2018'
    :param bill_id: e.g. 'HB 4001'
    """
    # try and get bill for the first year of the session biennium
    url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
        session[:4],
        bill_id.replace(" ", "-"),
    )
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if ("Page Not Found" in html
            or "The bill you are looking for is not available yet" in html):
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[-4:],
            bill_id.replace(" ", "-"),
        )
        html = self.get(url).text
        if ("Page Not Found" in html
                or "The bill you are looking for is not available yet"
                in html):
            self.warning(
                "Cannot open bill page for {}; skipping".format(bill_id))
            return
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute("http://legislature.mi.gov")
    title = doc.xpath(
        '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()
    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(" ")[0][1:]]
    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)
    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u"\xa0", " ")
        # sometimes district gets added as a link
        if name.isnumeric():
            continue
        if len(sponsors) > 1:
            classification = ("primary" if sponsor.tail
                              and "primary" in sponsor.tail else "cosponsor")
        else:
            classification = "primary"
        bill.add_sponsorship(
            name=name.strip(),
            chamber=chamber,
            entity_type="person",
            primary=classification == "primary",
            classification=classification,
        )
    bill.subject = doc.xpath(
        '//span[@id="frg_billstatus_CategoryList"]/a/text()')
    # actions (skip header)
    for row in doc.xpath(
            '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath("td")  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(
            datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = "upper" if "SJ" in journal else "lower"
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor,
                        classification=classification)
        # check if action mentions a sub
        submatch = re.search(r"WITH SUBSTITUTE\s+([\w\-\d]+)", action,
                             re.IGNORECASE)
        if submatch and tds[2].xpath("a"):
            version_url = tds[2].xpath("a/@href")[0]
            version_name = tds[2].xpath("a/text()")[0].strip()
            version_name = "Substitute {}".format(version_name)
            self.info("Found Substitute {}".format(version_url))
            # Fixed: mimetype could previously be unbound (or stale from a
            # prior iteration) for other extensions; "" is pupa's default.
            mimetype = ""
            if version_url.lower().endswith(".pdf"):
                mimetype = "application/pdf"
            elif version_url.lower().endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(version_name, version_url,
                                  media_type=mimetype)
        # check if action mentions a vote
        rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath("a/@href")
            if journal_link:
                objectname = journal_link[0].rsplit("=", 1)[-1]
                chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                    session,
                    chamber_name,
                    objectname,
                )
                results = self.parse_roll_call(vote_url, rc_num, session)
                if results is not None:
                    vote_passed = len(results["yes"]) > len(results["no"])
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result="pass" if vote_passed else "fail",
                        classification="passage",
                    )
                    # check the expected counts vs actual
                    count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["yes"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d" %
                            (bill_id, action, count, len(results["yes"])))
                    count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["no"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d" %
                            (bill_id, action, count, len(results["no"])))
                    vote.set_count("yes", len(results["yes"]))
                    vote.set_count("no", len(results["no"]))
                    vote.set_count("other", len(results["other"]))
                    possible_vote_results = ["yes", "no", "other"]
                    for pvr in possible_vote_results:
                        for name in results[pvr]:
                            if session == "2017-2018":
                                names = name.split("\t")
                                for n in names:
                                    # Fixed: previously recorded the whole
                                    # unsplit `name` for every tab-separated
                                    # piece; record each piece instead.
                                    vote.vote(pvr, n.strip())
                            else:
                                # Prevents voter names like "House Bill
                                # No. 4451, entitled" and other sentences
                                if len(name.split()) < 5:
                                    vote.vote(pvr, name.strip())
                    vote.add_source(vote_url)
                    yield vote
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))
    # versions
    for row in doc.xpath(
            '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            # Fixed: same unbound/stale-mimetype hazard as above.
            mimetype = ""
            if url.endswith(".pdf"):
                mimetype = "application/pdf"
            elif url.endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(name, url, media_type=mimetype)
    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    yield bill
def scrape(self, session=None):
    '''Scrape Vermont bills and resolutions via the legislature's private
    JSON API, yielding VoteEvents as they are found and each Bill last.

    :param session: session name; defaults to the latest session.
    '''
    HTML_TAGS_RE = r'<.*?>'
    if session is None:
        session = self.latest_session()
    # e.g. '2017-2018' -> '2018' style slug used in the API URLs.
    year_slug = session[5:]
    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)['data'] or []
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])
    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])
    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}
        # Identify the bill type and chamber from the number prefix.
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'
        else:
            raise AssertionError(
                "Unknown bill type found: '{}'".
                format(info['BillNumber'])
            )
        # Create the bill using its basic information
        bill = Bill(
            identifier=info['BillNumber'],
            legislative_session=session,
            chamber=bill_chamber,
            title=info['Title'],
            classification=bill_type
        )
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)
        # Load the bill's information page to access its metadata
        bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)
        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li'
        )
        sponsor_type = 'primary'
        for sponsor in sponsors:
            # Everything after the 'Additional Sponsors' marker is a
            # cosponsor.
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue
            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # Skip the literal "Less" expander link.
            if sponsor_name and not \
                    (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type='person',
                    primary=(sponsor_type == 'primary')
                )
        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to
        # where it used to be' so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a |'
            '//ul[@class="bill-path"]//a'
        )
        for version in versions:
            if version.xpath('text()'):
                bill.add_version_link(
                    note=version.xpath('text()')[0],
                    url=version.xpath('@href')[0].replace(' ', '%20'),
                    media_type='application/pdf'
                )
        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode('utf-8')
            ).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".format(
                info['BillNumber']))
            yield bill
            continue
        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)
        chambers_passed = set()
        for action in actions:
            action = {k: v.strip() for k, v in action.items()}
            if "Signed by Governor" in action['FullStatus']:
                actor = 'executive'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")
            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                # A signature implies both chambers have already passed it.
                assert chambers_passed == set("HS")
                action_type = 'executive-signature'
            elif actor == 'lower' and any(x.lower().startswith('aspassed')
                                          for x in
                                          action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("H")
            elif actor == 'upper' and any(
                    x.lower().startswith(' aspassed')
                    or x.lower().startswith('aspassed')
                    for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("S")
            else:
                action_type = None
            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                date=datetime.datetime.strftime(
                    datetime.datetime.strptime(action['StatusDate'],
                                               '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                chamber=actor,
                classification=action_type
            )
        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)
        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = ('http://legislature.vermont.gov/bill/'
                             'loadBillRollCallDetails/{0}/{1}'.format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']
            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # Member names come as "Name of District".
                (member_name, _district) = member['MemberName'].split(" of ")
                member_name = member_name.strip()
                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)
            if "Passed -- " in vote['FullStatus']:
                did_pass = True
            elif "Failed -- " in vote['FullStatus']:
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")
            # Check vote counts
            yea_count = int(
                re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = int(
                re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))
            vote_to_add = VoteEvent(
                chamber=('lower' if vote['ChamberCode'] == 'H'
                         else 'upper'),
                start_date=datetime.datetime.strftime(
                    datetime.datetime.strptime(vote['StatusDate'],
                                               '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                motion_text=re.sub(HTML_TAGS_RE, "",
                                   vote['FullStatus']).strip(),
                result='pass' if did_pass else 'fail',
                classification='passage',
                legislative_session=session,
                bill=info['BillNumber'],
                bill_chamber=bill_chamber
            )
            vote_to_add.add_source(roll_call_url)
            vote_to_add.set_count('yes', yea_count)
            vote_to_add.set_count('no', nay_count)
            vote_to_add.set_count('not voting', len(roll_call_not_voting))
            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote('not voting', member)
            yield vote_to_add
        # Capture extra information- Not yet implemented
        # Witnesses:
        # http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members:
        # http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings:
        # http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
        yield bill
def scrape(self, session=None, chambers=None):
    """Yield Bill objects (and, via process_vote, VoteEvents) for an Ohio
    General Assembly session.

    Sessions before the 128th have no data; sessions 128-130 are delegated
    to the legacy scraper; 131+ use the undocumented solarapi JSON API.

    :param session: session identifier (e.g. "132"); defaults to the
        latest session when not given.
    :param chambers: accepted for scraper-interface compatibility; not
        used in this method.
    """
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))

    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)

    else:
        # Map API chamber names (inconsistent casing in the data) to the
        # canonical upper/lower identifiers.
        chamber_dict = {"Senate": "upper", "House": "lower",
                        "House of Representatives": "lower",
                        "house": "lower", "senate": "upper"}

        # so presumably not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {"approved": True,
                        "passed": True,
                        "adopted": True,
                        "true": True,
                        "false": False,
                        "failed": False,
                        True: True,
                        False: False}

        # Map the API's action codes to OCD action classifications.
        # None means "known code, no classification".
        action_dict = {"ref_ctte_100": "referral-committee",
                       "intro_100": "introduction",
                       "intro_101": "introduction",
                       "pass_300": "passage",
                       "intro_110": "reading-1",
                       "refer_210": "referral-committee",
                       "crpt_301": None,
                       "crpt_317": None,
                       "concur_606": "passage",
                       "pass_301": "passage",
                       "refer_220": "referral-committee",
                       "intro_102": ["introduction", "passage"],
                       "intro_105": ["introduction", "passage"],
                       "intro_ref_ctte_100": "referral-committee",
                       "refer_209": None,
                       "intro_108": ["introduction", "passage"],
                       "intro_103": ["introduction", "passage"],
                       "msg_reso_503": "passage",
                       "intro_107": ["introduction", "passage"],
                       "imm_consid_360": "passage",
                       "refer_213": None,
                       "adopt_reso_100": "passage",
                       "adopt_reso_110": "passage",
                       "msg_507": "amendment-passage",
                       "confer_713": None,
                       "concur_603": None,
                       "confer_712": None,
                       "msg_506": "amendment-failure",
                       "receive_message_100": "passage",
                       "motion_920": None,
                       "concur_611": None,
                       "confer_735": None,
                       "third_429": None,
                       "final_501": None,
                       "concur_608": None,
                       }

        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(
            session=session)
        legislators = self.get_legislator_ids(first_page)

        # Bulk-fetch supplementary documents once up front; they are
        # matched to bills by bill_id in add_document() below.
        all_amendments = self.get_other_data_source(
            first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url,
                                                 "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url,
                                                  "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url,
                                                  "analysiss")

        for row in self.get_bill_rows(session):
            # `status` is unpacked but not used below.
            number_link, _ga, title, primary_sponsor, status = row.xpath(
                'td')

            bill_id = number_link.text_content()
            title = title.text_content().strip()
            chamber = 'lower' if 'H' in bill_id else 'upper'
            classification = 'bill' if 'B' in bill_id else 'resolution'

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=title,
                        classification=classification)
            bill.add_source(number_link.xpath('a/@href')[0])

            # get bill from API
            bill_api_url = (
                'http://search-prod.lis.state.oh.us/solarapi/v1/'
                'general_assembly_{}/{}/{}/'.format(
                    session,
                    'bills' if 'B' in bill_id else 'resolutions',
                    bill_id.lower().replace(' ', '')))
            data = self.get(bill_api_url).json()

            # add title if no short title
            if not bill.title:
                bill.title = data['items'][0]['longtitle']
            bill.add_title(data['items'][0]['longtitle'], 'long title')

            # this stuff is version-specific
            for version in data['items']:
                version_name = version["version"]
                version_link = base_url + version["pdfDownloadLink"]
                bill.add_version_link(version_name, version_link,
                                      media_type='application/pdf')

            # we'll use latest bill_version for everything else
            bill_version = data['items'][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='primary',
                    entity_type='person',
                    primary=True
                )
            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='cosponsor',
                    entity_type='person',
                    primary=False,
                )

            # Actions: a missing action document (HTTP error) is silently
            # tolerated; the bill is emitted without actions.
            try:
                action_doc = self.get(
                    base_url + bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                # API returns newest-first; reverse to chronological order.
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning(
                            "Unknown action {desc} with code {code}."
                            " Add it to the action_dict"
                            ".".format(desc=action_desc,
                                       code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(
                        datetime.datetime.strptime(action["datetime"],
                                                   "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)
                    bill.add_action(action_desc, date, chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill,
                              base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill,
                              base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill,
                              base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill,
                              base_url)

            # votes
            # NOTE(review): this assumes every bill payload has a non-empty
            # "votes" list; a missing key/empty list would raise here —
            # confirm against the API.
            vote_url = base_url + bill_version["votes"][0]["link"]
            vote_doc = self.get(vote_url)
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url, base_url, bill,
                                         legislators, chamber_dict,
                                         vote_results)

            # committee votes; these pages sometimes fail to load, in
            # which case the whole bill row is skipped (note: not just
            # the committee votes).
            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning("Vote page not "
                             "loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url, base_url, bill,
                                         legislators, chamber_dict,
                                         vote_results)

            if data["items"][0]["effective_date"]:
                effective_date = datetime.datetime.strptime(
                    data["items"][0]["effective_date"], "%Y-%m-%d")
                effective_date = self._tz.localize(effective_date)
                # the OH website adds an action that isn't in the action
                # list JSON. It looks like:
                # Effective 7/6/18
                effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                effective_action = "Effective {}".format(effective_date_oh)
                bill.add_action(effective_action,
                                effective_date,
                                chamber="executive",
                                classification=["became-law"])

            # we have never seen a veto or a disapprove, but they seem
            # important. so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url + bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never"
                                         " gotten one before."
                                         " Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url + bill_version["disapprove"][0][
                    "link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError(
                        "Whoa, a disapprove! We've never"
                        " gotten one before."
                        " Go write some code to deal "
                        "with it: {}".format(disapprove_url))

            yield bill
def _scrape_bill(self, session, bill_data):
    """Build one NY Bill from an Open Legislation API payload.

    Yields any Senate VoteEvents first, then the Bill itself.

    :param session: legislative session string (e.g. "2017-2018").
    :param bill_data: decoded JSON dict for a single bill from the
        Open Legislation API.
    """
    details = self._parse_bill_details(bill_data)

    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (prefix, number, active_version)) = details

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=bill_chamber,
        title=title or bill_data['summary'],
        classification=bill_type,
    )

    if bill_data['summary']:
        bill.add_abstract(bill_data['summary'], note='')

    bill_active_version = bill_data['amendments']['items'][active_version]

    # Parse sponsors.
    if bill_data['sponsor'] is not None:
        if bill_data['sponsor']['rules'] is True:
            bill.add_sponsorship(
                'Rules Committee',
                entity_type='organization',
                classification='primary',
                primary=True,
            )
        elif not bill_data['sponsor']['budget']:
            primary_sponsor = bill_data['sponsor']['member']
            bill.add_sponsorship(
                primary_sponsor['shortName'],
                entity_type='person',
                classification='primary',
                primary=True,
            )

            # There *shouldn't* be cosponsors if there is no sponsor.
            cosponsors = bill_active_version['coSponsors']['items']
            for cosponsor in cosponsors:
                bill.add_sponsorship(
                    cosponsor['shortName'],
                    entity_type='person',
                    classification='cosponsor',
                    primary=False,
                )

    # List companion bill.
    same_as = bill_active_version.get('sameAs', {})
    # Check whether "sameAs" property is populated with at least one bill.
    # BUG FIX: this previously indexed same_as['items'] directly, which
    # raised KeyError whenever 'sameAs' was absent (the .get() default {}
    # has no 'items' key). A missing key now means "no companion".
    if same_as.get('items'):
        # Get companion bill ID.
        companion_bill_id = same_as['items'][0]['basePrintNo']

        # Build companion bill session.
        start_year = same_as['items'][0]['session']
        end_year = start_year + 1
        companion_bill_session = '-'.join([str(start_year), str(end_year)])

        # Attach companion bill data.
        bill.add_related_bill(
            companion_bill_id,
            companion_bill_session,
            relation_type='companion',
        )

    # Parse actions.
    chamber_map = {
        'senate': 'upper',
        'assembly': 'lower',
    }

    for action in bill_data['actions']['items']:
        chamber = chamber_map[action['chamber'].lower()]
        action_datetime = datetime.datetime.strptime(
            action['date'], '%Y-%m-%d')
        action_date = action_datetime.date()
        types, _ = NYBillScraper.categorizer.categorize(action['text'])

        bill.add_action(
            action['text'],
            action_date.strftime('%Y-%m-%d'),
            chamber=chamber,
            classification=types,
        )

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    api_url = self.api_client.root + self.api_client.resources[
        'bill'].format(
            session_year=session, bill_id=bill_id, summary='', detail='')
    bill.add_source(api_url)
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    # Chamber-specific processing.
    if bill_chamber == 'upper':
        # Collect votes.
        for vote_data in bill_data['votes']['items']:
            yield self._parse_senate_votes(vote_data, bill, api_url)
    elif bill_chamber == 'lower':
        assembly = AssemblyBillPage(self, session, bill, details)
        assembly.build()

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data['amendments']['items']
    for key, amendment in amendments.items():
        version = amendment['printNo']

        html_version = version + ' HTML'
        html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
            '{}&term={}'.format(bill_id, self.term_start_year)
        bill.add_version_link(
            html_version,
            html_url,
            on_duplicate='ignore',
            media_type='text/html',
        )

        pdf_version = version + ' PDF'
        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
            .format(self.term_start_year, bill_id)
        bill.add_version_link(
            pdf_version,
            pdf_url,
            on_duplicate='ignore',
            media_type='application/pdf',
        )

    yield bill
def scrape_bill(self, chamber, session, bill_id, title, url):
    """Scrape a single South Dakota bill page: versions, sponsors,
    actions (with embedded roll-call votes), and keyword subjects.

    Yields VoteEvents found in the action table, then the Bill.

    FIX: the regex patterns below are now raw strings. They previously
    relied on invalid string escapes (``\?``, ``\.``, ``\d``), which emit
    DeprecationWarning and will become a SyntaxError in future Python;
    the resulting pattern bytes are unchanged.
    """
    page = self.lxmlize(url)

    # Classify from the bill-number prefix; unknown prefixes are bills.
    if re.match(r'^(S|H)B ', bill_id):
        btype = ['bill']
    elif re.match(r'(S|H)C ', bill_id):
        btype = ['commemoration']
    elif re.match(r'(S|H)JR ', bill_id):
        btype = ['joint resolution']
    elif re.match(r'(S|H)CR ', bill_id):
        btype = ['concurrent resolution']
    else:
        btype = ['bill']

    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=btype)
    bill.add_source(url)

    # Version links are matched with an EXSLT regex in the xpath.
    regex_ns = "http://exslt.org/regular-expressions"
    version_links = page.xpath(
        r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
        namespaces={'re': regex_ns})
    for link in version_links:
        bill.add_version_link(
            link.xpath('string()').strip(),
            link.attrib['href'],
            media_type='text/html',
            on_duplicate='ignore'
        )

    sponsor_links = page.xpath(
        "//td[contains(@id, 'tdSponsors')]/a")
    for link in sponsor_links:
        bill.add_sponsorship(
            link.text,
            classification='primary',
            primary=True,
            entity_type='person'
        )

    actor = chamber
    use_row = False
    self.debug(bill_id)

    for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"):
        # Skip everything up to and including the header row.
        if 'Date' in row.text_content() and 'Action' in row.text_content():
            use_row = True
            continue
        elif not use_row:
            continue

        action = row.xpath("string(td[2])").strip()

        atypes = []
        if action.startswith('First read'):
            atypes.append('introduction')
            atypes.append('reading-1')
        elif action.startswith('Signed by Governor'):
            atypes.append('executive-signature')
            actor = 'executive'

        match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                         action)
        if match:
            if match.group(1) in ['Senate',
                                  'House of Representatives']:
                first = ''
            else:
                first = 'committee-'
            if match.group(3).lower() == 'passed':
                second = 'passage'
            elif match.group(3).lower() == 'failed':
                second = 'failure'
            atypes.append("%s%s" % (first, second))

        if 'referred to' in action.lower():
            atypes.append('referral-committee')

        if 'Motion to amend, Passed Amendment' in action:
            atypes.append('amendment-introduction')
            atypes.append('amendment-passage')

        if 'Veto override, Passed' in action:
            atypes.append('veto-override-passage')
        elif 'Veto override, Failed' in action:
            atypes.append('veto-override-failure')

        if 'Delivered to the Governor' in action:
            atypes.append('executive-receipt')

        # First reading pins the actor for subsequent rows.
        match = re.match("First read in (Senate|House)", action)
        if match:
            if match.group(1) == 'Senate':
                actor = 'upper'
            else:
                actor = 'lower'

        date = row.xpath("string(td[1])").strip()
        match = re.match(r'\d{2}/\d{2}/\d{4}', date)
        if not match:
            self.warning("Bad date: %s" % date)
            continue
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        # Roll-call links inside the action cell become VoteEvents.
        for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
            yield from self.scrape_vote(bill, date, link.attrib['href'])

        bill.add_action(action, date, chamber=actor,
                        classification=atypes)

    for link in page.xpath("//a[contains(@href, 'Keyword')]"):
        bill.add_subject(link.text.strip())

    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape a single Utah bill page: title, sponsors, versions,
    related documents, subjects, and (via parse_status) actions/votes.

    Yields whatever parse_status yields, then the Bill.
    """
    page = self.lxmlize(url)

    (header, ) = page.xpath('//h3[@class="heading"]/text()')
    title = header.replace(bill_id, "").strip()

    if ".B. " in bill_id:
        bill_type = "bill"
    elif bill_id.startswith("H.R. ") or bill_id.startswith("S.R. "):
        bill_type = "resolution"
    elif ".C.R. " in bill_id:
        bill_type = "concurrent resolution"
    elif ".J.R. " in bill_id:
        bill_type = "joint resolution"
    else:
        # BUG FIX: previously there was no fallback branch, so any
        # identifier outside the known patterns left `bill_type` unbound
        # and raised NameError at the Bill() call below.
        bill_type = "bill"

    # Strip substitute markers out of the identifier.
    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)

    primary_info = page.xpath('//div[@id="billsponsordiv"]')
    for info in primary_info:
        try:
            (title, name) = [
                x.strip() for x in info.xpath(".//text()") if x.strip()
            ]
        except ValueError:
            self.warning(
                "Could not find sponsor's name for {}".format(bill_id))
            continue
        assert title == "Bill Sponsor:"
        name = name.replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(name, classification="primary",
                             entity_type="person", primary=True)

    floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
    floor_info = [x.strip() for x in floor_info if x.strip()]
    if len(floor_info) in (0, 1):
        # This indicates that no floor sponsor was found
        pass
    elif len(floor_info) == 2:
        assert floor_info[0] == "Floor Sponsor:"
        floor_sponsor = floor_info[1].replace("Sen. ", "").replace(
            "Rep. ", "")
        bill.add_sponsorship(
            floor_sponsor,
            classification="cosponsor",
            entity_type="person",
            primary=False,
        )
    else:
        raise AssertionError("Unexpected floor sponsor HTML found")

    versions = page.xpath(
        '//b[text()="Bill Text"]/following-sibling::ul/li/'
        'a[text() and not(text()=" ")]')
    for version in versions:
        # sometimes the href is on the following <a> tag and the tag we
        # have has an onclick
        url = version.get("href")
        if not url:
            url = version.xpath("following-sibling::a[1]/@href")[0]
        bill.add_version_link(version.xpath("text()")[0].strip(),
                              url,
                              media_type="application/pdf")

    for related in page.xpath(
            '//b[text()="Related Documents "]/following-sibling::ul/li/'
            'a[contains(@class,"nlink")]'):
        href = related.xpath("@href")[0]
        if ".fn.pdf" in href:
            bill.add_document_link("Fiscal Note", href,
                                   media_type="application/pdf")
        else:
            text = related.xpath("text()")[0]
            bill.add_document_link(text, href,
                                   media_type="application/pdf")

    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill.subject = subjects

    if page.xpath('//div[@id="billStatus"]//table'):
        status_table = page.xpath('//div[@id="billStatus"]//table')[0]
        yield from self.parse_status(bill, status_table, chamber)

    yield bill
def scrape_senate_bills(self, chamber, insert, session, year):
    """Scrape all Nevada Senate bills for a session by walking the
    per-doctype history listings, yielding VoteEvents then each Bill.

    :param chamber: chamber identifier attached to every bill.
    :param insert: session path fragment used in leg.state.nv.us URLs.
    :param session: legislative session identifier.
    :param year: session year (forwarded to scrape_votes).
    """
    # Doctype IDs in the HistListBills report mapped to classifications.
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}
    for docnum, bill_type in doc_type.items():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \
            'HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        # NOTE(review): `count` is incremented but never read afterwards.
        count = 0
        for link in links:
            count += 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                insert, link)
            page = self.get(page_path).text
            # Normalize non-breaking spaces before parsing.
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            bill_id = root.xpath('string(/html/body/div[@id="content"]' +
                                 '/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type
                        )
            # De-duplicate subjects gathered earlier for this bill id.
            bill.subject = list(set(self.subject_mapping[bill_id]))

            # The "Bill Text" table holds the PDF version link.
            for table in root.xpath('//div[@id="content"]/table'):
                if 'Bill Text' in table.text_content():
                    bill_text = table.xpath("string(tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version_link(note="Bill Text",
                                          url=text_url,
                                          media_type='application/pdf')

            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsorship(name=leg,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)
            for leg in secondary:
                bill.add_sponsorship(name=leg,
                                     classification='cosponsor',
                                     entity_type='person',
                                     primary=False)

            # Minutes/agenda links: the anchor is in column 3 of table 4,
            # while the matching date text lives in column 2 of the same
            # row; `minutes_count` tracks the row index (rows start at 2
            # because row 1 is presumably a header — TODO confirm).
            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % (
                    minutes_count)
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = minutes_date[0] + minutes_date[1] + \
                    minutes_date[2] + " Agenda"
                # bill.add_document(minutes_date, minutes_url)
                bill.add_document_link(note=minutes_date, url=minutes_url)
                minutes_count = minutes_count + 1

            self.scrape_actions(root, bill, "upper")
            yield from self.scrape_votes(page, page_path, bill, insert,
                                         year)
            bill.add_source(page_path)
            yield bill
def bill_info(self, bill_link, session, main_url):
    """Scrape one Nebraska bill detail page.

    Yields the Bill, then any VoteEvents found on the page.

    :param bill_link: URL of the bill detail page.
    :param session: legislative session identifier.
    :param main_url: listing page URL, recorded as a source.
    """
    bill_page = self.lxmlize(bill_link)

    long_title = self.get_node(
        bill_page,
        '//div[@class="main-content"]//h2').text.split()
    bill_number = long_title[0]
    # Everything after the bill number and the separator token is the
    # title. (Previously built with a quadratic += loop plus a trailing
    # slice; ' '.join is equivalent and linear.)
    title = ' '.join(long_title[2:])

    if not title:
        self.error('no title, skipping %s', bill_number)
        return

    bill_type = 'resolution' if 'LR' in bill_number else 'bill'

    bill = Bill(bill_number, session, title, classification=bill_type)
    bill.add_source(main_url)
    bill.add_source(bill_link)

    introduced_by = self.get_node(
        bill_page,
        '//body/div[3]/div[2]/div[2]/div/div[3]/div[1]/ul/li[1]/a[1]/text()'
    )
    if not introduced_by:
        # Fallback layout: the sponsor is plain text prefixed with a label.
        introduced_by = self.get_node(
            bill_page,
            '//body/div[3]/div[2]/div[2]/div/div[2]/div[1]/ul/li[1]/text()'
        )
        introduced_by = introduced_by.split('Introduced By:')[1].strip()

    introduced_by = introduced_by.strip()
    bill.add_sponsorship(
        name=introduced_by,
        entity_type='person',
        primary=True,
        classification='primary',
    )

    # BUG FIX: `actor` was previously assigned only inside the action
    # loop, so a bill with no action rows raised NameError at the
    # scrape_votes() call at the end of this method. Default matches the
    # loop's own fallback.
    actor = 'legislature'

    action_nodes = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]//table/tbody/tr')

    for action_node in action_nodes:
        date = self.get_node(action_node, './td[1]').text
        date = datetime.strptime(date, '%b %d, %Y')

        # The action node may have an anchor element within it, so
        # we grab all the text within.
        action = self.get_node(action_node, './td[2]').text_content()

        # Only Governor actions are executive; everything else
        # (including Speaker actions) is attributed to the legislature.
        actor = 'executive' if 'Governor' in action else 'legislature'

        action_type = self.action_types(action)
        bill.add_action(
            action,
            date.strftime('%Y-%m-%d'),
            chamber=actor,
            classification=action_type,
        )

    # We're in reverse chronological order.
    bill.actions.reverse()

    # Grabs bill version documents.
    version_links = self.get_nodes(
        bill_page,
        '/html/body/div[3]/div[2]/div[2]/div/'
        'div[3]/div[2]/ul/li/a')
    for version_link in version_links:
        version_name = version_link.text
        version_url = version_link.attrib['href']
        # replace Current w/ session number
        version_url = version_url.replace('Current', session)
        bill.add_version_link(version_name, version_url,
                              media_type='application/pdf')

    # Adds any documents related to amendments.
    amendment_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')
    for amendment_link in amendment_links:
        amendment_name = amendment_link.text
        amendment_url = amendment_link.attrib['href']
        bill.add_document_link(amendment_name, amendment_url)
    self.scrape_amendments(bill, bill_page)

    # Related transcripts.
    transcript_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]/div[2]/'
        'div[@class="hidden-xs"]/table/tr/td/a')
    for transcript_link in transcript_links:
        transcript_name = transcript_link.text
        transcript_url = transcript_link.attrib['href']
        bill.add_document_link(transcript_name, transcript_url)

    yield bill

    # `actor` here reflects the last action processed (or the default).
    yield from self.scrape_votes(bill, bill_page, actor)
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.

    Yields VoteEvents (via parse_vote) as they are found in the action
    table, then the Bill itself.

    :param chamber: originating chamber ('upper'/'lower').
    :param session: session string; its first four chars are used as the
        year when parsing action dates.
    :param bill_id: e.g. "H 123"; spaces are stripped for URL/subject keys.
    :param short_title: optional short title, added as an extra title.
    """
    url = BILL_URL % (session, bill_id.replace(" ", ""))
    bill_page = self.get(url).text
    html = lxml.html.fromstring(bill_page)
    html.make_links_absolute(
        "http://legislature.idaho.gov/legislation/%s/" % session)
    bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
    # Table layout: [0] sponsors, [1] title, [2] action history.
    title = bill_tables[1].text_content().strip()
    bill_type = get_bill_type(bill_id)
    bill = Bill(
        legislative_session=session,
        chamber=chamber,
        identifier=bill_id,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)
    for subject in self._subjects[bill_id.replace(" ", "")]:
        bill.add_subject(subject)

    if short_title and title.lower() != short_title.lower():
        bill.add_title(short_title, "short title")

    # documents
    doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
    for link in doc_links:
        name = link.text_content().strip()
        href = link.get("href")
        # Bill text / engrossments / amendments are versions; everything
        # else is a supporting document.
        if "Engrossment" in name or "Bill Text" in name or \
                "Amendment" in name:
            bill.add_version_link(note=name, url=href,
                                  media_type="application/pdf")
        else:
            bill.add_document_link(note=name, url=href,
                                   media_type="application/pdf")

    def _split(string):
        # Split a sponsor list on separator tokens.
        # NOTE(review): the character class [,|AND] matches single
        # characters (',', '|', 'A', 'N', 'D'), not the word "AND" —
        # presumably intended as (,|AND); confirm before changing.
        return re.split(r"\w+[,|AND]\s+", string)

    # sponsors range from a committee to one legislator to a group of legs
    sponsor_lists = bill_tables[0].text_content().split("by")
    if len(sponsor_lists) > 1:
        for sponsors in sponsor_lists[1:]:
            if "COMMITTEE" in sponsors.upper():
                bill.add_sponsorship(
                    name=sponsors.strip(),
                    entity_type="organization",
                    primary=True,
                    classification="primary",
                )
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsorship(
                            classification="primary",
                            name=person,
                            entity_type="person",
                            primary=True,
                        )

    actor = chamber
    last_date = None
    # if a bill has passed a chamber or been 'received from'
    # then the next committee passage is in the opposite chamber
    has_moved_chambers = False
    for row in bill_tables[2]:
        # lots of empty rows
        if len(row) == 1:
            continue
        _, date, action, _ = [x.text_content().strip() for x in row]

        # A blank date cell means "same date as the previous row".
        if date:
            last_date = date
        else:
            date = last_date

        # Action dates carry no year; borrow it from the session string.
        date = datetime.datetime.strptime(date + "/" + session[0:4],
                                          "%m/%d/%Y").strftime("%Y-%m-%d")
        if action.startswith("House"):
            actor = "lower"
        elif action.startswith("Senate"):
            actor = "upper"

        # votes
        if "AYES" in action or "NAYS" in action:
            yield from self.parse_vote(actor, date, row[2], session,
                                       bill_id, chamber, url)
            # bill.add_vote_event(vote)
        # some td's text is seperated by br elements
        if len(row[2]):
            action = "".join(row[2].itertext())
        action = action.replace(u"\xa0", " ").strip()
        atype = get_action(actor, action)
        if atype and "passage" in atype:
            has_moved_chambers = True

        if atype and "committee-passage" in atype and has_moved_chambers:
            actor = _OTHER_CHAMBERS[actor]

        bill.add_action(action, date, chamber=actor, classification=atype)

        # after voice vote/roll call and some actions the bill is sent
        # 'to House' or 'to Senate'
        if "to House" in action:
            actor = "lower"
        elif "to Senate" in action:
            actor = "upper"
    yield bill
def _parse_senate_billpage(self, bill_url, year):
    """Scrape a single Missouri Senate bill detail page and yield the
    resulting Bill.

    :param bill_url: URL of the senate bill page; recorded as a source.
    :param year: used as the legislative session identifier.
    """
    page = self.lxmlize(bill_url)

    # Pull the labeled fields off the page.
    # TODO probably still needs to be fixed
    bill_id = page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()

    # Classify by the three-character bill-number prefix; anything
    # unrecognized is a plain bill.
    bill_type = bill_types.get(bill_id[:3], "bill")

    # Subjects are keyed on the identifier with spaces removed.
    bid = bill_id.replace(" ", "")
    subs = []
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=year,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)
    if bill_title:
        bill.add_title(bill_title)

    # The primary sponsor anchor is always present.
    sponsor_anchor = page.xpath('//a[@id="hlSponsor"]')[0]
    bill.add_sponsorship(
        sponsor_anchor.text_content(),
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_anchors = page.xpath('//a[@id="hlCoSponsors"]')
    if cosponsor_anchors and cosponsor_anchors[0].attrib.get('href'):
        self._parse_senate_cosponsors(
            bill, cosponsor_anchors[0].attrib['href'])

    # Actions also live on their own page.
    action_anchors = page.xpath('//a[@id="hlAllActions"]')
    if action_anchors:
        self._parse_senate_actions(
            bill, action_anchors[0].attrib['href'])

    # Version documents are stored on a separate page as well.
    version_anchors = page.xpath('//a[@id="hlFullBillText"]')
    if version_anchors and version_anchors[0].attrib.get('href'):
        self._parse_senate_bill_versions(
            bill, version_anchors[0].attrib['href'])

    yield bill
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape one Kentucky bill page: version, actions, subjects,
    fiscal notes, sponsors, and (if present) vote history.

    Yields VoteEvents (via scrape_votes) first, then the Bill.
    Returns early (yielding nothing) on HTTP errors or withdrawn bills.
    """
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    def _mimetype_for(doc_url):
        # Map a document URL to its MIME type. BUG FIX: previously
        # `mimetype` was left unbound for any extension other than
        # .doc/.pdf, raising NameError; fall back to None instead.
        if doc_url.endswith(".doc"):
            return "application/msword"
        elif doc_url.endswith(".pdf"):
            return "application/pdf"
        return None

    last_action = self.parse_bill_field(
        page, "Last Action").xpath("text()")[0]
    if "WITHDRAWN" in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return

    version = self.parse_bill_field(page, "Bill Documents")
    # BUG FIX: this withdrawn-bill check used to sit *after* the
    # version.xpath(...) calls below, so it could never trigger — the
    # dereference would already have failed. Check before using it.
    if version is None:
        self.logger.warning("Bill withdrawn.")
        return

    source_url = version.xpath("a[1]/@href")[0]
    version_title = version.xpath("a[1]/text()")[0].strip()
    mimetype = _mimetype_for(source_url)

    title = self.parse_bill_field(page, "Title").text_content()

    if "CR" in bill_id:
        bill_type = "concurrent resolution"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link(version_title, source_url, media_type=mimetype)

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib["href"]
        bill.add_document_link("Fiscal Note", source_url,
                               media_type=_mimetype_for(source_url))

    for link in page.xpath(
            "//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath(
            "//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    bdr_no = self.parse_bill_field(page, "Bill Request Number")
    if bdr_no.xpath("text()"):
        bdr = bdr_no.xpath("text()")[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
def _parse_house_bill(self, url, session):
    """Scrape one MO House bill summary page; yields the Bill and actions."""
    # The print-friendly page is simpler and *drastically* smaller
    # (8k rather than 100k).
    url = re.sub("billsummary", "billsummaryprn", url)
    url = '%s/%s' % (self._house_base_url, url)

    # The summary is served inside an iframe, so fetch the framed
    # content directly:
    # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
    # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new
    url = url.replace('Bill.aspx', 'BillContent.aspx')
    url = url.replace('&code=R', '&code=R&style=new')

    bill_page = lxml.html.fromstring(self.get(url).text)
    bill_page.make_links_absolute(url)

    id_nodes = bill_page.xpath('//*[@class="entry-title"]/div')
    if len(id_nodes) == 0:
        self.info("WARNING: bill summary page is blank! (%s)" % url)
        self._bad_urls.append(url)
        return
    bill_id = clean_text(id_nodes[0].text_content())

    bill_desc = clean_text(
        bill_page.xpath('//*[@class="BillDescription"]')[0].text_content())

    table_rows = bill_page.xpath('//table/tr')
    # A cosponsor adds an extra row, pushing everything below down one.
    cosponsor_offset = 0
    if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
        cosponsor_offset = 1

    lr_label_tag = table_rows[3 + cosponsor_offset]
    assert lr_label_tag[0].text_content().strip() == 'LR Number:'

    # A "Governor Action:" row shifts the remaining rows down one more.
    last_action_offset = 0
    if table_rows[4 + cosponsor_offset][0].text_content().strip() == 'Governor Action:':
        last_action_offset = 1

    official_title_tag = table_rows[5 + cosponsor_offset + last_action_offset]
    assert official_title_tag[0].text_content().strip() == 'Bill String:'
    official_title = official_title_tag[1].text_content()

    # The description could substitute for the name, but keep them separate.
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]
        bill_number = int(bill_id[4:])
    else:
        bill_number = int(bill_id[3:])

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    if bill_desc == "":
        if bill_number <= 20:
            # blank bill titles early in session are approp. bills
            bill_desc = 'Appropriations Bill'
        else:
            self.error("Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title))
            return

    bill = Bill(
        bill_id,
        chamber='lower',
        title=bill_desc,
        legislative_session=session,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_title(official_title, note='official')
    bill.add_source(url)

    bill_sponsor = clean_text(table_rows[0][1].text_content())
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors live on a separate page
    sponsors_url, = bill_page.xpath(
        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)

    actions_link, = bill_page.xpath(
        "//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)

    # bill documents
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc = clean_text(doc_tag.text_content())
        text_url = '%s%s' % (self._house_base_url, doc_tag[0].attrib['href'])
        bill.add_document_link(doc, text_url, media_type='text/html')

    # bill versions (older layout)
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            mimetype = 'application/pdf' if vurl.text == 'PDF' else 'text/html'
            bill.add_version_link(version, vurl.attrib['href'],
                                  media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill versions: everything between the row containing
    # "Bill Text" and the next div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Text")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]')
    for row in version_rows:
        # some rows are just broken links, not real versions
        links = row.xpath('.//div[contains(@class,"textType")]/a/@href')
        if links:
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            path = links[0].strip()
            mimetype = 'application/pdf' if '.pdf' in path else 'text/html'
            bill.add_version_link(version, path, media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill summaries: everything between the row containing
    # "Bill Summary" and the next div.DocHeaderRow.
    # NOTE(review): this predicate counts *following* DocHeaderRow siblings
    # while the version query above counts *preceding* ones — confirm the
    # asymmetry is intentional.
    summary_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Summary")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]')
    # if there are no amendments, we need a different xpath for summaries
    if not summary_rows:
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(summary_rows):
        version = row.xpath(
            './/div[contains(@class,"textType")]/a/text()')[0].strip()
        if version:
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            summary_name = 'Bill Summary ({})'.format(version)
            mimetype = 'application/pdf' if '.pdf' in path else 'text/html'
            bill.add_document_link(summary_name, path, media_type=mimetype,
                                   on_duplicate='ignore')

    # house bill amendments
    amendment_rows = bill_page.xpath(
        '//div[contains(text(),"Amendment")]/'
        'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(amendment_rows):
        version = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
        path = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
        summary_name = 'Amendment {}'.format(version)
        # annotate the name with any outcome icons present on the row
        if row.xpath('.//img[contains(@title,"Defeated")]'):
            summary_name = '{} (Defeated)'.format(summary_name)
        if row.xpath('.//img[contains(@title,"Adopted")]'):
            summary_name = '{} (Adopted)'.format(summary_name)
        if row.xpath('.//img[contains(@title,"Distributed")]'):
            summary_name = '{} (Distributed)'.format(summary_name)
        mimetype = 'application/pdf' if '.pdf' in path else 'text/html'
        bill.add_version_link(summary_name, path, media_type=mimetype,
                              on_duplicate='ignore')

    yield bill
def scrape_bill(self, session, bill_id, chamber):
    """Scrape one MA bill detail page.

    Yields the Bill (and vote events via scrape_actions); returns False
    when the page is unavailable, titleless, or an unsupported type.
    """
    # e.g. https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)
    bill_url = "https://malegislature.gov/Bills/{}/{}".format(
        session_for_url, bill_id)

    try:
        response = self.get(bill_url)
        self.info("GET (with `requests`) - {}".format(bill_url))
    except requests.exceptions.RequestException:
        self.warning(u"Server Error on {}".format(bill_url))
        return False

    page = lxml.html.fromstring(response.text)
    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u"Server Error on {}".format(bill_url))
        return False

    # The state website will periodically miss a few bills' titles for a
    # few days: extant on the list page, missing on the detail page, and
    # eventually populated. Treat that as a temporary skip.
    title_nodes = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')
    if not title_nodes:
        self.warning("Couldn't find title for {}; skipping".format(bill_id))
        return False
    bill_title = title_nodes[0]

    bill_types = ["H", "HD", "S", "SD", "SRes"]
    if re.sub("[0-9]", "", bill_id) not in bill_types:
        self.warning("Unsupported bill type for {}; skipping".format(bill_id))
        return False

    if "SRes" in bill_id:
        bill_id = bill_id.replace("SRes", "SR")

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification="bill",
    )

    pinslip = page.xpath('//p[@id="pinslip"]/text()')
    if pinslip and pinslip[0]:
        bill.add_abstract(pinslip[0], "summary")

    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor_texts = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        "following-sibling::dd/descendant-or-self::*/text()[normalize-space()]"
    )
    if sponsor_texts:
        bill.add_sponsorship(
            sponsor_texts[0].strip(),
            classification="primary", primary=True, entity_type="person"
        )

    self.scrape_cosponsors(bill, bill_url)

    version = page.xpath(
        "//div[contains(@class, 'modalBtnGroup')]/"
        "a[contains(text(), 'Download PDF') and not(@disabled)]/@href"
    )
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version_link(
            "Bill Text", version_url, media_type="application/pdf"
        )

    # yield back votes and bill
    yield from self.scrape_actions(bill, bill_url, session)
    yield bill
def scrape(self, session=None):
    """Scrape Indiana bills for a session via the IGA API.

    Yields Bill objects (plus version/vote data via deal_with_version).
    Falls back to the latest session when none is given.
    """
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # Maps bill-id prefix -> pupa classification and the URL path segment
    # used to build the public bill URL.
    self._bill_prefix_map = {
        "HB": {"type": "bill", "url_segment": "bills/house"},
        "HR": {"type": "resolution",
               "url_segment": "resolutions/house/simple"},
        "HCR": {"type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent"},
        "HJR": {"type": "joint resolution",
                "url_segment": "resolutions/house/joint"},
        "HC": {"type": "concurrent resolution",
               "url_segment": "resolutions/house/concurrent"},
        "HJ": {"type": "joint resolution",
               "url_segment": "resolutions/house/joint"},
        "SB": {"type": "bill", "url_segment": "bills/senate"},
        "SR": {"type": "resolution",
               "url_segment": "resolutions/senate/simple"},
        "SCR": {"type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent"},
        "SJR": {"type": "joint resolution",
                "url_segment": "resolutions/senate/joint"},
        "SC": {"type": "concurrent resolution",
               "url_segment": "resolutions/senate/concurrent"},
        "SJ": {"type": "joint resolution",
               "url_segment": "resolutions/senate/joint"},
    }

    api_base_url = "https://api.iga.in.gov"
    proxy = {"url": "http://in-proxy.openstates.org"}

    # ah, indiana. it's really, really hard to find pdfs in their web
    # interface. Super easy with the api, but a key needs to be passed
    # in the headers. To make these documents viewable to the public and
    # our scrapers, sunlight's put up a proxy service at this link using
    # our api key for pdf document access.

    client = ApiClient(self)
    r = client.get("bills", session=session)
    all_pages = client.unpaginate(r)

    for b in all_pages:
        bill_id = b["billName"]
        # Turn e.g. "HB1001" into "HB 1001" at the first digit.
        # Fall back to the raw id: previously a digitless id left
        # disp_bill_id unbound and raised NameError below.
        disp_bill_id = bill_id
        for idx, char in enumerate(bill_id):
            try:
                int(char)
            except ValueError:
                continue
            disp_bill_id = bill_id[:idx] + " " + str(int(bill_id[idx:]))
            break

        bill_link = b["link"]
        api_source = api_base_url + bill_link

        try:
            bill_json = client.get("bill", session=session,
                                   bill_id=bill_id.lower())
        except scrapelib.HTTPError:
            self.logger.warning("Bill could not be accessed. Skipping.")
            continue

        title = bill_json["description"]
        if title == "NoneNone":
            title = None
        # sometimes description is blank; if that's the case, we can
        # check to see if the latest version has a short description
        if not title:
            title = bill_json["latestVersion"]["shortDescription"]
        # and if that doesn't work, use the bill_id but throw a warning
        if not title:
            title = bill_id
            self.logger.warning(
                "Bill is missing a title, using bill id instead.")

        bill_prefix = self._get_bill_id_components(bill_id)[0]
        original_chamber = ("lower" if bill_json["originChamber"].lower()
                            == "house" else "upper")
        bill_type = self._bill_prefix_map[bill_prefix]["type"]

        bill = Bill(
            disp_bill_id,
            legislative_session=session,
            chamber=original_chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(self._get_bill_url(session, bill_id))
        bill.add_source(api_source)

        # sponsors
        for s in bill_json["authors"]:
            bill.add_sponsorship(
                classification="author",
                name=self._get_name(s),
                entity_type="person",
                primary=True,
            )
        for s in bill_json["coauthors"]:
            bill.add_sponsorship(
                classification="coauthor",
                name=self._get_name(s),
                entity_type="person",
                primary=False,
            )
        for s in bill_json["sponsors"]:
            bill.add_sponsorship(
                classification="sponsor",
                name=self._get_name(s),
                entity_type="person",
                primary=True,
            )
        for s in bill_json["cosponsors"]:
            bill.add_sponsorship(
                classification="cosponsor",
                name=self._get_name(s),
                entity_type="person",
                primary=False,
            )

        # actions
        action_link = bill_json["actions"]["link"]
        api_source = api_base_url + action_link
        try:
            actions = client.get("bill_actions", session=session,
                                 bill_id=bill_id.lower())
        except scrapelib.HTTPError:
            self.logger.warning("Could not find bill actions page")
            actions = {"items": []}

        for a in actions["items"]:
            action_desc = a["description"]
            if "governor" in action_desc.lower():
                action_chamber = "executive"
            elif a["chamber"]["name"].lower() == "house":
                action_chamber = "lower"
            else:
                action_chamber = "upper"

            date = a["date"]
            if not date:
                self.logger.warning("Action has no date, skipping")
                continue
            # convert time to pupa fuzzy time
            date = date.replace("T", " ")
            # TODO: if we update pupa to accept datetimes we can drop this line
            date = date.split()[0]

            action_type = []
            d = action_desc.lower()
            committee = None
            reading = False

            if "first reading" in d:
                action_type.append("reading-1")
                reading = True
            if "second reading" in d or "reread second time" in d:
                action_type.append("reading-2")
                reading = True
            if "third reading" in d or "reread third time" in d:
                action_type.append("reading-3")
                if "passed" in d:
                    action_type.append("passage")
                if "failed" in d:
                    action_type.append("failure")
                reading = True
            if "adopted" in d and reading:
                action_type.append("passage")
            if ("referred" in d and "committee on" in d
                    or "reassigned" in d and "committee on" in d):
                committee = d.split("committee on")[-1].strip()
                action_type.append("referral-committee")
            if "committee report" in d:
                if "pass" in d:
                    action_type.append("committee-passage")
                if "fail" in d:
                    action_type.append("committee-failure")
            if "amendment" in d and "without amendment" not in d:
                if "pass" in d or "prevail" in d or "adopted" in d:
                    action_type.append("amendment-passage")
                # BUG FIX: was `if "fail" or "out of order" in d:`, which is
                # always truthy ("fail" is a non-empty string), so every
                # amendment action got tagged amendment-failure.
                if "fail" in d or "out of order" in d:
                    action_type.append("amendment-failure")
                if "withdraw" in d:
                    action_type.append("amendment-withdrawal")
            if "signed by the governor" in d:
                action_type.append("executive-signature")

            if len(action_type) == 0:
                # calling it other and moving on with a warning
                self.logger.warning(
                    "Could not recognize an action in '{}'".format(
                        action_desc))
                action_type = None

            a = bill.add_action(
                chamber=action_chamber,
                description=action_desc,
                date=date,
                classification=action_type,
            )
            if committee:
                a.add_related_entity(committee, entity_type="organization")

        # subjects
        subjects = [
            s["entry"] for s in bill_json["latestVersion"]["subjects"]
        ]
        for subject in subjects:
            bill.add_subject(subject)

        # Abstract
        if bill_json["latestVersion"]["digest"]:
            bill.add_abstract(bill_json["latestVersion"]["digest"],
                              note="Digest")

        # versions and votes
        for version in bill_json["versions"][::-1]:
            try:
                version_json = client.get(
                    "bill_version",
                    session=session,
                    bill_id=version["billName"],
                    version_id=version["printVersionName"],
                )
            except scrapelib.HTTPError:
                self.logger.warning("Bill version does not seem to exist.")
                continue
            yield from self.deal_with_version(version_json, bill, bill_id,
                                              original_chamber, session,
                                              proxy)

        yield bill
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """
    Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    chamber = 'lower' if chamber.lower() == 'house' else chamber
    chamber = 'upper' if chamber.lower() == 'senate' else chamber

    # Get html and parse
    doc = self.lxmlize(bill_detail_url)

    # Check if bill hasn't been transmitted to the other chamber yet
    transmit_check = self.get_node(
        doc,
        '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()')
    if (transmit_check is not None
            and 'has not been transmitted' in transmit_check.strip()):
        self.logger.debug('Bill has not been transmitted to other chamber '
                          '... skipping {0}'.format(bill_detail_url))
        return

    # Get the basic parts of the bill
    bill_id = self.get_node(doc, '//h1/text()')
    self.logger.debug(bill_id)
    bill_title_text = self.get_node(
        doc,
        '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()')
    if bill_title_text is not None:
        bill_title = bill_title_text.strip()
    else:
        # Fall back to the separate "Long Description" page.
        long_desc_url = self.get_node(
            doc, '//a[text()[contains(.,"Long Description")]]/@href')
        long_desc_page = self.lxmlize(long_desc_url)
        long_desc_text = self.get_node(
            long_desc_page, '//h1/following-sibling::p/text()')
        if long_desc_text is not None:
            bill_title = long_desc_text.strip()
        else:
            bill_title = 'No title found.'
            self.logger.warning('No title found for {}.'.format(bill_id))
    self.logger.debug(bill_title)

    # The second character of the id encodes the bill type (e.g. HF -> F).
    bill_type = {
        'F': 'bill',
        'R': 'resolution',
        'C': 'concurrent resolution',
    }[bill_id[1]]

    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=bill_title,
                classification=bill_type)

    # Add source
    bill.add_source(bill_detail_url)

    for subject in self._subject_mapping[bill_id]:
        bill.add_subject(subject)

    # Get companion bill.
    companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]'
                          '/a[starts-with(@href, "?")]/text()')
    if companion:
        companion_id = self.make_bill_id(companion[0])
        if companion_id is not None:
            # BUG FIX: previously chamber_from_bill() was called
            # unconditionally, including with None when no companion exists.
            companion_chamber = self.chamber_from_bill(companion_id)
            bill.add_companion(companion_id, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add Actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    yield bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all CA bills of one measure type from the mirrored DB.

    Yields a Bill for each row of the given measure_type, plus a
    VoteEvent for each recorded vote on it.
    """
    if chamber == 'upper':
        chamber_name = 'SENATE'
    else:
        chamber_name = 'ASSEMBLY'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_id, session, title='', chamber=chamber)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(version_name, version_url_pdf,
                                    media_type='application/pdf',
                                    date=version_date)

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]
            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)

        # `version` here is the last version seen in the loop above.
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {
                    'Assembly': 'lower',
                    'Senate': 'upper',
                }[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {
                            'Assembly': 'lower',
                            'Senate': 'upper',
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # BUG FIX: this was `filter(None, committees)`, a one-shot
                # iterator that the assert's len(list(...)) exhausted —
                # leaving the zip() below and the later related-entity loop
                # with nothing. Materialize as a list instead.
                committees = [c for c in committees if c]
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            # Re-categorize with the committee-substituted action string.
            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(committee,
                                          entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            # NOTE(review): this rebinds the `chamber` parameter, which
            # later iterations of the outer bill loop will then see —
            # confirm that is intentional.
            if first_part in ['asm', 'assembly']:
                chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()
            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ', '',
                            motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                            motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$', '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {'name': vote_location, 'classification': vote_classification}
            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = ('http://leginfo.legislature.ca.gov/faces'
                          '/billVotesClient.xhtml?bill_id={}').format(
                              fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill
        # free SQLAlchemy-cached objects between bills
        self.session.expire_all()