def scrape_bill(self, row, chamber, session):
    bill_id = row['LegislationNumber']

    # TODO: re-evaluate if these should be separate bills
    if 'SA' in bill_id or 'HA' in bill_id:
        self.warning('skipping amendment %s', bill_id)
        return

    bill_type = self.classify_bill(bill_id)
    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)
    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

    # TODO: Is there a way to get additional sponsors and cosponsors,
    # and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId']
    )
    bill.add_source(html_url, note='text/html')

    html = self.lxmlize(html_url)

    # Additional Sponsors: '//label[text()="Additional Sponsor(s):"]/following-sibling::div/a'
    additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                     '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    # CoSponsors: '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a'
    # (the label here must be "Co-Sponsor(s):", not "Additional Sponsor(s):",
    # or this loop re-reads the additional sponsors)
    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        # on_duplicate='error'
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)

    yield bill
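# `classify_bill` and `mime_from_link` are helpers on the scraper class that
# this excerpt does not show. A minimal sketch of plausible implementations,
# assuming Delaware's usual instrument prefixes and extension-based MIME
# detection; the mappings here are illustrative, not the project's actual code:
import mimetypes


def classify_bill(self, bill_id):
    # Check longer prefixes first so 'HB' does not shadow 'HJR', etc.
    prefix_types = (
        ('HJR', 'joint resolution'), ('SJR', 'joint resolution'),
        ('HCR', 'concurrent resolution'), ('SCR', 'concurrent resolution'),
        ('HR', 'resolution'), ('SR', 'resolution'),
        ('HB', 'bill'), ('SB', 'bill'),
    )
    for prefix, classification in prefix_types:
        if bill_id.startswith(prefix):
            return classification
    self.warning('could not classify %s; defaulting to bill', bill_id)
    return 'bill'


def mime_from_link(self, url):
    # guess_type keys off the file extension; default to PDF, which is what
    # the Delaware site mostly serves (an assumption).
    return mimetypes.guess_type(url)[0] or 'application/pdf'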
import re
import mimetypes
from datetime import datetime


def scrape_table(self, page):
    for tr in page.xpath('//table[@id="ContentPlaceHolder1_GR1"]/tr')[2:]:
        year = tr.xpath('td[1]/text()')[0]
        bill_no = tr.xpath('td[2]/text()')[0]
        title = tr.xpath('td[3]/a[1]/text()')[0].strip()
        versions = tr.xpath('td[3]/a')[1:]

        bill = Bill(identifier=bill_no,
                    legislative_session=year,
                    title=title,
                    classification="resolution")

        for version in versions:
            url = version.xpath('@href')[0]
            mime = mimetypes.guess_type(url)[0]
            bill.add_version_link(version.text_content().strip(), url,
                                  media_type=mime)

        if tr.xpath('string(td[6])'):
            intro_date_str = tr.xpath('string(td[6])')
            # The closing parenthesis must be escaped *outside* the named
            # group; otherwise the chamber group captures "Rajya Sabha)" and
            # the comparisons below never match.
            regex = re.compile(
                r'\s+(?P<date>\d{2}/\d{2}/\d{4})\s+\((?P<chamber>Lok Sabha|Rajya Sabha)\)'
            )
            for match in regex.finditer(intro_date_str):
                if match.group('chamber') == 'Lok Sabha':
                    action_chamber = 'lower'
                elif match.group('chamber') == 'Rajya Sabha':
                    action_chamber = 'upper'

                intro_date = datetime.strptime(match.group('date'), '%d/%m/%Y')
                intro_date = self.tz.localize(intro_date)

                bill.add_action(
                    "Introduced in {}".format(match.group('chamber')),
                    # Action description, from the state
                    intro_date,
                    chamber=action_chamber,
                    classification='introduction')

        # TODO: the passage dates in td[7] (Lok Sabha) and td[8] (Rajya Sabha)
        # are located but not yet recorded as actions.
        if tr.xpath('string(td[7])'):
            passed_lower = tr.xpath('string(td[7])')
            regex = r'\s+(?P<date>\d{2}/\d{2}/\d{4})\s+'

        if tr.xpath('string(td[8])'):
            passed_upper = tr.xpath('string(td[8])')
            regex = r'\s+(?P<date>\d{2}/\d{2}/\d{4})\s+'

        bill.add_source(self.base_url)
        yield bill
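# A quick sanity check of the corrected pattern against a made-up cell value
# (the sample string is hypothetical; only its shape mirrors what the site
# returns):
import re

pattern = re.compile(
    r'\s+(?P<date>\d{2}/\d{2}/\d{4})\s+\((?P<chamber>Lok Sabha|Rajya Sabha)\)'
)
sample = ' 05/08/2016 (Lok Sabha) 11/08/2016 (Rajya Sabha)'
for m in pattern.finditer(sample):
    print(m.group('date'), m.group('chamber'))
# prints: 05/08/2016 Lok Sabha
#         11/08/2016 Rajya Sabha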
import re
import requests
import lxml.html


def scrape_bill(self, session, bill_id, chamber):
    # https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)

    bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

    try:
        response = requests.get(bill_url)
    except requests.exceptions.RequestException:
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    html = response.text
    page = lxml.html.fromstring(html)

    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

    bill_id = re.sub(r'[^S|H|D|\d]', '', bill_id)

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification='bill')

    bill_summary = None
    if page.xpath('//p[@id="pinslip"]/text()'):
        bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')

    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dd following the Sponsor: or Presenter:
    # dt, including any child link text.
    sponsor = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
    if sponsor:
        sponsor = sponsor[0].strip()
        bill.add_sponsorship(sponsor, classification='primary', primary=True,
                             entity_type='person')

    self.scrape_cosponsors(bill, bill_url)

    version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                         "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

    # yield back votes and bill
    yield from self.scrape_actions(bill, bill_url, session)
    yield bill
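# `replace_non_digits` is defined elsewhere on this scraper. Judging from its
# use above (Massachusetts sessions are named like "189th", while the URL
# wants "189"), a plausible one-liner; treat this as a sketch, not the real
# implementation:
import re


def replace_non_digits(self, session):
    return re.sub(r'\D', '', session)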
import datetime


def scrape_bills(self, session):
    session_key = SESSION_KEYS[session]
    measures_response = self.api_client.get('measures', page=500, session=session_key)

    legislators = index_legislators(self, session_key)

    for measure in measures_response:
        bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])

        chamber = self.chamber_code[bid[0]]
        bill = Bill(
            bid.replace(' ', ''),
            legislative_session=session,
            chamber=chamber,
            title=measure['RelatingTo'],
            classification=self.bill_types[measure['MeasurePrefix'][1:]]
        )
        bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

        for sponsor in measure['MeasureSponsors']:
            legislator_code = sponsor['LegislatoreCode']  # typo in API
            if legislator_code:
                try:
                    legislator = legislators[legislator_code]
                except KeyError:
                    logger.warn('Legislator {} not found in session {}'.format(
                        legislator_code, session))
                    legislator = legislator_code
                bill.add_sponsorship(
                    name=legislator,
                    classification={'Chief': 'primary', 'Regular': 'cosponsor'}[
                        sponsor['SponsorLevel']],
                    entity_type='person',
                    primary=True if sponsor['SponsorLevel'] == 'Chief' else False
                )

        bill.add_source(
            "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                session=session_key, bid=bid.replace(' ', ''))
        )

        for document in measure['MeasureDocuments']:
            # TODO: probably mixing documents & versions here - should revisit
            try:
                bill.add_version_link(document['VersionDescription'],
                                      document['DocumentUrl'],
                                      media_type='application/pdf')
            except ValueError:
                logger.warn('Duplicate link found for {}'.format(document['DocumentUrl']))

        for action in measure['MeasureHistoryActions']:
            classifiers = self.determine_action_classifiers(action['ActionText'])
            when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
            when = self.tz.localize(when)
            bill.add_action(action['ActionText'], when,
                            chamber=self.chamber_code[action['Chamber']],
                            classification=classifiers)

        yield bill
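# `index_legislators` comes from a helper module; the loop above uses it as a
# mapping from the API's legislator code to a display name. A sketch under
# that assumption; the endpoint and field names here are guesses, not
# verified against the Oregon API:
def index_legislators(scraper, session_key):
    response = scraper.api_client.get('legislators', session=session_key)
    return {leg['LegislatorCode']: '{} {}'.format(leg['FirstName'], leg['LastName'])
            for leg in response}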
import csv
import re


def scrape_bill(self, chamber, session):
    url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
    page = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter="|")

    for row in page:
        bill_chamber = {"H": "lower", "S": "upper"}[row[0]]

        if bill_chamber != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
        bill_type = {
            "B": "bill",
            "R": "resolution",
            "JR": "joint resolution",
            "CR": "concurrent resolution",
            "MR": "memorial",
            "CMR": "concurrent memorial",
        }[type_spec]

        if row[-1] != self.slug:
            continue

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=row[3],
            classification=bill_type,
        )
        bill.add_source(url)

        primary = row[11]
        if not primary:
            primary = row[12]

        if primary:
            bill.add_sponsorship(
                primary,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/Searchable/%s.pdf" % (self.slug,
                                                        bill_id.replace(" ", "")))
        bill.add_version_link(bill_id, version_url, media_type="application/pdf")

        yield from self.scrape_bill_page(bill)

        self.bills[bill_id] = bill
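# `get_utf_16_ftp_content` is a small helper: the Arkansas FTP server serves
# this file as UTF-16, which must be decoded before CSV parsing. A minimal
# sketch, assuming the standard library is enough and no retry logic is
# needed:
from urllib.request import urlopen


def get_utf_16_ftp_content(url):
    # urllib handles ftp:// URLs natively; decode('utf-16') strips the BOM.
    return urlopen(url).read().decode('utf-16')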
import lxml.etree


def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]

    xml_chamber = xpath(page, 'string(wa:OriginalAgency)')
    chamber = self._chamber_map[xml_chamber]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4]))
    bill.add_source(fake_source)

    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))

    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
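# The module-level `xpath` used above wraps lxml's xpath with the `wa:`
# namespace that Washington's web-service XML puts on every element. A
# sketch assuming a single fixed namespace URI; confirm the URI against the
# service's WSDL before relying on it:
def xpath(elem, path):
    return elem.xpath(path, namespaces={'wa': 'http://WSLWebServices.leg.wa.gov/'})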
import re
from io import StringIO


def scrape_bill(self, chamber, session):
    url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
    page = self.get(url).text
    page = unicode_csv_reader(StringIO(page), delimiter='|')

    for row in page:
        bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
        if bill_chamber != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
        bill_type = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial',
            'CMR': 'concurrent memorial'
        }[type_spec]

        if row[-1] != self.slug:
            continue

        bill = Bill(bill_id, legislative_session=session,
                    chamber=chamber, title=row[3],
                    classification=bill_type)
        bill.add_source(url)

        primary = row[11]
        if not primary:
            primary = row[12]
        if primary:
            bill.add_sponsorship(primary, classification='primary',
                                 entity_type='person', primary=True)

        # ftp://www.arkleg.state.ar.us/Bills/
        # TODO: Keep an eye on this post-2017 to see if they apply R going forward.
        session_code = '2017R' if session == '2017' else session

        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/%s.pdf" % (session_code,
                                             bill_id.replace(' ', '')))
        bill.add_version_link(bill_id, version_url, media_type='application/pdf')

        yield from self.scrape_bill_page(bill)
        self.bills[bill_id] = bill
def scrape(self):
    self.session = '2011'

    for i, page in enumerate(self.searchLegislation()):
        for legislation_summary in self.parseSearchResults(page):
            title = legislation_summary['Title'].strip()
            if title == "":
                continue

            bill = Bill(name=legislation_summary['Record #'],
                        session=self.session,
                        title=title,
                        type=[legislation_summary['Type'].lower()],
                        organization=self.jurisdiction.name)

            bill.add_source(legislation_summary['URL'])

            legislation_details = self.expandLegislationSummary(legislation_summary)

            for related_bill in legislation_details.get('Related files', []):
                bill.add_related_bill(name=related_bill,
                                      session=self.session,
                                      relation='other-session',
                                      chamber=None)

            for i, sponsor in enumerate(legislation_details.get('Sponsors', [])):
                if i == 0:
                    primary = True
                    sponsorship_type = "Primary"
                else:
                    primary = False
                    sponsorship_type = "Regular"
                bill.add_sponsor(sponsor, sponsorship_type, 'person', primary)

            for subject in legislation_details.get(u'Topics', []):
                bill.add_subject(subject)

            for attachment in legislation_details.get(u'Attachments', []):
                bill.add_version_link('PDF',
                                      attachment['url'],
                                      mimetype="application/pdf")

            yield bill
def scrape_chamber(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billreference/"
           "BillReference.aspx?type=%s" % (session, chamber_abbrev))
    page = self.lxmlize(url)

    for tr in page.xpath(
            "//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id, legislative_session=session, title=title,
                    chamber=chamber, classification=bill_type)

        yield from self.scrape_digest(bill, chamber)

        # versions
        for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                  tr.xpath('td[12]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version_link(a.text, a.get('href'),
                                  media_type='application/pdf')

        # documents
        fnote = tr.xpath('td[9]//a')
        if fnote:
            bill.add_document_link('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[14]//a')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))

        bill.add_source(url)
        yield bill
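# Several of these scrapers call `self.lxmlize(url)`, a common convenience
# wrapper in this codebase. A minimal sketch of its usual shape (fetch,
# parse, absolutize links), offered as an assumption rather than the
# canonical implementation:
import lxml.html


def lxmlize(self, url):
    response = self.get(url)
    page = lxml.html.fromstring(response.text)
    page.make_links_absolute(url)
    return page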
from datetime import datetime


def bill_info(self, bill_link, session, main_url):
    bill_page = self.lxmlize(bill_link)

    long_title = self.get_node(
        bill_page,
        '//div[@class="main-content"]/div[1]/div/h2').text.split()

    bill_number = long_title[0]
    title = ''
    for x in range(2, len(long_title)):
        title += long_title[x] + ' '
    title = title[0:-1]

    if not title:
        self.error('no title, skipping %s', bill_number)
        return

    bill_type = 'resolution' if 'LR' in bill_number else 'bill'

    bill = Bill(bill_number, session, title, classification=bill_type)

    bill.add_source(main_url)
    bill.add_source(bill_link)

    introduced_by = self.get_node(
        bill_page,
        '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/a[1]/text()')

    if not introduced_by:
        introduced_by = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/text()')
        introduced_by = introduced_by.split('Introduced By:')[1].strip()

    bill.add_sponsorship(
        name=introduced_by,
        entity_type='person',
        primary=True,
        classification='primary',
    )

    action_nodes = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]//table/tbody/tr')

    for action_node in action_nodes:
        date = self.get_node(action_node, './td[1]').text
        date = datetime.strptime(date, '%b %d, %Y')

        # The action node may have an anchor element within it, so
        # we grab all the text within.
        action = self.get_node(action_node, './td[2]').text_content()

        if 'Governor' in action:
            actor = 'executive'
        elif 'Speaker' in action:
            actor = 'legislature'
        else:
            actor = 'legislature'

        action_type = self.action_types(action)
        bill.add_action(
            action,
            date.strftime('%Y-%m-%d'),
            chamber=actor,
            classification=action_type,
        )

    # We're in reverse chronological order.
    bill.actions.reverse()

    # Grabs bill version documents.
    version_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[3]/div[2]/'
        'div[@class="hidden-xs"]/ul[1]/li/a')

    for version_link in version_links:
        version_name = version_link.text
        version_url = version_link.attrib['href']
        # replace Current w/ session number
        version_url = version_url.replace('Current', session)
        bill.add_version_link(version_name, version_url,
                              media_type='application/pdf')

    # Adds any documents related to amendments.
    amendment_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')

    for amendment_link in amendment_links:
        amendment_name = amendment_link.text
        amendment_url = amendment_link.attrib['href']
        bill.add_document_link(amendment_name, amendment_url)

    # Related transcripts.
    transcript_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]/div[2]/'
        'div[@class="hidden-xs"]/table/tr/td/a')

    for transcript_link in transcript_links:
        transcript_name = transcript_link.text
        transcript_url = transcript_link.attrib['href']
        bill.add_document_link(transcript_name, transcript_url)

    yield bill

    yield from self.scrape_votes(bill, bill_page, actor)
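# `self.action_types(action)` maps Nebraska's action text to openstates
# classifications. A keyword-based sketch; the patterns listed are
# illustrative and the real table is more complete:
def action_types(self, action):
    classifiers = (
        ('Introduced', 'introduction'),
        ('Referred to', 'referral-committee'),
        ('Placed on Final Reading', 'reading-3'),
        ('Passed', 'passage'),
        ('Presented to Governor', 'executive-receipt'),
        ('Approved by Governor', 'executive-signature'),
    )
    matched = [typ for pattern, typ in classifiers if pattern in action]
    return matched or None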
import datetime
import lxml.html


def scrape_details(self, bill_detail_url, session, chamber, bill_id):
    """
    Create the Bill, add the information obtained from the provided
    bill_detail_url, and then yield the bill object.

    :param bill_detail_url:
    :param session:
    :param chamber:
    :param bill_id:
    :return:
    """
    page = self.get(bill_detail_url).text

    if 'INVALID BILL NUMBER' in page:
        self.warning('INVALID BILL %s' % bill_detail_url)
        return

    doc = lxml.html.fromstring(page)
    doc.make_links_absolute(bill_detail_url)

    bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

    bill_type = bill_div.xpath('span/text()')[0]

    if 'General Bill' in bill_type:
        bill_type = 'bill'
    elif 'Concurrent Resolution' in bill_type:
        bill_type = 'concurrent resolution'
    elif 'Joint Resolution' in bill_type:
        bill_type = 'joint resolution'
    elif 'Resolution' in bill_type:
        bill_type = 'resolution'
    else:
        raise ValueError('unknown bill type: %s' % bill_type)

    # this is fragile, but less fragile than it was
    b = bill_div.xpath('./b[text()="Summary:"]')[0]
    bill_summary = b.getnext().tail.strip()

    bill = Bill(
        bill_id,
        legislative_session=session,  # session name from the metadata's `legislative_sessions`
        chamber=chamber,  # 'upper' or 'lower'
        title=bill_summary,
        classification=bill_type
    )

    subjects = list(self._subjects[bill_id])
    for subject in subjects:
        bill.add_subject(subject)

    # sponsors
    for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
        bill.add_sponsorship(name=sponsor, classification='primary',
                             primary=True, entity_type='person')
    for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
        sponsor = sponsor.replace(u'\xa0', ' ').strip()
        bill.add_sponsorship(name=sponsor, classification='primary',
                             primary=True, entity_type='organization')

    # find versions
    version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
    version_html = self.get(version_url).text
    version_doc = lxml.html.fromstring(version_html)
    version_doc.make_links_absolute(version_url)
    for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
        # duplicate versions with same date, use first appearance
        bill.add_version_link(
            note=version.text,  # Description of the version from the state;
                                # e.g. 'As introduced', 'Amended', etc.
            url=version.get('href'),
            on_duplicate='ignore',
            media_type='text/html'  # Still a MIME type
        )

    # actions
    for row in bill_div.xpath('table/tr'):
        date_td, chamber_td, action_td = row.xpath('td')

        date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")

        action_chamber = {'Senate': 'upper',
                          'House': 'lower',
                          None: 'legislature'}[chamber_td.text]

        action = action_td.text_content()
        action = action.split('(House Journal')[0]
        action = action.split('(Senate Journal')[0].strip()

        atype = action_type(action)

        bill.add_action(
            description=action,  # Action description, from the state
            date=date.strftime('%Y-%m-%d'),  # `YYYY-MM-DD` format
            chamber=action_chamber,  # 'upper' or 'lower'
            classification=atype  # Options explained in the next section
        )

    # votes
    vurl = doc.xpath('//a[text()="View Vote History"]/@href')
    if vurl:
        vurl = vurl[0]
        yield from self.scrape_vote_history(bill, vurl)

    bill.add_source(bill_detail_url)

    yield bill
import datetime
import re

import scrapelib


def scrape(self, session=None, chambers=None):
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))
    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)
    else:
        chamber_dict = {"Senate": "upper", "House": "lower",
                        "House of Representatives": "lower",
                        "house": "lower", "senate": "upper"}
        # so presumably not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {"approved": True,
                        "passed": True,
                        "adopted": True,
                        "true": True,
                        "false": False,
                        "failed": False,
                        True: True,
                        False: False}
        action_dict = {"ref_ctte_100": "referral-committee",
                       "intro_100": "introduction",
                       "intro_101": "introduction",
                       "pass_300": "passage",
                       "intro_110": "reading-1",
                       "refer_210": "referral-committee",
                       "crpt_301": None,
                       "crpt_317": None,
                       "concur_606": "passage",
                       "pass_301": "passage",
                       "refer_220": "referral-committee",
                       "intro_102": ["introduction", "passage"],
                       "intro_105": ["introduction", "passage"],
                       "intro_ref_ctte_100": "referral-committee",
                       "refer_209": None,
                       "intro_108": ["introduction", "passage"],
                       "intro_103": ["introduction", "passage"],
                       "msg_reso_503": "passage",
                       "intro_107": ["introduction", "passage"],
                       "imm_consid_360": "passage",
                       "refer_213": None,
                       "adopt_reso_100": "passage",
                       "adopt_reso_110": "passage",
                       "msg_507": "amendment-passage",
                       "confer_713": None,
                       "concur_603": None,
                       "confer_712": None,
                       "msg_506": "amendment-failure",
                       "receive_message_100": "passage",
                       "motion_920": None,
                       "concur_611": None,
                       "confer_735": None,
                       "third_429": None,
                       "final_501": None,
                       "concur_608": None,
                       }

        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
        legislators = self.get_legislator_ids(first_page)
        all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

        for row in self.get_bill_rows(session):
            spacer, number_link, _ga, title, primary_sponsor, status, spacer = row.xpath('td')

            # S.R.No.1 -> SR1
            bill_id = number_link.text_content().replace('No.', '')
            bill_id = bill_id.replace('.', '').replace(' ', '')
            # put one space back in between type and number
            bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

            title = title.text_content().strip()
            title = re.sub(r'^Title', '', title)

            chamber = 'lower' if 'H' in bill_id else 'upper'
            classification = 'bill' if 'B' in bill_id else 'resolution'

            bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                        title=title, classification=classification)
            bill.add_source(number_link.xpath('a/@href')[0])

            # get bill from API
            bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                            'general_assembly_{}/{}/{}/'.format(
                                session,
                                'bills' if 'B' in bill_id else 'resolutions',
                                bill_id.lower().replace(' ', '')
                            ))
            data = self.get(bill_api_url).json()

            # add title if no short title
            if not bill.title:
                bill.title = data['items'][0]['longtitle']
            bill.add_title(data['items'][0]['longtitle'], 'long title')

            # this stuff is version-specific
            for version in data['items']:
                version_name = version["version"]
                version_link = base_url + version["pdfDownloadLink"]
                bill.add_version_link(version_name, version_link,
                                      media_type='application/pdf')

            # we'll use latest bill_version for everything else
            bill_version = data['items'][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='primary',
                    entity_type='person',
                    primary=True
                )

            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='cosponsor',
                    entity_type='person',
                    primary=False,
                )

            try:
                action_doc = self.get(base_url + bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning("Unknown action {desc} with code {code}."
                                     " Add it to the action_dict"
                                     ".".format(desc=action_desc,
                                                code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(datetime.datetime.strptime(
                        action["datetime"], "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(action_desc,
                                    date, chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

            # votes
            vote_url = base_url + bill_version["votes"][0]["link"]
            vote_doc = self.get(vote_url)
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url,
                                         base_url, bill, legislators,
                                         chamber_dict, vote_results)

            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning("Vote page not "
                             "loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url,
                                         base_url, bill, legislators,
                                         chamber_dict, vote_results)

            if data["items"][0]["effective_date"]:
                effective_date = datetime.datetime.strptime(
                    data["items"][0]["effective_date"], "%Y-%m-%d")
                effective_date = self._tz.localize(effective_date)
                # the OH website adds an action that isn't in the action list JSON.
                # It looks like:
                # Effective 7/6/18
                effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                effective_action = "Effective {}".format(effective_date_oh)
                bill.add_action(effective_action,
                                effective_date,
                                chamber="executive",
                                classification=["became-law"])

            # we have never seen a veto or a disapprove, but they seem important.
            # so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url + bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never"
                                         " gotten one before."
                                         " Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url + bill_version["disapprove"][0]["link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError("Whoa, a disapprove! We've never"
                                         " gotten one before."
                                         " Go write some code to deal "
                                         "with it: {}".format(disapprove_url))

            yield bill
import lxml.html


def scrape_bill(self, session, bill_url):
    page = self.get(bill_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(bill_url)

    try:
        bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
    except IndexError:
        self.logger.warning("Something is wrong with bill page, skipping.")
        return
    secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

    # checking if there is a matching bill
    if secondary_bill_id:
        secondary_bill_id = secondary_bill_id[0].text
        # swap ids if * is in secondary_bill_id
        if "*" in secondary_bill_id:
            bill_id, secondary_bill_id = secondary_bill_id, bill_id
        secondary_bill_id = secondary_bill_id.strip()
        # collapse double spaces left by the site
        secondary_bill_id = secondary_bill_id.replace("  ", " ")

    bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

    if "B" in bill_id:
        bill_type = "bill"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"

    primary_chamber = "lower" if "H" in bill_id else "upper"
    # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

    title = page.xpath("//span[@id='lblAbstract']")[0].text
    if title is None:
        msg = "%s detail page was missing title info."
        self.logger.warning(msg % bill_id)
        return

    # bill subject
    subject_pos = title.find("-")
    subjects = [s.strip() for s in title[: subject_pos - 1].split(",")]
    subjects = filter(None, subjects)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=primary_chamber,
        title=title,
        classification=bill_type,
    )
    for subject in subjects:
        bill.add_subject(subject)

    if secondary_bill_id:
        bill.add_identifier(secondary_bill_id)

    if page.xpath('//span[@id="lblCompNumber"]/a'):
        companion_id = page.xpath('//span[@id="lblCompNumber"]/a')[0].text_content().strip()
        bill.add_related_bill(
            identifier=companion_id,
            legislative_session=session,
            relation_type="companion",
        )

    bill.add_source(bill_url)

    # Primary Sponsor
    sponsor = (
        page.xpath("//span[@id='lblBillPrimeSponsor']")[0]
        .text_content()
        .split("by")[-1]
    )
    sponsor = sponsor.replace("*", "").strip()
    if sponsor:
        bill.add_sponsorship(
            sponsor, classification="primary", entity_type="person", primary=True
        )

    # bill text
    btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
    bill.add_version_link(
        "Current Version", btext.get("href"), media_type="application/pdf"
    )

    # documents
    summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
    if summary:
        bill.add_document_link("Summary", summary[0].get("href"))
    fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
    if fiscal:
        bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
    amendments = page.xpath('//a[contains(@href, "/Amend/")]')
    for amendment in amendments:
        bill.add_document_link("Amendment " + amendment.text, amendment.get("href"))
    # amendment notes in image with alt text describing doc inside <a>
    amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
    for afn in amend_fns:
        bill.add_document_link(
            afn.get("alt"), afn.getparent().get("href"), on_duplicate="ignore"
        )

    # actions
    atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
    actions_from_table(bill, atable)

    # if there is a matching bill
    if secondary_bill_id:
        # secondary sponsor
        secondary_sponsor = (
            page.xpath("//span[@id='lblCompPrimeSponsor']")[0]
            .text_content()
            .split("by")[-1]
        )
        secondary_sponsor = (
            secondary_sponsor.replace("*", "").replace(")", "").strip()
        )
        # Skip blank-name sponsors.
        if secondary_sponsor:
            bill.add_sponsorship(
                secondary_sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        # secondary actions
        cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
        actions_from_table(bill, cotable)

    # votes
    yield from self.scrape_vote_events(bill, page, bill_url)

    bill.actions.sort(key=lambda a: a["date"])

    yield bill
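# `actions_from_table` is shared between the main and companion action grids
# above. A rough sketch, assuming each data row holds a description cell and
# a MM/DD/YYYY date cell; the real helper also classifies each action:
import datetime


def actions_from_table(bill, table):
    for row in table.xpath('tr'):
        cells = row.xpath('td')
        if len(cells) != 2:
            continue
        action = cells[0].text_content().strip()
        date = datetime.datetime.strptime(cells[1].text_content().strip(),
                                          '%m/%d/%Y').date()
        bill.add_action(action, date.strftime('%Y-%m-%d'))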
import datetime
import re

import lxml.html


def scrape_bill_list(self, url):
    bill_list = self._get_bill_list(url)

    for bill_info in bill_list:

        (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
        (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
        (subject, ) = bill_info.xpath('td[3]//text()')
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            raise AssertionError(
                "Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title='',
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type='person',
                classification='primary',
                primary=True,
            )
        bill.add_source(url)

        bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                    'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
        bill.add_source(bill_url)

        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning("Bill {} has no webpage, and will be skipped".format(bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        if bill_doc.xpath('//span[@id="ContentPlaceHolder1_lblShotTitle"]'):
            title = bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
            )[0].text_content().strip()
            if not title:
                title = "[No title given by state]"
            bill.title = title

        version_url_base = (
            'http://alisondb.legislature.state.al.us/ALISON/'
            'SearchableInstruments/{0}/PrintFiles/{1}-'.format(
                self.session, bill_id))
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()')
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + 'int.pdf'
            elif version == "Engrossed":
                version_url = version_url_base + 'eng.pdf'
            elif version == "Enrolled":
                version_url = version_url_base + 'enr.pdf'
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name))

            bill.add_version_link(
                name,
                version_url,
                media_type='application/pdf',
                on_duplicate='ignore',
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath(
            '//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath(
            '//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath('td[1]')[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action) +
                    "that doesn't match the bill ID ({})".format(bill_id))
                continue

            bir_date = datetime.datetime.strptime(
                bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
            bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
            bir_chamber = self.CHAMBERS[bir_type[0]]
            bir_text = "{0}: {1}".format(
                bir_type, bir.xpath('td[3]/font/text()')[0].strip())

            bill.add_action(
                bir_text,
                TIMEZONE.localize(bir_date),
                chamber=bir_chamber,
                classification='other',
            )

            try:
                (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
            except ValueError:
                bir_vote_id = ''

            bir_vote_id = bir_vote_id.strip()
            if bir_vote_id.startswith("Roll "):
                bir_vote_id = bir_vote_id.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=bir_type[0],
                    bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                    vote_id=bir_vote_id,
                    vote_date=TIMEZONE.localize(bir_date),
                    action_text=bir_text
                )

        actions = bill_doc.xpath('//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
        action_date = None
        for action in actions:
            # If actions occur on the same day, only one date will exist
            if (action.xpath('td[1]/font/text()')[0]
                    .encode('ascii', 'ignore').strip()):
                action_date = datetime.datetime.strptime(
                    action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

            (action_chamber, ) = action.xpath('td[2]/font/text()')

            if action.xpath('td[3]/font/u/text()'):
                (amendment, ) = action.xpath('td[3]/font/u/text()')
            else:
                amendment = None

            (action_text, ) = action.xpath('td[4]/font/text()')
            action_type = _categorize_action(action_text)

            # check for occasional extra last row
            if not action_chamber.strip():
                continue

            # The committee cell is just an abbreviation, so get its name
            actor = self.CHAMBERS[action_chamber]
            try:
                action_committee = re.search(
                    r'.*? referred to the .*? committee on (.*?)$',
                    action_text).group(1).strip()
            except AttributeError:
                action_committee = ''

            act = bill.add_action(
                action_text,
                TIMEZONE.localize(action_date),
                chamber=actor,
                classification=action_type,
            )
            if action_committee:
                act.add_related_entity(action_committee,
                                       entity_type='organization')

            try:
                vote_button = action.xpath('td[9]//text()')[0].strip()
            except IndexError:
                vote_button = ''

            if vote_button.startswith("Roll "):
                vote_id = vote_button.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=action_chamber,
                    bill_id=bill_id,
                    vote_id=vote_id,
                    vote_date=TIMEZONE.localize(action_date),
                    action_text=action_text
                )

            if amendment:
                amend_url = (
                    'http://alisondb.legislature.state.al.us/ALISON/'
                    'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.format(
                        self.session, amendment))

                amend_name = 'Amd/Sub {}'.format(amendment)

                bill.add_version_link(
                    amend_name,
                    amend_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

        yield bill
def test_full_bill():
    create_jurisdiction()
    sp = ScrapePerson('Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                             parent_id=org._id)

    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship('Adam Smith', classification='extra sponsor',
                         entity_type='person', primary=False, entity_id=sp._id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor',
                         entity_type='person', primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.',
                      note="official", date='1969-10-20')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html',
                           media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1',
                          media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp.as_dict()])

    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'
    assert b.abstracts.get().date == '1969-10-20'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name='Adam Smith')
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def scrape_bill(self, chamber, session, bill_id, title, url):
    page = self.lxmlize(url)

    if re.match(r'^(S|H)B ', bill_id):
        btype = ['bill']
    elif re.match(r'(S|H)C ', bill_id):
        btype = ['commemoration']
    elif re.match(r'(S|H)JR ', bill_id):
        btype = ['joint resolution']
    elif re.match(r'(S|H)CR ', bill_id):
        btype = ['concurrent resolution']
    else:
        btype = ['bill']

    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=btype)
    bill.add_source(url)

    regex_ns = "http://exslt.org/regular-expressions"
    version_links = page.xpath(
        r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
        namespaces={'re': regex_ns})
    for link in version_links:
        bill.add_version_link(
            link.xpath('string()').strip(),
            link.attrib['href'],
            media_type='text/html',
            on_duplicate='ignore'
        )

    sponsor_links = page.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
        '/label[contains(text(), "Sponsors:")]' +
        '/following-sibling::div[1]/p/a'
    )
    for link in sponsor_links:
        if link.attrib['href'].startswith('https://sdlegislature.gov/Legislators/'):
            sponsor_type = 'person'
        elif link.attrib['href'].startswith(
                'https://sdlegislature.gov/Legislative_Session/Committees'):
            sponsor_type = 'organization'
        else:
            raise ScrapeError(
                'Found unexpected sponsor, URL: ' + link.attrib['href'])
        bill.add_sponsorship(
            link.text,
            classification='primary',
            primary=True,
            entity_type=sponsor_type
        )

    actor = chamber
    use_row = False
    for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
        # Some tables have null rows that are just `<tr></tr>`
        # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
        if row.text_content() == '':
            self.debug('Skipping action table row that is completely empty')
            continue

        if 'Date' in row.text_content() and 'Action' in row.text_content():
            use_row = True
            continue
        elif not use_row:
            continue

        action = row.xpath("string(td[2])").strip()

        atypes = []
        if action.startswith('First read'):
            atypes.append('introduction')
            atypes.append('reading-1')

        if re.match(r'Signed by (?:the\s)*Governor', action, re.IGNORECASE):
            atypes.append('executive-signature')
            actor = 'executive'

        match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)', action)
        if match:
            if match.group(1) in ['Senate', 'House of Representatives']:
                first = ''
            else:
                first = 'committee-'
            if match.group(3).lower() == 'passed':
                second = 'passage'
            elif match.group(3).lower() == 'failed':
                second = 'failure'
            atypes.append("%s%s" % (first, second))

        if 'referred to' in action.lower():
            atypes.append('referral-committee')

        if 'Motion to amend, Passed Amendment' in action:
            atypes.append('amendment-introduction')
            atypes.append('amendment-passage')
            amd = row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]')[0]
            version_name = amd.xpath('string(.)')
            version_url = amd.xpath('@href')[0]
            if 'htm' in version_url:
                mimetype = 'text/html'
            elif 'pdf' in version_url:
                mimetype = 'application/pdf'
            else:
                # unexpected extension; leave media_type blank rather than
                # letting `mimetype` go unbound below
                mimetype = ''
            bill.add_version_link(version_name, version_url,
                                  media_type=mimetype, on_duplicate='ignore')

        if 'Veto override, Passed' in action:
            atypes.append('veto-override-passage')
        elif 'Veto override, Failed' in action:
            atypes.append('veto-override-failure')

        if 'Delivered to the Governor' in action:
            atypes.append('executive-receipt')

        match = re.match("First read in (Senate|House)", action)
        if match:
            if match.group(1) == 'Senate':
                actor = 'upper'
            else:
                actor = 'lower'

        date = row.xpath("string(td[1])").strip()
        match = re.match(r'\d{2}/\d{2}/\d{4}', date)
        if not match:
            self.warning("Bad date: %s" % date)
            continue
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
            yield from self.scrape_vote(bill, date, link.attrib['href'])

        bill.add_action(action, date, chamber=actor, classification=atypes)

    for link in page.xpath("//a[contains(@href, 'Keyword')]"):
        bill.add_subject(link.text.strip())

    yield bill
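# Illustrative sketch: the if/elif chain at the top of the scraper above maps a
# bill-ID prefix to a pupa classification. The same mapping can be table-driven;
# this pattern list simply mirrors those branches and is an assumption, not the
# scraper's actual helper.
import re

BILL_ID_CLASSIFIERS = [
    (re.compile(r'^(S|H)B '), ['bill']),
    (re.compile(r'^(S|H)CR '), ['concurrent resolution']),
    (re.compile(r'^(S|H)JR '), ['joint resolution']),
    (re.compile(r'^(S|H)C '), ['commemoration']),
]

def classify_bill_id(bill_id):
    for pattern, classification in BILL_ID_CLASSIFIERS:
        if pattern.match(bill_id):
            return classification
    return ['bill']  # default, as in the chain above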
def scrape_bill(self, chamber, session, bill_id, url): try: page = lxml.html.fromstring(self.get(url).text) except scrapelib.HTTPError as e: self.warning('error (%s) fetching %s, skipping' % (e, url)) return title = page.xpath( "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip() if 'JR' in bill_id: bill_type = ['joint resolution'] elif 'CR' in bill_id: bill_type = ['concurrent resolution'] elif 'R' in bill_id: bill_type = ['resolution'] else: bill_type = ['bill'] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.add_source(url) bill.subject = self.subject_map[bill_id] for link in page.xpath("//a[contains(@id, 'Auth')]"): name = link.xpath("string()").strip() if ':' in name: raise Exception(name) if 'otherAuth' in link.attrib['id']: bill.add_sponsorship(name, classification='cosponsor', entity_type='person', primary=False) else: bill.add_sponsorship(name, classification='primary', entity_type='person', primary=True) act_table = page.xpath("//table[contains(@id, 'Actions')]")[0] for tr in act_table.xpath("tr")[2:]: action = tr.xpath("string(td[1])").strip() if not action or action == 'None': continue date = tr.xpath("string(td[3])").strip() date = datetime.datetime.strptime(date, "%m/%d/%Y").date() actor = tr.xpath("string(td[4])").strip() if actor == 'H': actor = 'lower' elif actor == 'S': actor = 'upper' attrs = self.categorizer.categorize(action) related_entities = [] for item in attrs['committees']: related_entities.append({ 'type': 'committee', 'name': item }) for item in attrs['legislators']: related_entities.append({ 'type': 'legislator', 'name': item }) bill.add_action(description=action, date=date.strftime('%Y-%m-%d'), chamber=actor, classification=attrs['classification'], related_entities=related_entities) version_table = page.xpath("//table[contains(@id, 'Versions')]")[0] # Keep track of already seen versions to prevent processing duplicates. version_urls = [] for link in version_table.xpath(".//a[contains(@href, '.PDF')]"): version_url = link.attrib['href'] if version_url in version_urls: self.warning('Skipping duplicate version URL.') continue else: version_urls.append(version_url) name = link.text.strip() if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url, re.IGNORECASE): bill.add_document_link(note=name, url=version_url, media_type='application/pdf') continue bill.add_version_link(note=name, url=version_url, media_type='application/pdf') for link in page.xpath(".//a[contains(@href, '_VOTES')]"): if 'HT_' not in link.attrib['href']: yield from self.scrape_votes(bill, self.urlescape(link.attrib['href'])) # # If the bill has no actions and no versions, it's a bogus bill on # # their website, which appears to happen occasionally. Skip. has_no_title = (bill.title == "Short Title Not Found.") if has_no_title: # If there's no title, this is an empty page. Skip! return else: # Otherwise, save the bills. yield bill
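# Sketch: the version_urls list above does O(n) membership checks per link; a
# set gives the same duplicate suppression in O(1) per lookup. Hypothetical
# standalone form of that loop, assuming lxml anchor elements as input:
def unique_version_links(links):
    seen = set()
    for link in links:
        url = link.attrib['href']
        if url in seen:
            continue
        seen.add(url)
        yield link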
def scrape_senate_bills(self, chamber, insert, session, year): doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution', 8: 'joint resolution'} for docnum, bill_type in doc_type.items(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \ 'HistListBills.cfm?DoctypeID=%s' % (insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count += 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link) page = self.get(page_path).text page = page.replace(u"\xa0", " ") root = lxml.html.fromstring(page) bill_id = root.xpath('string(/html/body/div[@id="content"]' + '/table[1]/tr[1]/td[1]/font)') title = self.get_node( root, '//div[@id="content"]/table/tr[preceding-sibling::tr/td/' 'b[contains(text(), "By:")]]/td/em/text()') bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type ) bill.subject = list(set(self.subject_mapping[bill_id])) for table in root.xpath('//div[@id="content"]/table'): if 'Bill Text' in table.text_content(): bill_text = table.xpath("string(tr/td[2]/a/@href)") text_url = "http://www.leg.state.nv.us" + bill_text bill.add_version_link(note="Bill Text", url=text_url, media_type='application/pdf') primary, secondary = self.scrape_sponsors(page) for leg in primary: bill.add_sponsorship(name=leg, classification='primary', entity_type='person', primary=True) for leg in secondary: bill.add_sponsorship(name=leg, classification='cosponsor', entity_type='person', primary=False) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda" # bill.add_document(minutes_date, minutes_url) bill.add_document_link(note=minutes_date, url=minutes_url) minutes_count = minutes_count + 1 self.scrape_actions(root, bill, "upper") yield from self.scrape_votes(page, page_path, bill, insert, year) bill.add_source(page_path) yield bill
def scrape_assem_bills(self, chamber, insert, session, year): doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution', 6: 'joint resolution', 9: 'petition'} for docnum, bill_type in doc_type.items(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/' \ 'Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count = count + 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link) page = self.get(page_path).text page = page.replace(u"\xa0", " ") root = lxml.html.fromstring(page) root.make_links_absolute("http://www.leg.state.nv.us/") bill_id = root.xpath('string(/html/body/div[@id="content"]' '/table[1]/tr[1]/td[1]/font)') title = self.get_node( root, '//div[@id="content"]/table/tr[preceding-sibling::tr/td/' 'b[contains(text(), "By:")]]/td/em/text()') bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.subject = list(set(self.subject_mapping[bill_id])) billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext() text_urls = billtext.xpath("./a") for text_url in text_urls: version_name = text_url.text.strip() version_url = text_url.attrib['href'] bill.add_version_link(note=version_name, url=version_url, media_type='application/pdf') primary, secondary = self.scrape_sponsors(page) for leg in primary: bill.add_sponsorship(classification='primary', name=leg, entity_type='person', primary=True) for leg in secondary: bill.add_sponsorship(classification='cosponsor', name=leg, entity_type='person', primary=False) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes" bill.add_document_link(note=minutes_date, url=minutes_url) minutes_count += 1 self.scrape_actions(root, bill, "lower") yield from self.scrape_votes(page, page_path, bill, insert, year) bill.add_source(page_path) yield bill
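# Sketch: both NV scrapers above keep a manual `minutes_count` that starts at 2
# so the xpath row index lines up with the anchor being processed. enumerate()
# with a start value expresses the same pairing without the counter
# bookkeeping; this assumes the same //table[4] layout the loops above walk.
def add_minutes_links(bill, root, label="Minutes"):
    for row_index, mr in enumerate(root.xpath('//table[4]/tr/td[3]/a'), start=2):
        minutes_url = "http://www.leg.state.nv.us" + mr.xpath("string(@href)")
        parts = mr.xpath("string(//table[4]/tr[%s]/td[2])" % row_index).split()
        bill.add_document_link(note=parts[0] + parts[1] + parts[2] + " " + label,
                               url=minutes_url)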
def scrape_bill(self, chamber, session, bill_id, title, url): page = self.lxmlize(url) if re.match(r'^(S|H)B ', bill_id): btype = ['bill'] elif re.match(r'(S|H)C ', bill_id): btype = ['commemoration'] elif re.match(r'(S|H)JR ', bill_id): btype = ['joint resolution'] elif re.match(r'(S|H)CR ', bill_id): btype = ['concurrent resolution'] else: btype = ['bill'] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=btype ) bill.add_source(url) regex_ns = "http://exslt.org/regular-expressions" version_links = page.xpath( "//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]", namespaces={'re': regex_ns}) for link in version_links: bill.add_version_link( link.xpath('string()').strip(), link.attrib['href'], media_type='text/html', on_duplicate='ignore' ) sponsor_links = page.xpath( "//td[contains(@id, 'tdSponsors')]/a") for link in sponsor_links: bill.add_sponsorship( link.text, classification='primary', primary=True, entity_type='person' ) actor = chamber use_row = False self.debug(bill_id) for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"): if 'Date' in row.text_content() and 'Action' in row.text_content(): use_row = True continue elif not use_row: continue action = row.xpath("string(td[2])").strip() atypes = [] if action.startswith('First read'): atypes.append('introduction') atypes.append('reading-1') elif action.startswith('Signed by Governor'): atypes.append('executive-signature') actor = 'executive' match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)', action) if match: if match.group(1) in ['Senate', 'House of Representatives']: first = '' else: first = 'committee-' if match.group(3).lower() == 'passed': second = 'passage' elif match.group(3).lower() == 'failed': second = 'failure' atypes.append("%s%s" % (first, second)) if 'referred to' in action.lower(): atypes.append('referral-committee') if 'Motion to amend, Passed Amendment' in action: atypes.append('amendment-introduction') atypes.append('amendment-passage') if 'Veto override, Passed' in action: atypes.append('veto-override-passage') elif 'Veto override, Failed' in action: atypes.append('veto-override-failure') if 'Delivered to the Governor' in action: atypes.append('executive-receipt') match = re.match("First read in (Senate|House)", action) if match: if match.group(1) == 'Senate': actor = 'upper' else: actor = 'lower' date = row.xpath("string(td[1])").strip() match = re.match('\d{2}/\d{2}/\d{4}', date) if not match: self.warning("Bad date: %s" % date) continue date = datetime.datetime.strptime(date, "%m/%d/%Y").date() for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"): yield from self.scrape_vote(bill, date, link.attrib['href']) bill.add_action(action, date, chamber=actor, classification=atypes) for link in page.xpath("//a[contains(@href, 'Keyword')]"): bill.add_subject(link.text.strip()) yield bill
def scrape_bill(self, chamber, session, bill_id, url): try: page = lxml.html.fromstring(self.get(url).text) except scrapelib.HTTPError as e: self.warning("error (%s) fetching %s, skipping" % (e, url)) return title = page.xpath( "string(//span[contains(@id, 'PlaceHolder1_txtST')])" ).strip() if not title: self.warning("blank bill on %s - skipping", url) return if "JR" in bill_id: bill_type = ["joint resolution"] elif "CR" in bill_id: bill_type = ["concurrent resolution"] elif "R" in bill_id: bill_type = ["resolution"] else: bill_type = ["bill"] bill = Bill( bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type, ) bill.add_source(url) bill.subject = self.subject_map[bill_id] for link in page.xpath("//a[contains(@id, 'Auth')]"): name = link.xpath("string()").strip() if ":" in name: raise Exception(name) if "otherAuth" in link.attrib["id"]: bill.add_sponsorship( name, classification="cosponsor", entity_type="person", primary=False, ) else: bill.add_sponsorship( name, classification="primary", entity_type="person", primary=True ) act_table = page.xpath("//table[contains(@id, 'Actions')]")[0] for tr in act_table.xpath("tr")[2:]: action = tr.xpath("string(td[1])").strip() if not action or action == "None": continue date = tr.xpath("string(td[3])").strip() date = datetime.datetime.strptime(date, "%m/%d/%Y").date() actor = tr.xpath("string(td[4])").strip() if actor == "H": actor = "lower" elif actor == "S": actor = "upper" attrs = self.categorizer.categorize(action) related_entities = [] for item in attrs["committees"]: related_entities.append({"type": "committee", "name": item}) for item in attrs["legislators"]: related_entities.append({"type": "legislator", "name": item}) bill.add_action( description=action, date=date.strftime("%Y-%m-%d"), chamber=actor, classification=attrs["classification"], related_entities=related_entities, ) version_table = page.xpath("//table[contains(@id, 'Versions')]")[0] # Keep track of already seen versions to prevent processing duplicates. version_urls = [] for link in version_table.xpath(".//a[contains(@href, '.PDF')]"): version_url = link.attrib["href"] if version_url in version_urls: self.warning("Skipping duplicate version URL.") continue else: version_urls.append(version_url) name = link.text.strip() if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url, re.IGNORECASE): bill.add_document_link( note=name, url=version_url, media_type="application/pdf" ) continue bill.add_version_link( note=name, url=version_url, media_type="application/pdf" ) self.scrape_amendments(bill, page) for link in page.xpath(".//a[contains(@href, '_VOTES')]"): if "HT_" not in link.attrib["href"]: yield from self.scrape_votes(bill, self.urlescape(link.attrib["href"])) # # If the bill has no actions and no versions, it's a bogus bill on # # their website, which appears to happen occasionally. Skip. has_no_title = bill.title == "Short Title Not Found." if has_no_title: # If there's no title, this is an empty page. Skip! return else: # Otherwise, save the bills. yield bill
def scrape_bill(self, chamber, session, bill_id):
    # there will be a space in bill_id if we're doing a one-off bill scrape
    # convert HB 102 into H102
    if ' ' in bill_id:
        bill_id = bill_id[0] + bill_id.split(' ')[-1]

    # if chamber comes in as House/Senate convert to lower/upper
    if chamber == 'Senate':
        chamber = 'upper'
    elif chamber == 'House':
        chamber = 'lower'

    bill_detail_url = (
        'http://www.ncleg.net/gascripts/'
        'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all'
    ) % (session, bill_id)

    # parse the bill data page, finding the latest html text
    data = self.get(bill_detail_url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(bill_detail_url)

    title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
    if 'Joint Resolution' in title_div_txt:
        bill_type = 'joint resolution'
        bill_id = bill_id[0] + 'JR ' + bill_id[1:]
    elif 'Resolution' in title_div_txt:
        bill_type = 'resolution'
        bill_id = bill_id[0] + 'R ' + bill_id[1:]
    elif 'Bill' in title_div_txt:
        bill_type = 'bill'
        bill_id = bill_id[0] + 'B ' + bill_id[1:]

    bill_title = doc.xpath(
        '/html/body/div/div/main/div[2]/div[contains(@class,"col-12")]/a')[0]
    bill_title = bill_title.text_content().strip()

    bill = Bill(bill_id, legislative_session=session, title=bill_title,
                chamber=chamber, classification=bill_type)
    bill.add_source(bill_detail_url)

    # skip first PDF link (duplicate link to cur version)
    if chamber == 'lower':
        link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
    else:
        link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
    for vlink in doc.xpath(link_xpath)[1:]:
        # get the name from the PDF link...
        version_name = vlink.text.replace(u'\xa0', ' ')
        version_url = vlink.attrib['href']
        media_type = 'text/html'
        if version_url.lower().endswith(".pdf"):
            media_type = 'application/pdf'
        bill.add_version_link(version_name, version_url,
                              media_type=media_type, on_duplicate='ignore')

    # rows with 'Adopted' in the text and an amendment link; skip failed amendments
    for row in doc.xpath('//div[@class="card-body"]/div[contains(., "Adopted")'
                         ' and contains(@class,"row")]//a[@title="Amendment"]'):
        version_url = row.xpath('@href')[0]
        version_name = row.xpath('string(.)').strip()
        bill.add_version_link(version_name, version_url,
                              media_type='application/pdf', on_duplicate='ignore')

    # sponsors
    spon_row = doc.xpath('//div[contains(text(), "Sponsors")]/following-sibling::div')[0]
    # first sponsors are primary, until we see (Primary)
    spon_type = 'primary'
    spon_lines = spon_row.text_content().replace('\r\n', ';')
    for leg in spon_lines.split(';'):
        name = leg.replace(u'\xa0', ' ').strip()
        if name.startswith('(Primary)') or name.endswith('(Primary)'):
            name = name.replace('(Primary)', '').strip()
            spon_type = 'cosponsor'
        if not name:
            continue
        bill.add_sponsorship(name, classification=spon_type, entity_type='person',
                             primary=(spon_type == 'primary'))

    # keywords
    kw_row = doc.xpath('//div[contains(text(), "Keywords:")]/following-sibling::div')[0]
    for subject in kw_row.text_content().split(', '):
        bill.add_subject(subject)

    # actions
    action_tr_xpath = (
        '//h6[contains(text(), "History")]'
        '/ancestor::div[contains(@class, "gray-card")]'
        '//div[contains(@class, "card-body")]'
        '/div[@class="row"]'
    )
    # skip two header rows
    for row in doc.xpath(action_tr_xpath):
        cols = row.xpath('div')
        act_date = cols[1].text
        actor = cols[3].text or ''
        # if text is blank, try diving in
        action = (cols[5].text or '').strip() or cols[5].text_content().strip()

        act_date = dt.datetime.strptime(act_date, '%m/%d/%Y').strftime('%Y-%m-%d')

        if actor == 'Senate':
            actor = 'upper'
        elif actor == 'House':
            actor = 'lower'
        else:
            actor = 'executive'

        for pattern, atype in self._action_classifiers.items():
            if action.startswith(pattern):
                break
        else:
            atype = None

        bill.add_action(action, act_date, chamber=actor, classification=atype)

    # TODO: Fix vote scraper
    # yield from self.scrape_votes(bill, doc)

    yield bill
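# Note on the classifier loop above: Python's for/else runs the else block only
# when the loop finishes without `break`, so `atype` falls back to None exactly
# when no pattern matched. A minimal standalone equivalent of that lookup:
def classify(action, classifiers):
    for pattern, atype in classifiers.items():
        if action.startswith(pattern):
            return atype
    return None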
def scrape_bill(self, chamber, session, bill_id, short_title=None): """ Scrapes documents, actions, vote counts and votes for bills from the 2009 session and above. """ url = BILL_URL % (session, bill_id.replace(' ', '')) bill_page = self.get(url, verify=False).text html = lxml.html.fromstring(bill_page) html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session) bill_tables = html.xpath('//table[contains(@class, "bill-table")]') title = bill_tables[1].text_content().strip() bill_type = get_bill_type(bill_id) bill = Bill(legislative_session=session, chamber=chamber, identifier=bill_id, title=title, classification=bill_type) bill.add_source(url) for subject in self._subjects[bill_id.replace(' ', '')]: bill.add_subject(subject) if short_title and title.lower() != short_title.lower(): bill.add_title(short_title, 'short title') # documents doc_links = html.xpath('//div[contains(@class,"pf-content")]//a') for link in doc_links: name = link.text_content().strip() href = link.get('href') if 'Engrossment' in name or 'Bill Text' in name: bill.add_version_link(note=name, url=href, media_type="application/pdf") else: bill.add_document_link(note=name, url=href, media_type="application/pdf") def _split(string): return re.split(r"\w+[,|AND]\s+", string) # sponsors range from a committee to one legislator to a group of legs sponsor_lists = bill_tables[0].text_content().split('by') if len(sponsor_lists) > 1: for sponsors in sponsor_lists[1:]: if 'COMMITTEE' in sponsors.upper(): bill.add_sponsorship(name=sponsors.strip(), entity_type="organization", primary=True, classification='primary') else: for person in _split(sponsors): person = person.strip() if person != "": bill.add_sponsorship(classification='primary', name=person, entity_type="person", primary=True) actor = chamber last_date = None for row in bill_tables[2]: # lots of empty rows if len(row) == 1: continue _, date, action, _ = [x.text_content().strip() for x in row] if date: last_date = date else: date = last_date date = datetime.datetime.strptime(date + '/' + session[0:4], "%m/%d/%Y").strftime('%Y-%m-%d') if action.startswith('House'): actor = 'lower' elif action.startswith('Senate'): actor = 'upper' # votes if 'AYES' in action or 'NAYS' in action: yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url) # bill.add_vote_event(vote) # some td's text is seperated by br elements if len(row[2]): action = "".join(row[2].itertext()) action = action.replace(u'\xa0', ' ').strip() atype = get_action(actor, action) bill.add_action(action, date, chamber=actor, classification=atype) # after voice vote/roll call and some actions the bill is sent # 'to House' or 'to Senate' if 'to House' in action: actor = 'lower' elif 'to Senate' in action: actor = 'upper' yield bill
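# Sketch: the Idaho action table above omits the date on continuation rows, so
# the scraper carries `last_date` forward. The same idea as a generator over
# (date, action) pairs, with missing dates filled in (hypothetical helper, not
# part of the scraper):
def fill_dates(rows):
    last_date = None
    for date, action in rows:
        if date:
            last_date = date
        yield last_date, action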
def scrape_bills(self, chamber, session, subjects):
    idex = bill_start_numbers(session)[chamber]
    FROM = "ctl00$rilinContent$txtBillFrom"
    TO = "ctl00$rilinContent$txtBillTo"
    YEAR = "ctl00$rilinContent$cbYear"

    blocks = "FOO"  # Ugh.
    while len(blocks) > 0:
        default_headers = get_default_headers(SEARCH_URL)
        default_headers[FROM] = idex
        default_headers[TO] = idex + MAXQUERY
        default_headers[YEAR] = session
        idex += MAXQUERY

        blocks = self.parse_results_page(
            self.post(SEARCH_URL, data=default_headers).text)
        blocks = blocks[1:-1]
        blocks = self.digest_results_page(blocks)

        for block in blocks:
            bill = blocks[block]
            title = bill['title'][len("ENTITLED, "):]
            billid = bill['bill_id']
            # single subject lookup (this was previously done twice)
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                subs = []

            for b in BILL_NAME_TRANSLATIONS:
                if billid[:len(b)] == b:
                    billid = (BILL_NAME_TRANSLATIONS[b] +
                              billid[len(b) + 1:].split()[0])

            b = Bill(
                billid,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self.get_type_by_name(bill['bill_id']),
            )
            b.subject = subs

            # keep bill ID around
            self._bill_id_by_type[(chamber, re.findall(r'\d+', billid)[0])] = billid

            self.process_actions(bill['actions'], b)

            sponsors = bill['sponsors'][len("BY"):].strip()
            sponsors = sponsors.split(",")
            sponsors = [s.strip() for s in sponsors]

            for href in bill['bill_id_hrefs']:
                b.add_version_link(
                    href.text, href.attrib['href'],
                    media_type="application/pdf")

            for sponsor in sponsors:
                b.add_sponsorship(
                    sponsor, entity_type='person',
                    classification='primary', primary=True)

            b.add_source(SEARCH_URL)
            yield b
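# Sketch: the while loop above pages through search results by posting
# successive [idex, idex + MAXQUERY] windows until a page comes back empty.
# The windowing on its own (names mirror the loop; `get_page` is a stand-in
# for the post/parse/digest steps):
def iter_windows(start, width, get_page):
    idex = start
    while True:
        page = get_page(idex, idex + width)
        if not page:
            return
        yield page
        idex += width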
def scrape_bills(self, session, year_abr):
    # Main Bill information
    main_bill_csv = self.access_to_csv('MainBill')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title... just skip it
        if not title:
            continue

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            legislative_session=session,
            classification=self._bill_types[bill_type[1:]],
        )
        if rec['IdenticalBillNumber'].strip():
            bill.add_related_bill(
                rec['IdenticalBillNumber'].split()[0],
                legislative_session=session,
                relation_type='companion',
            )

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_csv = self.access_to_csv('BillSpon')
    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in sponsor database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsorship(name, classification=sponsor_type,
                             entity_type='person',
                             primary=sponsor_type == 'primary')

    # Documents
    bill_document_csv = self.access_to_csv('BillWP')
    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]

        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
            year_abr,
            document.replace('.DOC', '.HTM'),
        )

        # name document based on _doctypes
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' % (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']

        # Clean HTMX links.
        if htm_url.endswith('HTMX'):
            htm_url = re.sub('X$', '', htm_url)

        if rec['DocType'] in self._version_types:
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            else:
                # unknown extension; leave media_type blank rather than
                # letting `mimetype` go unbound
                mimetype = ''
            try:
                bill.add_version_link(doc_name, htm_url, media_type=mimetype)
            except ValueError:
                self.warning("Couldn't find a document for bill {}".format(bill_id))
        else:
            bill.add_document_link(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        'A%s' % year_abr,
        'A%s' % next_year,
        'S%s' % year_abr,
        'S%s' % next_year,
        'CA%s-%s' % (year_abr, next_year),
        'CS%s-%s' % (year_abr, next_year),
    ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zippedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                vote_file = io.TextIOWrapper(zippedfile.open(vfile, 'rU'))
            except KeyError:
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                self.warning("No such file: %s" % (vfile))
                continue

            vdict_file = csv.DictReader(vote_file)

            votes = {}
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:
                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]
                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                    vote_parts = (bill_id, chamber, action)
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N), use [0:1] to ignore ''
                    leg_vote = rec['LegislatorVote'][0:1]
                    committee = rec['Committee_House']
                    vote_parts = (bill_id, chamber, action, committee)

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = '_'.join(vote_parts).replace(' ', '_')

                if bill_id[0] == 'A':
                    b_chamber = "lower"
                else:
                    b_chamber = "upper"

                if vote_id not in votes:
                    votes[vote_id] = VoteEvent(
                        start_date=TIMEZONE.localize(date),
                        chamber=chamber,
                        motion_text=action,
                        classification='passage',
                        result=None,
                        bill=bill_id,
                        bill_chamber=b_chamber,
                        legislative_session=session,
                    )

                if leg_vote == "Y":
                    votes[vote_id].vote('yes', leg)
                elif leg_vote == "N":
                    votes[vote_id].vote('no', leg)
                else:
                    votes[vote_id].vote('other', leg)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                counts = collections.defaultdict(int)
                for count in vote.votes:
                    counts[count['option']] += 1
                vote.set_count('yes', counts['yes'])
                vote.set_count('no', counts['no'])
                vote.set_count('other', counts['other'])

                # Veto override.
                if vote.motion_text == 'OVERRIDE':
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3 of each chamber: 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    if vote.chamber == 'lower':
                        vote.result = 'pass' if counts['yes'] >= 54 else 'fail'
                    elif vote.chamber == 'upper':
                        vote.result = 'pass' if counts['yes'] >= 27 else 'fail'
                else:
                    # Regular vote.
                    vote.result = 'pass' if counts['yes'] > counts['no'] else 'fail'

                vote.add_source('http://www.njleg.state.nj.us/downloads.asp')
                yield vote

        # remove temp file
        os.remove(s_vote_zip)

    # Actions
    bill_action_csv = self.access_to_csv('BillHist')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in action database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(
            action,
            date=TIMEZONE.localize(date),
            classification=atype,
            chamber=actor,
        )

    # Subjects
    subject_csv = self.access_to_csv('BillSubj')
    for rec in subject_csv:
        bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in subject database' % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.subject.append(rec['SubjectKey'])
        else:
            self.warning('invalid bill id in BillSubj: %s' % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.values():
        # add sources
        if not bill.actions and not bill.versions:
            self.warning('probable phony bill detected %s', bill.identifier)
            phony_bill_count += 1
        else:
            bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
            yield bill

    if phony_bill_count:
        self.warning('%s total phony bills detected', phony_bill_count)
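# Sketch: the counting block above tallies recorded votes with a defaultdict
# and then sets the official counts from the tally. Standalone form of that
# step, assuming a pupa VoteEvent whose .votes entries carry an 'option' key
# as they do above:
import collections

def tally(vote_event):
    counts = collections.defaultdict(int)
    for v in vote_event.votes:
        counts[v['option']] += 1
    for option in ('yes', 'no', 'other'):
        vote_event.set_count(option, counts[option])
    return counts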
def scrape_bill(self, bill_id):
    old = self.api('bills/' + bill_id + '?')

    # not needed
    old.pop('id')
    old.pop('state')
    old.pop('level', None)
    old.pop('country', None)
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('action_dates')
    old.pop('+bill_type', None)
    old.pop('+subject', None)
    old.pop('+scraped_subjects', None)
    old.pop('subjects', [])

    classification = old.pop('type')

    # ca weirdness
    if 'fiscal committee' in classification:
        classification.remove('fiscal committee')
    if 'urgency' in classification:
        classification.remove('urgency')
    if 'local program' in classification:
        classification.remove('local program')
    if 'tax levy' in classification:
        classification.remove('tax levy')

    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return

    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if classification == ['concurrent memorial resolution'] and self.state == 'ar':
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']
    if classification == ['address'] and self.state == 'nh':
        classification = ['resolution']

    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'

    chamber = old.pop('chamber')
    if self.state in ('ne', 'dc'):
        chamber = 'legislature'
    elif chamber in ('joint', 'conference'):
        chamber = 'legislature'

    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)

    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')

    for title in old.pop('alternate_titles'):
        new.add_title(title)

    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'],
                             media_type=doc.pop('mimetype', ''))

    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)

    for spon in old.pop('sponsors'):
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')

    for act in old.pop('actions'):
        actor = act['actor']
        if actor.lower() in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif actor.lower() == 'house' or (
                actor.lower().startswith('lower (') and self.state == 'ca'):
            actor = 'lower'
        elif actor.lower() in ('senate', 'upper') or (
                actor.lower().startswith('upper (') and self.state == 'ca'):
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst', 'Became Law w',
                       'conference') or (
                actor.lower().startswith('legislature (') and self.state == 'ca'):
            actor = 'legislature'

        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'

        # nebraska & DC
        if actor in ('upper', 'council') and self.state in ('ne', 'dc'):
            actor = 'legislature'

        if act['action']:
            newact = new.add_action(
                act['action'], act['date'][:10], chamber=actor,
                classification=[action_types[c] for c in act['type']
                                if c != 'other'])
            # loop variable renamed so it doesn't shadow the `re` module
            for entity in act.get('related_entities', []):
                if entity['type'] == 'committee':
                    entity['type'] = 'organization'
                elif entity['type'] == 'legislator':
                    entity['type'] = 'person'
                newact.add_related_entity(entity['name'], entity['type'])

    for comp in old.pop('companions', []):
        if self.state in ('nj', 'ny', 'mn'):
            rtype = 'companion'
            new.add_related_bill(comp['bill_id'], comp['session'], rtype)

    for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids', []):
        new.add_identifier(abid)

    # generic OpenStates stuff
    for id in old.pop('all_ids'):
        new.add_identifier(id, scheme='openstates')

    for source in old.pop('sources'):
        source.pop('retrieved', None)
        new.add_source(**source)

    ext_title = old.pop('+extended_title', None)
    if ext_title:
        new.add_title(ext_title, note='Extended Title')
    official_title = old.pop('+official_title', None)
    if official_title:
        new.add_title(official_title, note='Official Title')

    to_extras = ['+status', '+final_disposition', '+volume_chapter', '+ld_number',
                 '+referral', '+companion', '+description', '+fiscal_note_probable:',
                 '+preintroduction_required:', '+drafter', '+category:', '+chapter',
                 '+requester', '+transmittal_date:', '+by_request_of',
                 '+bill_draft_number:', '+bill_lr', '+bill_url', '+rcs_num',
                 '+fiscal_note', '+impact_clause', '+fiscal_notes', '+short_title',
                 '+type_', '+conference_committee', 'conference_committee',
                 '+companion_bill_ids', '+additional_information']
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # votes
    vote_no = 1
    for vote in old.pop('votes'):
        vote.pop('id')
        vote.pop('state')
        vote.pop('bill_id')
        vote.pop('bill_chamber', None)
        vote.pop('+state', None)
        vote.pop('+country', None)
        vote.pop('+level', None)
        vote.pop('+vacant', None)
        vote.pop('+not_voting', None)
        vote.pop('+amended', None)
        vote.pop('+excused', None)
        vote.pop('+NV', None)
        vote.pop('+AB', None)
        vote.pop('+P', None)
        vote.pop('+V', None)
        vote.pop('+E', None)
        vote.pop('+EXC', None)
        vote.pop('+EMER', None)
        vote.pop('+present', None)
        vote.pop('+absent', None)
        vote.pop('+seconded', None)
        vote.pop('+moved', None)
        vote.pop('+vote_type', None)
        vote.pop('+actual_vote', None)
        vote.pop('+skip_votes', None)
        vote.pop('vote_id')
        vote.pop('+bill_chamber', None)
        vote.pop('+session', None)
        vote.pop('+bill_id', None)
        vote.pop('+bill_session', None)
        vote.pop('committee', None)
        vote.pop('committee_id', None)

        vtype = vote.pop('type', 'passage')
        if vtype == 'veto_override':
            vtype = ['veto-override']
        elif vtype == 'amendment':
            vtype = ['amendment-passage']
        elif vtype == 'other':
            vtype = ''
        else:
            vtype = ['bill-passage']

        # most states need identifiers for uniqueness, just do it everywhere
        identifier = vote['date'] + '-' + str(vote_no)
        vote_no += 1

        chamber = vote.pop('chamber')
        if chamber == 'upper' and self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber == 'joint':
            chamber = 'legislature'

        newvote = VoteEvent(legislative_session=vote.pop('session'),
                            motion_text=vote.pop('motion'),
                            result='pass' if vote.pop('passed') else 'fail',
                            chamber=chamber,
                            start_date=vote.pop('date'),
                            classification=vtype,
                            bill=new,
                            identifier=identifier)
        for vt in ('yes', 'no', 'other'):
            newvote.set_count(vt, vote.pop(vt + '_count'))
            for name in vote.pop(vt + '_votes'):
                newvote.vote(vt, name['name'])
        for source in vote.pop('sources'):
            source.pop('retrieved', None)
            newvote.add_source(**source)
        if not newvote.sources:
            newvote.sources = new.sources

        to_extras = ['+record', '+method', 'method', '+filename', 'record',
                     '+action', '+location', '+rcs_num', '+type_', '+threshold',
                     '+other_vote_detail', '+voice_vote']
        for k in to_extras:
            v = vote.pop(k, None)
            if v:
                newvote.extras[k.replace('+', '')] = v

        assert not vote, vote.keys()
        yield newvote

    assert not old, old.keys()
    yield new
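# Sketch: the actor-normalization logic above collapses many billy-era actor
# strings into the four pupa chambers. A condensed, table-driven version of the
# common cases (the full function above also handles per-state quirks this
# sketch omits):
EXECUTIVE_ACTORS = {'governor', 'mayor', 'secretary of state'}

def normalize_actor(actor):
    low = actor.lower()
    if low in EXECUTIVE_ACTORS:
        return 'executive'
    if low == 'house':
        return 'lower'
    if low == 'senate':
        return 'upper'
    return 'legislature'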
def scrape(self, session=None):
    HTML_TAGS_RE = r'<.*?>'

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = 'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)['data'] or []

    bills_url = 'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.format(
            year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}

        # Identify the bill type and chamber
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'
        else:
            raise AssertionError(
                "Unknown bill type found: '{}'".format(info['BillNumber']))

        bill_id = info['BillNumber'].replace('.', '').replace(' ', '')
        # put one space back in between type and number
        bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info['Title'],
            classification=bill_type
        )
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.format(
            year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li'
        )
        sponsor_type = 'primary'
        for sponsor in sponsors:
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue

            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            if sponsor_name and not \
                    (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type='person',
                    primary=(sponsor_type == 'primary')
                )

        # Capture bill text versions.
        # Warning: There's a TODO in VT's source code saying 'move this to
        # where it used to be', so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a |'
            '//ul[@class="bill-path"]//a'
        )
        for version in versions:
            if version.xpath('text()'):
                bill.add_version_link(
                    note=version.xpath('text()')[0],
                    url=version.xpath('@href')[0].replace(' ', '%20'),
                    media_type='application/pdf'
                )

        # Identify the internal bill ID, used for actions and votes.
        # If there is no internal bill ID, then it has no extra information.
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode('utf-8')
            ).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".format(
                info['BillNumber']))
            yield bill
            continue

        # Capture actions
        actions_url = \
            'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.format(
                year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            action = {k: v for k, v in action.items() if v is not None}

            if "Signed by Governor" in action['FullStatus']:
                actor = 'executive'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                # assert chambers_passed == set("HS")
                action_type = 'executive-signature'
            elif "Vetoed by the Governor" in action['FullStatus']:
                action_type = 'executive-veto'
            elif "Read first time" in action['FullStatus'] \
                    or "Read 1st time" in action['FullStatus']:
                action_type = 'introduction'
            elif "Reported favorably" in action['FullStatus']:
                action_type = 'committee-passage-favorable'
            elif actor == 'lower' and any(x.lower().startswith('aspassed')
                                          for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("H")
            elif actor == 'upper' and any(x.lower().startswith(' aspassed') or
                                          x.lower().startswith('aspassed')
                                          for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("S")
            else:
                action_type = None

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                date=datetime.datetime.strftime(
                    datetime.datetime.strptime(action['StatusDate'], '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                chamber=actor,
                classification=action_type
            )

        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = ('http://legislature.vermont.gov/bill/'
                             'loadBillRollCallDetails/{0}/{1}'.format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                (member_name, _district) = member['MemberName'].split(" of ")
                member_name = member_name.strip()

                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            if ("Passed -- " in vote['FullStatus'] or
                    "Veto of Governor overridden" in vote['FullStatus']):
                did_pass = True
            elif ("Failed -- " in vote['FullStatus'] or
                    'Veto of the Governor sustained' in vote['FullStatus']):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

            vote_to_add = VoteEvent(
                bill=bill,
                chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                start_date=datetime.datetime.strftime(
                    datetime.datetime.strptime(vote['StatusDate'], '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                motion_text=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                result='pass' if did_pass else 'fail',
                classification='passage',
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)

            vote_to_add.set_count('yes', yea_count)
            vote_to_add.set_count('no', nay_count)
            vote_to_add.set_count('not voting', len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote('not voting', member)

            yield vote_to_add

        # Capture extra information -- not yet implemented:
        # Witnesses:
        #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members:
        #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings:
        #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        yield bill
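# Sketch: the VT roll calls expose both per-member votes and "Yeas = N" totals,
# so the two can be cross-checked before the VoteEvent is emitted -- a
# consistency check the scraper above does not currently perform. Assumes the
# "Yeas = N" / "Nays = N" text is present, as the count parsing above does.
import re

def counts_consistent(full_status, yea_names, nay_names):
    yeas = int(re.search(r'Yeas = (\d+)', full_status).group(1))
    nays = int(re.search(r'Nays = (\d+)', full_status).group(1))
    return yeas == len(yea_names) and nays == len(nay_names)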
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)

        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]
        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(), bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()
        summary = ''  # stays empty if the bill has no versions

        # Get digest text (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]
            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)

        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower', 'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []
            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # materialize the filter so it can be checked and zipped
                # below without being exhausted
                committees = list(filter(None, committees))
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(act_str, date.strftime('%Y-%m-%d'),
                                       chamber=actor,
                                       classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()
            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ', '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$', '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = (
                'http://leginfo.legislature.ca.gov/faces'
                '/billVotesClient.xhtml?bill_id={}'
            ).format(fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill

    self.session.expire_all()
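# Sketch: the motion cleanup above is a fixed sequence of re.sub calls.
# Expressed as a data-driven pipeline -- patterns copied from the code above;
# behavior intended to match, not verified against CA data:
import re

MOTION_CLEANUPS = [
    (re.compile(r'(\w+)( Extraordinary)? Session$', re.IGNORECASE), ''),
    (re.compile(r'^(Senate|Assembly) ', re.IGNORECASE), ''),
    (re.compile(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '), ''),
    (re.compile(r' \(\w+\)$'), ''),
    (re.compile(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$'), ''),
    (re.compile(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? Urgency Clause$'),
     '(Urgency Clause)'),
    (re.compile(r'\s+'), ' '),
]

def clean_motion(motion):
    motion = motion.strip()
    for pattern, repl in MOTION_CLEANUPS:
        motion = pattern.sub(repl, motion)
    return motion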
def scrape_bill(self, chamber, session, bill_id, bill_type, url): doc = lxml.html.fromstring(self.get(url).text) doc.make_links_absolute(url) title = doc.xpath('//b[text()="TITLE:"]') if title: title = title[0].tail.strip().strip('"') else: self.warning("skipping bill %s, no information" % url) return bill = Bill( bill_id, title=title, chamber=chamber, classification=bill_type, legislative_session=session, ) bill.add_source(url) # Get sponsors spons_str = doc.xpath( '//b[contains(text(), "SPONSOR")]')[0].tail.strip() sponsors_match = re.match( '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str) if sponsors_match: sponsors = sponsors_match.group(2).split(',') sponsor = sponsors[0].strip() if sponsor: bill.add_sponsorship( sponsors[0], entity_type='person', classification='primary', primary=True, ) for sponsor in sponsors[1:]: sponsor = sponsor.strip() if sponsor: bill.add_sponsorship( sponsor, entity_type='person', classification='cosponsor', primary=False, ) else: # Committee sponsorship spons_str = spons_str.strip() if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str): spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$', '', spons_str).title() spons_str = (spons_str + " Committee (by request of the governor)") if spons_str: bill.add_sponsorship( spons_str, entity_type='person', classification='primary', primary=True, ) # Get actions from second myth table self._current_comm = None act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:] for row in act_rows: date, journal, raw_chamber, action = row.xpath('td') act_date = datetime.datetime.strptime(date.text_content().strip(), '%m/%d/%y') raw_chamber = raw_chamber.text_content().strip() action = action.text_content().strip() if raw_chamber == "(H)": act_chamber = "lower" elif raw_chamber == "(S)": act_chamber = "upper" if re.match("\w+ Y(\d+)", action): vote_href = journal.xpath('.//a/@href') if vote_href: yield from self.parse_vote(bill, action, act_chamber, act_date, vote_href[0]) action, atype = self.clean_action(action) match = re.match('^Prefile released (\d+/\d+/\d+)$', action) if match: action = 'Prefile released' act_date = datetime.datetime.strptime(match.group(1), '%m/%d/%y') bill.add_action(action, chamber=act_chamber, date=act_date.strftime('%Y-%m-%d'), classification=atype) # Get subjects for subj in doc.xpath('//a[contains(@href, "subject")]/text()'): bill.add_subject(subj.strip()) # Get versions text_list_url = ("http://www.legis.state.ak.us/" "basis/get_fulltext.asp?session=%s&bill=%s") % ( session, bill_id) bill.add_source(text_list_url) text_doc = lxml.html.fromstring(self.get(text_list_url).text) text_doc.make_links_absolute(text_list_url) for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'): name = link.xpath('../preceding-sibling::td/text()')[0].strip() text_url = link.get('href') bill.add_version_link(name, text_url, media_type="text/html") # Get documents doc_list_url = ("http://www.legis.state.ak.us/" "basis/get_documents.asp?session=%s&bill=%s") % ( session, bill_id) doc_list = lxml.html.fromstring(self.get(doc_list_url).text) doc_list.make_links_absolute(doc_list_url) bill.add_source(doc_list_url) for href in doc_list.xpath( '//a[contains(@href, "get_documents")][@onclick]'): h_name = href.text_content() h_href = href.attrib['href'] if h_name.strip(): bill.add_document_link(h_name, h_href) yield bill
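# Sketch: the raw_chamber branch above leaves act_chamber unset when a row is
# neither "(H)" nor "(S)". A dict lookup with an explicit default makes the
# fallback visible; defaulting to the bill's own chamber is an assumption, not
# what the scraper does.
CHAMBER_CODES = {'(H)': 'lower', '(S)': 'upper'}

def classify_chamber(raw_chamber, default):
    return CHAMBER_CODES.get(raw_chamber.strip(), default)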
def scrape_bill(self, session, chamber, bill_id, title, url, strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub): html = self.get(url).text page = lxml.html.fromstring(html) page.make_links_absolute(url) bill_type = self.bill_types[bill_id.split()[0][1:]] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.add_source(url) xpath = ('//strong[contains(., "SUBJECT")]/../' 'following-sibling::td/a/text()') bill.subject = page.xpath(xpath) for version in self.scrape_versions(session, chamber, page, bill_id): bill.add_version_link(**version) self.scrape_amendments(page, bill) # Resolution pages have different html. values = {} trs = page.xpath('//div[@id="bhistcontent"]/table/tr') for tr in trs: heading = tr.xpath('td/strong/text()') if heading: heading = heading[0] else: continue value = tr.text_content().replace(heading, '').strip() values[heading] = value # summary was always same as title # bill['summary'] = values['SUMMARY:'] # Add primary sponsor. primary = strip_sponsors('', values.get('LEAD SPONSOR:', '')) if primary: bill.add_sponsorship(name=primary, classification='primary', entity_type='person', primary=True) # Add cosponsors. if values.get('SPONSORS:'): sponsors = strip_sponsors('', values['SPONSORS:']) sponsors = re.split(r', (?![A-Z]\.)', sponsors) for name in sponsors: name = name.strip(', \n\r') if name: # Fix name splitting bug where "Neale, D. Hall" match = re.search(r'(.+?), ([DM]\. Hall)', name) if match: for name in match.groups(): bill.add_sponsorship(name=name, classification='cosponsor', entity_type='person', primary=False) else: bill.add_sponsorship(name=name, classification='cosponsor', entity_type='person', primary=False) for link in page.xpath("//a[contains(@href, 'votes/house')]"): yield from self.scrape_house_vote(bill, link.attrib['href']) for tr in reversed( page.xpath("//table[@class='tabborder']/descendant::tr")[1:]): tds = tr.xpath('td') if len(tds) < 3: continue chamber_letter = tds[0].text_content() chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter] # Index of date info no longer varies on resolutions. date = tds[2].text_content().strip() date = datetime.datetime.strptime(date, "%m/%d/%y").date() action = tds[1].text_content().strip() if action.lower().startswith('passed senate'): for href in tds[1].xpath('a/@href'): yield from self.scrape_senate_vote(bill, href, date) attrs = dict(chamber=chamber, description=action, date=date.strftime("%Y-%m-%d")) temp = self.categorizer.categorize(action) related_entities = [] for key, values in temp.items(): if key != 'classification': for value in values: related_entities.append({"type": key, "name": value}) attrs.update(classification=temp['classification'], related_entities=related_entities) bill.add_action(**attrs) yield bill
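One detail worth calling out in the signature above: `strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub` binds the compiled pattern's `sub` method once, at function-definition time, so each call simply deletes any short parenthesized run. A standalone sketch of the idiom (the function name is invented):

import re

def clean(text, strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):
    # '' as the first argument to sub() deletes every match.
    return strip_sponsors('', text)

print(clean('Doe (Lead Sponsor)'))  # -> 'Doe'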
def _scrape_bill(self, session, bill_data):
    details = self._parse_bill_details(bill_data)

    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (prefix, number, active_version)) = details

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=bill_chamber,
        title=title or bill_data['summary'],
        classification=bill_type,
    )

    if bill_data['summary']:
        bill.add_abstract(bill_data['summary'], note='')

    bill_active_version = bill_data['amendments']['items'][active_version]

    # Parse sponsors.
    if bill_data['sponsor'] is not None:
        if bill_data['sponsor']['rules'] is True:
            bill.add_sponsorship(
                'Rules Committee',
                entity_type='organization',
                classification='primary',
                primary=True,
            )
        elif not bill_data['sponsor']['budget']:
            primary_sponsor = bill_data['sponsor']['member']
            bill.add_sponsorship(
                primary_sponsor['shortName'],
                entity_type='person',
                classification='primary',
                primary=True,
            )

            # There *shouldn't* be cosponsors if there is no sponsor.
            cosponsors = bill_active_version['coSponsors']['items']
            for cosponsor in cosponsors:
                bill.add_sponsorship(
                    cosponsor['shortName'],
                    entity_type='person',
                    classification='cosponsor',
                    primary=False,
                )

    # List companion bill.
    same_as = bill_active_version.get('sameAs', {})
    # Check whether the "sameAs" property is populated with at least one
    # bill; use .get() so a missing 'items' key can't raise a KeyError.
    if same_as.get('items'):
        # Get companion bill ID.
        companion_bill_id = same_as['items'][0]['basePrintNo']

        # Build companion bill session.
        start_year = same_as['items'][0]['session']
        end_year = start_year + 1
        companion_bill_session = '-'.join([str(start_year), str(end_year)])

        # Attach companion bill data.
        bill.add_related_bill(
            companion_bill_id,
            companion_bill_session,
            relation_type='companion',
        )

    # Parse actions.
    chamber_map = {
        'senate': 'upper',
        'assembly': 'lower',
    }

    for action in bill_data['actions']['items']:
        chamber = chamber_map[action['chamber'].lower()]
        action_datetime = datetime.datetime.strptime(action['date'], '%Y-%m-%d')
        action_date = action_datetime.date()

        types, _ = NYBillScraper.categorizer.categorize(action['text'])

        bill.add_action(
            action['text'],
            action_date.strftime('%Y-%m-%d'),
            chamber=chamber,
            classification=types,
        )

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List the Open Legislation API endpoint as a source.
    api_url = self.api_client.root + self.api_client.resources['bill'].format(
        session_year=session, bill_id=bill_id, summary='', detail='')
    bill.add_source(api_url)
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    # Chamber-specific processing.
    if bill_chamber == 'upper':
        # Collect votes.
        for vote_data in bill_data['votes']['items']:
            yield self._parse_senate_votes(vote_data, bill, api_url)
    elif bill_chamber == 'lower':
        assembly = AssemblyBillPage(self, session, bill, details)
        assembly.build()

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data['amendments']['items']
    for key, amendment in amendments.items():
        version = amendment['printNo']

        html_version = version + ' HTML'
        html_url = ('http://assembly.state.ny.us/leg/?sh=printbill&bn='
                    '{}&term={}'.format(bill_id, self.term_start_year))
        bill.add_version_link(
            html_version,
            html_url,
            on_duplicate='ignore',
            media_type='text/html',
        )

        pdf_version = version + ' PDF'
        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'.format(
            self.term_start_year, bill_id)
        bill.add_version_link(
            pdf_version,
            pdf_url,
            on_duplicate='ignore',
            media_type='application/pdf',
        )

    yield bill
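The seven-element unpack at the top of `_scrape_bill` is easy to misread, so here is a hypothetical sketch of the tuple shape `_parse_bill_details` appears to return. Every literal below is invented for illustration; in the real data, `active_version` keys into `bill_data['amendments']['items']`:

details = (
    'https://www.nysenate.gov/legislation/bills/2017/S100',  # senate_url
    'http://assembly.state.ny.us/leg/?bn=S100&term=2017',    # assembly_url
    'upper',                                                 # bill_chamber
    'bill',                                                  # bill_type
    'S 100',                                                 # bill_id
    'An example title',                                      # title
    ('S', '100', ''),                 # (prefix, number, active_version)
)
(senate_url, assembly_url, bill_chamber, bill_type, bill_id,
 title, (prefix, number, active_version)) = details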
def scrape(self):
    unreachable_urls = []

    for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()
        if not title or not leg_summary['Intro\xa0Date']:
            # e.g. https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
            # doesn't have an intro date
            continue

        bill_type = BILL_TYPES[leg_summary['Type']]
        bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        bill.add_source(leg_summary['url'])

        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue

        for related_bill in leg_details.get('Related files', []):
            lower_title = title.lower()
            # check the lowercased title so capitalized forms still match
            if "sundry" in lower_title or "miscellaneous" in lower_title:
                # these are omnibus
                bill.add_related_bill(identifier=related_bill['label'],
                                      legislative_session=bill.legislative_session,
                                      relation_type='replaces')
            # for now we're skipping related bills if they
            # don't contain words that make us think they're
            # in an omnibus relationship with each other

        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            if i == 0:
                primary = True
                sponsorship_type = "Primary"
            else:
                primary = False
                sponsorship_type = "Regular"

            sponsor_name = sponsor['label']

            # Do the Mayor and Clerk introduce legislation as individual
            # role holders, or as the Office of the City Clerk and the
            # Office of the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm',)):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'

            if not sponsor_name.startswith(('Misc. Transmittal',
                                            'No Sponsor',
                                            'Dept./Agency')):
                bill.add_sponsorship(sponsor_name,
                                     sponsorship_type,
                                     entity_type,
                                     primary,
                                     entity_id=_make_pseudo_id(name=sponsor_name))

        if 'Topic' in leg_details:
            for subject in leg_details['Topic'].split(','):
                bill.add_subject(subject)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")

        for action in self.history(leg_summary['url']):
            action_description = action['Action']
            try:
                action_date = self.toTime(action['Date']).date().isoformat()
            except AttributeError:
                # e.g. https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue

            if action_description:
                try:
                    responsible_org = action['Action\xa0By']['label']
                except TypeError:
                    responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'Chicago City Council'

                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=ACTION_CLASSIFICATION[action_description])

                if action_description == 'Referred':
                    try:
                        leg_details['Current Controlling Legislative Body']['label']
                        controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                    except TypeError:
                        controlling_bodies = leg_details['Current Controlling Legislative Body']
                    if controlling_bodies:
                        for controlling_body in controlling_bodies:
                            body_name = controlling_body['label']
                            if body_name.startswith("Joint Committee"):
                                act.add_related_entity(body_name, 'organization')
                            else:
                                act.add_related_entity(body_name,
                                                       'organization',
                                                       entity_id=_make_pseudo_id(name=body_name))

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    result, votes = self.extractVotes(action_detail_url)

                    if votes and result:
                        # see https://github.com/datamade/municipal-scrapers-us/issues/15
                        action_vote = VoteEvent(legislative_session=bill.legislative_session,
                                                motion_text=action_description,
                                                organization={'name': responsible_org},
                                                classification=None,
                                                start_date=action_date,
                                                result=result,
                                                bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

        bill.extras = {'local_classification': leg_summary['Type']}

        yield bill

    self.warning('unreachable urls: %r', unreachable_urls)
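Both this scraper and the Metro ones lean on `_make_pseudo_id` to defer entity resolution to import time. As far as I know, pupa implements it as a `~`-prefixed JSON blob along these lines; treat the exact format as an assumption:

import json

def _make_pseudo_id(**kwargs):
    # Hedged re-implementation for illustration only.
    return '~' + json.dumps(kwargs, sort_keys=True)

print(_make_pseudo_id(name='Committee on Finance'))
# ~{"name": "Committee on Finance"}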
def scrape_bill(self, bill_num, session):
    chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
    # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = ('http://wyoleg.gov/LsoService/api/BillInformation/{}/'
                     '{}?calendarDate='.format(session, bill_num))
    response = self.get(bill_json_url)
    bill_json = json.loads(response.content.decode('utf-8'))

    # House bills start with 'H' (e.g. 'HB0001'); everything else is a
    # Senate file. The first character must be compared against 'H':
    # a bare truthiness check is always true for a nonempty string and
    # would classify every bill as 'lower'.
    chamber = 'lower' if bill_json['bill'][0] == 'H' else 'upper'

    bill = Bill(
        identifier=bill_json['bill'],
        legislative_session=session,
        title=bill_json['catchTitle'],
        chamber=chamber,
        classification="bill",
    )

    bill.add_title(bill_json['billTitle'])

    source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, bill_json['bill'])
    bill.add_source(source_url)

    for action_json in bill_json['billActions']:
        utc_action_date = self.parse_local_date(action_json['statusDate'])

        actor = None
        if action_json['location'] and action_json['location'] in chamber_map:
            actor = chamber_map[action_json['location']]

        action = bill.add_action(
            chamber=actor,
            description=action_json['statusMessage'],
            date=utc_action_date,
            classification=categorize_action(action_json['statusMessage']),
        )

        action.extras = {'billInformationID': action_json['billInformationID']}

    if bill_json['introduced']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])
        bill.add_version_link(
            note="Introduced",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['enrolledAct']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])
        bill.add_version_link(
            note="Enrolled",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['fiscalNote']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])
        bill.add_document_link(
            note="Fiscal Note",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['digest']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])
        bill.add_document_link(
            note="Bill Digest",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['vetoes']:
        for veto in bill_json['vetoes']:
            url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
            bill.add_version_link(
                note=veto['vetoLinkText'],
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

    for amendment in bill_json['amendments']:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
            session, amendment['amendmentNumber'])

        if amendment['sponsor'] and amendment['status']:
            title = 'Amendment {} ({}) - {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
                amendment['sponsor'],
                amendment['status'],
            )
        else:
            title = 'Amendment {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
            )

        # add amendments as versions of the bill text
        version = bill.add_version_link(
            note=title,
            url=url,
            media_type="application/pdf",
        )
        version['extras'] = {
            'amendmentNumber': amendment['amendmentNumber'],
            'sponsor': amendment['sponsor'],
        }

    for sponsor in bill_json['sponsors']:
        status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
        sponsor_type = 'person' if sponsor['sponsorTitle'] else 'organization'
        bill.add_sponsorship(
            name=sponsor['name'],
            classification=status,
            entity_type=sponsor_type,
            primary=sponsor['primarySponsor']
        )

    if bill_json['summary']:
        bill.add_abstract(
            note="summary",
            abstract=bill_json['summary'],
        )

    if bill_json['enrolledNumber']:
        bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

    if bill_json['chapter']:
        bill.extras['chapter'] = bill_json['chapter']

    if bill_json['effectiveDate']:
        eff = datetime.datetime.strptime(bill_json['effectiveDate'], '%m/%d/%Y')
        bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

    bill.extras['wy_bill_id'] = bill_json['id']

    for vote_json in bill_json['rollCalls']:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
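`parse_local_date` is not shown in this file, so the following is only a guess at its behavior: parse the API's local timestamp, then convert to UTC. Both the format string and the America/Denver timezone are assumptions:

import datetime
import pytz

def parse_local_date(date_str, tz=pytz.timezone('America/Denver')):
    # Hypothetical stand-in for the scraper's real helper.
    naive = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S')
    return tz.localize(naive).astimezone(pytz.utc)

print(parse_local_date('2018-02-12T00:00:00'))  # 2018-02-12 07:00:00+00:00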
def scrape_bill(self, chamber, session, session_id, bill_id, url):
    sidebar = lxml.html.fromstring(self.get(url).text)
    sidebar.make_links_absolute("https://www.legis.iowa.gov")

    hist_url = (
        f"https://www.legis.iowa.gov/legislation/billTracking/"
        f"billHistory?billName={bill_id}&ga={session_id}"
    )
    req_session = requests.Session()
    # Fetch through the session object so later requests (including
    # scrape_subjects below) reuse its cookies and connection pool.
    req = req_session.get(hist_url)
    if req.status_code == 500:
        self.warning("500 error on {}, skipping".format(hist_url))
        return

    page = lxml.html.fromstring(req.text)
    page.make_links_absolute("https://www.legis.iowa.gov")

    title = page.xpath(
        'string(//div[@id="content"]/div[@class="divideVert"]/div/div[4]/div[2])'
    ).strip()
    if title == "":
        # Sometimes the title is moved, see
        # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
        title = page.xpath(
            'string(//div[@id="content"]/div[@class="divideVert"]/div[4]/div[2])'
        ).strip()
        if title == "":
            self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
            return

    if title.lower().startswith("in"):
        title = page.xpath("string(//table[2]/tr[3])").strip()

    if "HR" in bill_id or "SR" in bill_id:
        bill_type = ["resolution"]
    elif "HJR" in bill_id or "SJR" in bill_id:
        bill_type = ["joint resolution"]
    elif "HCR" in bill_id or "SCR" in bill_id:
        bill_type = ["concurrent resolution"]
    else:
        bill_type = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )

    bill.add_source(hist_url)

    # base url for text version (version_abbrev, session_id, bill_id)
    version_html_url_template = (
        "https://www.legis.iowa.gov/docs/"
        "publications/LG{}/{}/attachments/{}.html"
    )
    version_pdf_url_template = (
        "https://www.legis.iowa.gov/docs/publications/LG{}/{}/{}.pdf"
    )

    # get pieces of version_link
    vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
    if vpieces:
        for version in vpieces:
            version_name = version.text
            version_abbrev = version.xpath("string(@value)")

            # Get HTML document of bill version.
            version_html_url = version_html_url_template.format(
                version_abbrev.upper(), session_id, bill_id.replace(" ", "")
            )
            bill.add_version_link(
                note=version_name, url=version_html_url, media_type="text/html"
            )

            # Get PDF document of bill version.
            version_pdf_url = version_pdf_url_template.format(
                version_abbrev.upper(), session_id, bill_id.replace(" ", "")
            )
            if "Marked Up" in version_name:
                version_pdf_url = sidebar.xpath(
                    "//iframe[@id='bbContextDoc']/@src"
                )[0]
            bill.add_version_link(
                note=version_name, url=version_pdf_url,
                media_type="application/pdf"
            )

    sponsors_str = page.xpath(
        'string(//div[@id="content"]/div[@class="divideVert"]/div/div[4]/div[1])'
    ).strip()

    if re.search("^By ", sponsors_str):
        sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
    # for some bills sponsors are listed in a different format
    else:
        sponsors = re.findall(
            r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)", sponsors_str
        )

    for sponsor in sponsors:
        sponsor = sponsor.replace(" and", "").strip(" .,")

        # a few sponsors get mangled by our regex
        sponsor = {
            "Means": "Ways & Means",
            "Iowa": "Economic Growth/Rebuild Iowa",
            "Safety": "Public Safety",
            "Resources": "Human Resources",
            "Affairs": "Veterans Affairs",
            "Protection": "Environmental Protection",
            "Government": "State Government",
            "Boef": "De Boef",
        }.get(sponsor, sponsor)

        if sponsor[0].islower():
            # SSBs catch cruft in it ('charges', 'overpayments')
            # https://sunlight.atlassian.net/browse/DATA-286
            continue

        bill.add_sponsorship(
            name=sponsor,
            classification="primary",
            entity_type="person",
            primary=True,
        )

    for tr in page.xpath(
        "//table[contains(@class, 'billActionTable')][1]/tbody/tr"
    ):
        date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
        if date.startswith("***"):
            continue
        elif "No history is recorded at this time." in date:
            return
        if date == "":
            continue

        date = datetime.datetime.strptime(date, "%B %d, %Y").date()

        action = tr.xpath("string(td[3])").strip()
        action = re.sub(r"\s+", " ", action)

        # Capture any amendment links.
        version_urls = [
            link["url"] for version in bill.versions for link in version["links"]
        ]
        if "amendment" in action.lower():
            for anchor in tr.xpath(".//a[1]"):
                if "-" in anchor.text:
                    # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                    amd_pattern = "https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf"
                    amd_id = anchor.text.replace("-", "").strip()
                    amd_url = amd_pattern.format(session_id, amd_id)
                    amd_name = "Amendment {}".format(anchor.text.strip())

                    if amd_url not in version_urls:
                        bill.add_version_link(
                            note=amd_name, url=amd_url,
                            media_type="application/pdf"
                        )
                        version_urls.append(amd_url)
                    else:
                        self.info("Already Added {}, skipping".format(amd_url))

        if "S.J." in action or "SCS" in action:
            actor = "upper"
        elif "H.J." in action or "HCS" in action:
            actor = "lower"
        else:
            actor = "legislature"

        action = re.sub(r"(H|S)\.J\.\s+\d+\.$", "", action).strip()

        if action.startswith("Introduced"):
            atype = ["introduction"]
            if ", referred to" in action:
                atype.append("referral-committee")
        elif action.startswith("Read first time"):
            atype = "reading-1"
        elif action.startswith("Referred to"):
            atype = "referral-committee"
        elif action.startswith("Sent to Governor"):
            atype = "executive-receipt"
        elif action.startswith("Reported Signed by Governor"):
            atype = "executive-signature"
        elif action.startswith("Signed by Governor"):
            atype = "executive-signature"
        elif action.startswith("Vetoed by Governor"):
            atype = "executive-veto"
        elif action.startswith("Item veto"):
            atype = "executive-veto-line-item"
        elif re.match(r"Passed (House|Senate)", action):
            atype = "passage"
        elif re.match(r"Amendment (S|H)-\d+ filed", action):
            atype = ["amendment-introduction"]
            if ", adopted" in action:
                atype.append("amendment-passage")
        elif re.match(r"Amendment (S|H)-\d+( as amended,)? adopted", action):
            atype = "amendment-passage"
        elif re.match(r"Amendment (S|H)-\d+ lost", action):
            # (S|H), not (S|N): House amendments are H-prefixed.
            atype = "amendment-failure"
        elif action.startswith("Resolution filed"):
            atype = "introduction"
        elif action.startswith("Resolution adopted"):
            atype = "passage"
        elif action.startswith("Committee report") and action.endswith("passage."):
            atype = "committee-passage"
        elif action.startswith("Withdrawn"):
            atype = "withdrawal"
        else:
            atype = None

        if action.strip() == "":
            continue

        if re.search(r"END OF \d+ ACTIONS", action):
            continue

        if "$history" not in action:
            bill.add_action(
                description=action, date=date, chamber=actor,
                classification=atype
            )

    self.scrape_subjects(bill, bill_id, session, req_session)

    yield bill
def scrape(self, window=28, matter_ids=None):
    '''By default, scrape board reports updated in the last 28 days.
    Optionally specify a larger or smaller window of time from which to
    scrape updates, or specific matters to scrape.
    Note that passing a value for :matter_ids supersedes the value of
    :window, such that the given matters will be scraped regardless of
    when they were updated.

    Optional parameters
    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape
    '''
    if matter_ids:
        matters = [self.matter(matter_id) for matter_id in matter_ids.split(',')]
        # Skip matters that are not yet in Legistar
        matters = filter(None, matters)
    elif float(window):  # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()

    for matter in matters:
        # Skip this bill, until Metro cleans up the duplicate in the Legistar API
        if matter['MatterFile'] == '2017-0447':
            continue

        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        # Do not scrape private bills introduced before this timestamp.
        if self._is_restricted(matter) and (date < self.START_DATE_PRIVATE_SCRAPE):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        # The Metro scraper scrapes private bills. However, we do not want
        # to capture significant data about private bills, other than the
        # value of the helper function `_is_restricted` and a last-modified
        # timestamp. We yield private bills early, wipe data from
        # previously imported once-public bills, and include only data
        # *required* by the pupa schema.
        # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
        bill.extras = {'restrict_view': self._is_restricted(matter)}

        # Add the API source early; private bills should still carry this
        # url for debugging.
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
        bill.add_source(legistar_api, note='api')

        if self._is_restricted(matter):
            # required fields
            bill.title = 'Restricted View'

            # wipe old data
            bill.extras['plain_text'] = ''
            bill.extras['rtf_text'] = ''
            bill.sponsorships = []
            bill.related_bills = []
            bill.versions = []
            bill.documents = []
            bill.actions = []

            yield bill
            continue

        legistar_web = matter['legistar_url']
        bill.add_source(legistar_web, note='web')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name, 'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    try:
                        raw_option = vote['VoteValueName'].lower()
                    except AttributeError:
                        raw_option = None
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option, vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill, so we can find
                # its 'MatterFile' (i.e., identifier) and 'MatterIntroDate'
                # (i.e., to determine its legislative session). Sometimes the
                # related bill does not yet exist: in that case, the request
                # raises, and we skip the relation.
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                # Currently, the relation type for bills can be one of a few
                # possibilities: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting
                # that they receive a relation of 'companion'.
                bill.add_related_bill(
                    identifier=identifier,
                    legislative_session=related_bill_session,
                    relation_type='companion')

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'].strip(),
                    media_type="application/pdf")

        bill.extras['local_classification'] = matter['MatterTypeName']

        matter_version_value = matter['MatterVersion']
        text = self.text(matter_id, matter_version_value)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        yield bill
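The `window` handling above works because `datetime.timedelta` takes days, including fractional days, as its first positional argument, so `window=0.15` means roughly the last three and a half hours:

import datetime

window = 0.15
n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
print(datetime.timedelta(float(window)))  # 3:36:00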
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            self.warning("bad id/chamber pair for %s; skipping", bill_id)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()
        # Default the summary so the check further down stays safe for
        # bills with no versions.
        summary = ''

        # Get digest text (aka "summary") from the latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(version_name, version_url_pdf,
                                    media_type='application/pdf',
                                    date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer and more descriptive than the title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if (len(version.title) < len(version.short_title) and
                        not version.title.lower().startswith('an act')):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower', 'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # Materialize the filter into a list: the original filter
                # object was consumed by the assert below, leaving the zip
                # over committees empty.
                committees = list(filter(None, committees))
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
changed = False for committee_chamber in ['upper', 'lower', 'legislature']: if actor.startswith(committee_chamber): actor = committee_chamber changed = True break if not changed: actor = 'legislature' if actor != action.actor: actor_info = kwargs.get('actor_info', {}) actor_info['details'] = action.actor kwargs['actor_info'] = actor_info # Add strings for related legislators, if any. rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+' legislators = re.findall(rgx, action.action, re.I) if legislators: kwargs['legislators'] = legislators date = action.action_date date = self._tz.localize(date) date = date.date() if (actor, act_str, date) in seen_actions: continue kwargs.update(self.categorizer.categorize(act_str)) action = fsbill.add_action( act_str, date.strftime('%Y-%m-%d'), chamber=actor, classification=kwargs['classification']) for committee in kwargs.get('committees', []): action.add_related_entity(committee, entity_type='organization') seen_actions.add((actor, act_str, date)) for vote_num, vote in enumerate(bill.votes): if vote.vote_result == '(PASS)': result = True else: result = False if not vote.location: continue full_loc = vote.location.description first_part = full_loc.split(' ')[0].lower() if first_part in ['asm', 'assembly']: vote_chamber = 'lower' # vote_location = ' '.join(full_loc.split(' ')[1:]) elif first_part.startswith('sen'): vote_chamber = 'upper' # vote_location = ' '.join(full_loc.split(' ')[1:]) else: raise ScrapeError("Bad location: %s" % full_loc) if vote.motion: motion = vote.motion.motion_text or '' else: motion = '' if "Third Reading" in motion or "3rd Reading" in motion: vtype = 'passage' elif "Do Pass" in motion: vtype = 'passage' else: vtype = 'other' motion = motion.strip() # Why did it take until 2.7 to get a flags argument on re.sub? motion = re.compile(r'(\w+)( Extraordinary)? Session$', re.IGNORECASE).sub('', motion) motion = re.compile(r'^(Senate|Assembly) ', re.IGNORECASE).sub('', motion) motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ', '', motion) motion = re.sub(r' \(\w+\)$', '', motion) motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '', motion) motion = re.sub( r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ' r'Urgency Clause$', '(Urgency Clause)', motion) motion = re.sub(r'\s+', ' ', motion) if not motion: self.warning("Got blank motion on vote for %s" % bill_id) continue # XXX this is responsible for all the CA 'committee' votes, not # sure if that's a feature or bug, so I'm leaving it as is... 
# vote_classification = chamber if (vote_location == 'Floor') else 'committee' # org = { # 'name': vote_location, # 'classification': vote_classification # } fsvote = VoteEvent( motion_text=motion, start_date=self._tz.localize(vote.vote_date_time), result='pass' if result else 'fail', classification=vtype, # organization=org, chamber=vote_chamber, bill=fsbill, ) fsvote.extras = {'threshold': vote.threshold} source_url = ('http://leginfo.legislature.ca.gov/faces' '/billVotesClient.xhtml?bill_id={}').format( fsbill.identifier) fsvote.add_source(source_url) fsvote.pupa_id = source_url + '#' + str(vote_num) rc = {'yes': [], 'no': [], 'other': []} for record in vote.votes: if record.vote_code == 'AYE': rc['yes'].append(record.legislator_name) elif record.vote_code.startswith('NO'): rc['no'].append(record.legislator_name) else: rc['other'].append(record.legislator_name) # Handle duplicate votes for key in rc.keys(): rc[key] = list(set(rc[key])) for key, voters in rc.items(): for voter in voters: fsvote.vote(key, voter) # Set counts by summed votes for accuracy fsvote.set_count(key, len(voters)) yield fsvote yield fsbill self.session.expire_all()
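The tally at the end of the vote loop is a reusable pattern: bucket voters by option, de-duplicate with set(), then derive counts from the de-duplicated rolls so the totals can never drift from the recorded voters. Isolated here with made-up records:

records = [('AYE', 'Smith'), ('AYE', 'Smith'), ('NOE', 'Jones'), ('ABS', 'Lee')]

rc = {'yes': [], 'no': [], 'other': []}
for code, name in records:
    if code == 'AYE':
        rc['yes'].append(name)
    elif code.startswith('NO'):
        rc['no'].append(name)
    else:
        rc['other'].append(name)

for key in rc:
    rc[key] = list(set(rc[key]))   # drop the duplicate 'Smith' ballot
    print(key, len(rc[key]))       # yes 1 / no 1 / other 1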
def scrape_details(self, bill_detail_url, session, chamber, bill_id): """ Create the Bill and add the information obtained from the provided bill_detail_url. and then yield the bill object. :param bill_detail_url: :param session: :param chamber: :param bill_id: :return: """ page = self.get(bill_detail_url).text if 'INVALID BILL NUMBER' in page: self.warning('INVALID BILL %s' % bill_detail_url) return doc = lxml.html.fromstring(page) doc.make_links_absolute(bill_detail_url) bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0] bill_type = bill_div.xpath('span/text()')[0] if 'General Bill' in bill_type: bill_type = 'bill' elif 'Concurrent Resolution' in bill_type: bill_type = 'concurrent resolution' elif 'Joint Resolution' in bill_type: bill_type = 'joint resolution' elif 'Resolution' in bill_type: bill_type = 'resolution' else: raise ValueError('unknown bill type: %s' % bill_type) # this is fragile, but less fragile than it was b = bill_div.xpath('./b[text()="Summary:"]')[0] bill_summary = b.getnext().tail.strip() bill = Bill( bill_id, legislative_session=session, # session name metadata's `legislative_sessions` chamber=chamber, # 'upper' or 'lower' title=bill_summary, classification=bill_type ) subjects = list(self._subjects[bill_id]) for subject in subjects: bill.add_subject(subject) # sponsors for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'): bill.add_sponsorship( name=sponsor, classification='primary', primary=True, entity_type='person' ) for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'): sponsor = sponsor.replace(u'\xa0', ' ').strip() bill.add_sponsorship( name=sponsor, classification='primary', primary=True, entity_type='organization' ) # find versions version_url = doc.xpath('//a[text()="View full text"]/@href')[0] version_html = self.get(version_url).text version_doc = lxml.html.fromstring(version_html) version_doc.make_links_absolute(version_url) for version in version_doc.xpath('//a[contains(@href, "/prever/")]'): # duplicate versions with same date, use first appearance bill.add_version_link( note=version.text, # Description of the version from the state; # eg, 'As introduced', 'Amended', etc. url=version.get('href'), on_duplicate='ignore', media_type='text/html' # Still a MIME type ) # actions for row in bill_div.xpath('table/tr'): date_td, chamber_td, action_td = row.xpath('td') date = datetime.datetime.strptime(date_td.text, "%m/%d/%y") action_chamber = {'Senate': 'upper', 'House': 'lower', None: 'legislature'}[chamber_td.text] action = action_td.text_content() action = action.split('(House Journal')[0] action = action.split('(Senate Journal')[0].strip() atype = action_type(action) bill.add_action( description=action, # Action description, from the state date=date.strftime('%Y-%m-%d'), # `YYYY-MM-DD` format chamber=action_chamber, # 'upper' or 'lower' classification=atype # Options explained in the next section ) # votes vurl = doc.xpath('//a[text()="View Vote History"]/@href') if vurl: vurl = vurl[0] yield from self.scrape_vote_history(bill, vurl) bill.add_source(bill_detail_url) yield bill
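`action_type` is referenced above but not defined in this file; here is a hypothetical stand-in showing the kind of prefix matching such a helper presumably performs. The mappings are illustrative, not South Carolina's real ones:

import re

def action_type(action):
    # Invented classifier for illustration only.
    if re.match(r'Introduced', action):
        return 'introduction'
    if 'Referred to Committee' in action:
        return 'referral-committee'
    if action.startswith('Vetoed'):
        return 'executive-veto'
    return None

print(action_type('Introduced and read first time'))  # introduction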
def scrape(self): three_days_ago = datetime.datetime.now() - datetime.timedelta(3) for matter in self.matters(three_days_ago): matter_id = matter['MatterId'] date = matter['MatterIntroDate'] title = matter['MatterTitle'] identifier = matter['MatterFile'] if not all((date, title, identifier)): continue bill_session = self.session(self.toTime(date)) bill_type = BILL_TYPES[matter['MatterTypeName']] if identifier.startswith('S'): alternate_identifiers = [identifier] identifier = identifier[1:] else: alternate_identifiers = [] bill = Bill(identifier=identifier, legislative_session=bill_session, title=title, classification=bill_type, from_organization={"name": "Board of Directors"}) legistar_web = self.legislation_detail_url(matter_id) legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id) bill.add_source(legistar_web, note='web') bill.add_source(legistar_api, note='api') for identifier in alternate_identifiers: bill.add_identifier(identifier) for action, vote in self.actions(matter_id): act = bill.add_action(**action) if action['description'] == 'Referred': body_name = matter['MatterBodyName'] act.add_related_entity( body_name, 'organization', entity_id=_make_pseudo_id(name=body_name)) result, votes = vote if result: vote_event = VoteEvent( legislative_session=bill.legislative_session, motion_text=action['description'], organization=action['organization'], classification=None, start_date=action['date'], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + '/histories') for vote in votes: raw_option = vote['VoteValueName'].lower() clean_option = self.VOTE_OPTIONS.get( raw_option, raw_option) vote_event.vote(clean_option, vote['VotePersonName'].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id): bill.add_subject(topic['MatterIndexName'].strip()) bill.add_version_link( 'Board Report', 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report' .format(matter_id), media_type="application/pdf") for attachment in self.attachments(matter_id): if attachment['MatterAttachmentName']: bill.add_document_link( attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'], media_type="application/pdf") bill.extras = {'local_classification': matter['MatterTypeName']} text = self.text(matter_id) if text: if text['MatterTextPlain']: bill.extras['plain_text'] = text['MatterTextPlain'] if text['MatterTextRtf']: bill.extras['rtf_text'] = text['MatterTextRtf'].replace( u'\u0000', '') yield bill
def scrape_bill(self, session, bill_url):
    page = self.get(bill_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(bill_url)

    try:
        bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
    except IndexError:
        self.logger.warning("Something is wrong with bill page, skipping.")
        return
    secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

    # checking if there is a matching bill
    if secondary_bill_id:
        secondary_bill_id = secondary_bill_id[0].text
        # swap ids if * is in secondary_bill_id
        if '*' in secondary_bill_id:
            bill_id, secondary_bill_id = secondary_bill_id, bill_id
        secondary_bill_id = secondary_bill_id.strip()
        # normalize the non-breaking spaces the site uses
        secondary_bill_id = secondary_bill_id.replace('\xa0', ' ')

    bill_id = bill_id.replace('*', '').replace('\xa0', ' ').strip()

    if 'B' in bill_id:
        bill_type = 'bill'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'

    primary_chamber = 'lower' if 'H' in bill_id else 'upper'
    # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

    title = page.xpath("//span[@id='lblAbstract']")[0].text
    if title is None:
        msg = '%s detail page was missing title info.'
        self.logger.warning(msg % bill_id)
        return

    # bill subject
    subject_pos = title.find('-')
    subjects = [s.strip() for s in title[:subject_pos - 1].split(',')]
    subjects = filter(None, subjects)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=primary_chamber,
        title=title,
        classification=bill_type,
    )
    for subject in subjects:
        bill.add_subject(subject)

    if secondary_bill_id:
        bill.add_identifier(secondary_bill_id)

    bill.add_source(bill_url)

    # Primary Sponsor
    sponsor = page.xpath(
        "//span[@id='lblBillPrimeSponsor']")[0].text_content().split("by")[-1]
    sponsor = sponsor.replace('*', '').strip()
    if sponsor:
        bill.add_sponsorship(
            sponsor,
            classification='primary',
            entity_type='person',
            primary=True,
        )

    # bill text
    btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
    bill.add_version_link('Current Version', btext.get('href'),
                          media_type='application/pdf')

    # documents
    summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
    if summary:
        bill.add_document_link('Summary', summary[0].get('href'))
    fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
    if fiscal:
        bill.add_document_link('Fiscal Note', fiscal[0].get('href'))
    amendments = page.xpath('//a[contains(@href, "/Amend/")]')
    for amendment in amendments:
        bill.add_document_link('Amendment ' + amendment.text,
                               amendment.get('href'))
    # amendment notes live in an image whose alt text describes the doc inside <a>
    amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
    for afn in amend_fns:
        bill.add_document_link(
            afn.get('alt'), afn.getparent().get('href'), on_duplicate='ignore'
        )

    # actions
    atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
    actions_from_table(bill, atable)

    # if there is a matching bill
    if secondary_bill_id:
        # secondary sponsor
        secondary_sponsor = page.xpath(
            "//span[@id='lblCompPrimeSponsor']")[0].text_content().split("by")[-1]
        secondary_sponsor = secondary_sponsor.replace('*', '').replace(')', '').strip()
        # Skip blank-name sponsors.
        if secondary_sponsor:
            bill.add_sponsorship(
                secondary_sponsor,
                classification='primary',
                entity_type='person',
                primary=True,
            )

        # secondary actions
        cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
        actions_from_table(bill, cotable)

    # votes
    yield from self.scrape_vote_events(bill, page, bill_url)

    bill.actions.sort(key=lambda a: a['date'])

    yield bill
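`actions_from_table` is shared by the primary and companion action tables; the real helper is not shown, so this is only a sketch of its likely shape, with the column layout and date format as assumptions:

import datetime

def actions_from_table(bill, table):
    # Hypothetical: assumes the description in the first column and a
    # mm/dd/yyyy date in the second.
    for row in table.xpath('.//tr')[1:]:
        tds = row.xpath('td')
        if len(tds) < 2:
            continue
        description = tds[0].text_content().strip()
        date = datetime.datetime.strptime(
            tds[1].text_content().strip(), '%m/%d/%Y').date()
        bill.add_action(description, date.strftime('%Y-%m-%d'),
                        chamber='lower' if 'H' in bill.identifier else 'upper')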
def scrape(self, session=None, chambers=None):
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))
    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)
    else:
        chamber_dict = {"Senate": "upper", "House": "lower",
                        "House of Representatives": "lower",
                        "house": "lower", "senate": "upper"}
        # so presumably not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {"approved": True, "passed": True, "adopted": True,
                        "true": True, "false": False, "failed": False,
                        True: True, False: False}
        action_dict = {"ref_ctte_100": "referral-committee",
                       "intro_100": "introduction",
                       "pass_300": "passage",
                       "intro_110": "reading-1",
                       "refer_210": "referral-committee",
                       "crpt_301": None,
                       "crpt_317": None,
                       "concur_606": "passage",
                       "pass_301": "passage",
                       "refer_220": "referral-committee",
                       "intro_102": ["introduction", "passage"],
                       "intro_105": ["introduction", "passage"],
                       "intro_ref_ctte_100": "referral-committee",
                       "refer_209": None,
                       "intro_108": ["introduction", "passage"],
                       "intro_103": ["introduction", "passage"],
                       "msg_reso_503": "passage",
                       "intro_107": ["introduction", "passage"],
                       "imm_consid_360": "passage",
                       "refer_213": None,
                       "adopt_reso_100": "passage",
                       "msg_507": "amendment-passage",
                       "confer_713": None,
                       "concur_603": None,
                       "confer_712": None,
                       "msg_506": "amendment-failure",
                       "receive_message_100": "passage",
                       "motion_920": None,
                       "concur_611": None,
                       "confer_735": None}

        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url + "/solarapi/v1/general_assembly_{}/".format(session)
        legislators = self.get_legislator_ids(first_page)
        all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

        for row in self.get_bill_rows(session):
            number_link, ga, title, primary_sponsor, status = row.xpath('td')

            bill_id = number_link.text_content()
            title = title.text_content().strip()
            chamber = 'lower' if 'H' in bill_id else 'upper'
            classification = 'bill' if 'B' in bill_id else 'resolution'

            bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                        title=title, classification=classification)
            bill.add_source(number_link.xpath('a/@href')[0])

            # get bill from the undocumented API
            bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                            'general_assembly_{}/{}/{}/'.format(
                                session,
                                'bills' if 'B' in bill_id else 'resolutions',
                                bill_id.lower().replace(' ', '')))
            data = self.get(bill_api_url).json()

            # add title if no short title
            if not bill.title:
                bill.title = data['items'][0]['longtitle']
            bill.add_title(data['items'][0]['longtitle'], 'long title')

            # this stuff is version-specific
            for version in data['items']:
                version_name = version["version"]
                version_link = base_url + version["pdfDownloadLink"]
                bill.add_version_link(version_name, version_link,
                                      media_type='application/pdf')

            # we'll use the latest bill_version for everything else
            bill_version = data['items'][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(sponsor_name,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)
            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(sponsor_name,
                                     classification='cosponsor',
                                     entity_type='person',
                                     primary=False)

            # actions
            try:
                action_doc = self.get(base_url + bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning("Unknown action {desc} with code {code}."
                                     " Add it to the action_dict.".format(
                                         desc=action_desc,
                                         code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(datetime.datetime.strptime(
                        action["datetime"], "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(action_desc, date, chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

            # floor votes
            vote_url = base_url + bill_version["votes"][0]["link"]
            vote_doc = self.get(vote_url)
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url, base_url, bill,
                                         legislators, chamber_dict, vote_results)

            # committee votes
            vote_url = base_url + bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning("Vote page not loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url, base_url, bill,
                                         legislators, chamber_dict, vote_results)

            # we have never seen a veto or a disapprove, but they seem important,
            # so we'll check and throw an error if we find one.
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url + bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never gotten one"
                                         " before. Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url + bill_version["disapprove"][0]["link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError("Whoa, a disapprove! We've never gotten"
                                         " one before. Go write some code to"
                                         " deal with it: {}".format(disapprove_url))

            yield bill
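# add_document and get_other_data_source live elsewhere in this scraper. A
# rough sketch of the attachment step above, assuming the pre-fetched
# collection is a dict keyed by normalized bill number whose values carry a
# name and a relative link -- those key names are assumptions, not the API's
# documented shape.
def add_document(self, documents, bill_id, doc_type, bill, base_url):
    bill_key = bill_id.lower().replace(' ', '')
    for doc in documents.get(bill_key, []):
        name = doc.get('name') or doc_type
        bill.add_document_link(name, base_url + doc['link'],
                               media_type='application/pdf',
                               on_duplicate='ignore')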
def scrape_bill(self, chamber, session, bill_id, url):
    try:
        page = lxml.html.fromstring(self.get(url).text)
    except scrapelib.HTTPError as e:
        self.warning('error (%s) fetching %s, skipping' % (e, url))
        return

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    if not title:
        self.warning('blank bill on %s - skipping', url)
        return

    if 'JR' in bill_id:
        bill_type = ['joint resolution']
    elif 'CR' in bill_id:
        bill_type = ['concurrent resolution']
    elif 'R' in bill_id:
        bill_type = ['resolution']
    else:
        bill_type = ['bill']

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.add_source(url)
    bill.subject = self.subject_map[bill_id]

    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()

        if ':' in name:
            raise Exception(name)
        if 'otherAuth' in link.attrib['id']:
            bill.add_sponsorship(name, classification='cosponsor',
                                 entity_type='person', primary=False)
        else:
            bill.add_sponsorship(name, classification='primary',
                                 entity_type='person', primary=True)

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue

        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'

        attrs = self.categorizer.categorize(action)
        related_entities = []
        for item in attrs['committees']:
            related_entities.append({
                'type': 'committee',
                'name': item,
            })
        for item in attrs['legislators']:
            related_entities.append({
                'type': 'legislator',
                'name': item,
            })
        bill.add_action(description=action,
                        date=date.strftime('%Y-%m-%d'),
                        chamber=actor,
                        classification=attrs['classification'],
                        related_entities=related_entities)

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    # Keep track of already seen versions to prevent processing duplicates.
    version_urls = []
    for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        version_url = link.attrib['href']
        if version_url in version_urls:
            self.warning('Skipping duplicate version URL.')
            continue
        else:
            version_urls.append(version_url)
        name = link.text.strip()

        if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url, re.IGNORECASE):
            bill.add_document_link(note=name, url=version_url,
                                   media_type='application/pdf')
            continue

        bill.add_version_link(note=name, url=version_url,
                              media_type='application/pdf')

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        if 'HT_' not in link.attrib['href']:
            yield from self.scrape_votes(
                bill, self.urlescape(link.attrib['href']))

    # If the bill has no actions and no versions, it's a bogus bill on
    # their website, which appears to happen occasionally. Skip.
    has_no_title = (bill.title == "Short Title Not Found.")
    if has_no_title:
        # If there's no title, this is an empty page. Skip!
        return
    else:
        # Otherwise, save the bill.
        yield bill
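# self.subject_map, read above, is built ahead of time from the legislature's
# subject index. A minimal sketch of that step, assuming one heading element
# per subject with the bill numbers linked beneath it; the URL structure and
# xpaths here are illustrative assumptions, and `collections` is assumed
# imported at module top.
def _build_subject_map(self, url):
    self.subject_map = collections.defaultdict(list)
    page = lxml.html.fromstring(self.get(url).text)
    for heading in page.xpath('//div[@class="subject"]'):
        subject = heading.xpath('string(h3)').strip()
        for bill_id in heading.xpath('.//a/text()'):
            self.subject_map[bill_id.strip()].append(subject)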
def test_bill_update_subsubitem():
    create_jurisdiction()
    create_org()
    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    # initial sub-subitem
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing', 'http://example.com/test.pdf',
                          media_type='application/pdf')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['insert'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 1

    # a second subsubitem, update
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing', 'http://example.com/test.pdf',
                          media_type='application/pdf')
    bill.add_version_link('printing', 'http://example.com/test.text',
                          media_type='text/plain')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 2

    # same thing, noop
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing', 'http://example.com/test.pdf',
                          media_type='application/pdf')
    bill.add_version_link('printing', 'http://example.com/test.text',
                          media_type='text/plain')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['noop'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 2

    # different link for second one, update
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing', 'http://example.com/test.pdf',
                          media_type='application/pdf')
    bill.add_version_link('printing', 'http://example.com/diff-link.txt',
                          media_type='text/plain')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 2

    # delete one, update
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing', 'http://example.com/test.pdf',
                          media_type='application/pdf')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 1
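# create_jurisdiction and create_org are shared helpers in this test module.
# A minimal sketch of what they must set up for the importers above to
# resolve 'jid'; the Division/Jurisdiction/Organization field values are
# assumptions patterned on typical pupa test fixtures, and the opencivicdata
# Django models are assumed importable.
def create_jurisdiction():
    Division.objects.create(id='ocd-division/country:us', name='USA')
    return Jurisdiction.objects.create(id='jid',
                                       division_id='ocd-division/country:us')


def create_org():
    return Organization.objects.create(name='House', classification='lower',
                                       jurisdiction_id='jid')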
def scrape_bill(self, row, session):
    bill_id = row["LegislationDisplayCode"]

    amendment = None
    substitute = None

    if bill_id.count(" ") > 1:
        if " w/ " in bill_id:
            self.info("Found amended bill `{}`".format(bill_id))
            bill_id, amendment = bill_id.split(" w/ ")
        # A bill can _both_ be amended and be substituted
        if " for " in bill_id:
            self.info("Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(" for ")
        if amendment is None and substitute is None:
            raise ValueError("unknown bill_id format: " + bill_id)

    bill_type = self.classify_bill(bill_id)
    chamber = "upper" if bill_id.startswith("S") else "lower"

    bill = Bill(
        identifier=bill_id,
        legislative_session=session,
        chamber=chamber,
        title=row["LongTitle"],
        classification=bill_type,
    )
    if row["Synopsis"]:
        bill.add_abstract(row["Synopsis"], "synopsis")
    if row["ShortTitle"]:
        bill.add_title(row["ShortTitle"], "short title")
    if row["SponsorPersonId"]:
        self.add_sponsor_by_legislator_id(bill, row["SponsorPersonId"], "primary")
    if substitute:
        bill.extras["substitute"] = substitute
    if amendment:
        bill.extras["amendment"] = amendment

    # TODO: Is there a way to get additional sponsors and cosponsors, and versions/fns via API?
    html_url = "https://legis.delaware.gov/BillDetail?LegislationId={}".format(
        row["LegislationId"]
    )
    bill.add_source(html_url, note="text/html")

    html = self.lxmlize(html_url)

    additional_sponsors = html.xpath(
        '//label[text()="Additional Sponsor(s):"]'
        "/following-sibling::div/a/@href"
    )
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace(
            "https://legis.delaware.gov/LegislatorDetail?personId=", ""
        )
        self.add_sponsor_by_legislator_id(bill, sponsor_id, "primary")

    cosponsors = html.xpath(
        '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a/@href'
    )
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace(
            "https://legis.delaware.gov/LegislatorDetail?personId=", ""
        )
        self.add_sponsor_by_legislator_id(bill, sponsor_id, "cosponsor")

    versions = html.xpath(
        '//label[text()="Original Text:"]/following-sibling::div/a/@href'
    )
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = "Bill Text"
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row["LegislationId"])

    if row["HasAmendments"] is True:
        self.scrape_amendments(bill, row["LegislationId"])

    yield from self.scrape_votes(bill, row["LegislationId"], session)

    yield bill
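# classify_bill, called above, maps a Delaware bill identifier to an
# openstates classification. A plausible sketch of a prefix table; the exact
# prefixes and the fallback behavior are assumptions, not the scraper's
# actual implementation.
def classify_bill(self, bill_id):
    prefixes = (
        ("HB", "bill"),
        ("HCR", "concurrent resolution"),
        ("HJR", "joint resolution"),
        ("HR", "resolution"),
        ("SB", "bill"),
        ("SCR", "concurrent resolution"),
        ("SJR", "joint resolution"),
        ("SR", "resolution"),
    )
    for prefix, bill_type in prefixes:
        # the trailing space keeps "HB " from matching "HCR", etc.
        if bill_id.startswith(prefix + " "):
            return bill_type
    self.warning("could not classify %s, defaulting to bill", bill_id)
    return "bill"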
def scrape_bill_list(self, url):
    bill_list = self._get_bill_list(url)

    for bill_info in bill_list:
        (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
        (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
        (subject, ) = bill_info.xpath('td[3]//text()')
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            raise AssertionError(
                "Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title='',
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type='person',
                classification='primary',
                primary=True,
            )
        bill.add_source(url)

        bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                    'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
        bill.add_source(bill_url)

        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning(
                "Bill {} has no webpage, and will be skipped".format(bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        # note: lblShotTitle is the actual id on the state's page
        title = ''
        if bill_doc.xpath('//span[@id="ContentPlaceHolder1_lblShotTitle"]'):
            title = bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
            )[0].text_content().strip()
        if not title:
            title = "[No title given by state]"
        bill.title = title

        version_url_base = (
            'http://alisondb.legislature.state.al.us/ALISON/'
            'SearchableInstruments/{0}/PrintFiles/{1}-'.format(
                self.session, bill_id))
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()')
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + 'int.pdf'
            elif version == "Engrossed":
                version_url = version_url_base + 'eng.pdf'
            elif version == "Enrolled":
                version_url = version_url_base + 'enr.pdf'
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name))

            bill.add_version_link(
                name,
                version_url,
                media_type='application/pdf',
                on_duplicate='ignore',
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath('//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath('//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath('td[1]')[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action) +
                    "that doesn't match the bill ID ({})".format(bill_id))
                continue

            bir_date = datetime.datetime.strptime(
                bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
            bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
            bir_chamber = self.CHAMBERS[bir_type[0]]
            bir_text = "{0}: {1}".format(
                bir_type, bir.xpath('td[3]/font/text()')[0].strip())

            bill.add_action(
                bir_text,
                TIMEZONE.localize(bir_date),
                chamber=bir_chamber,
                classification='other',
            )

            try:
                (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
            except ValueError:
                bir_vote_id = ''

            bir_vote_id = bir_vote_id.strip()
            if bir_vote_id.startswith("Roll "):
                bir_vote_id = bir_vote_id.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=bir_type[0],
                    bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                    vote_id=bir_vote_id,
                    vote_date=TIMEZONE.localize(bir_date),
                    action_text=bir_text)

        actions = bill_doc.xpath(
            '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
        action_date = None
        for action in actions:
            # If actions occur on the same day, only one date will exist
            if (action.xpath('td[1]/font/text()')[0].encode(
                    'ascii', 'ignore').strip()):
                action_date = datetime.datetime.strptime(
                    action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

            (action_chamber, ) = action.xpath('td[2]/font/text()')

            if action.xpath('td[3]/font/u/text()'):
                (amendment, ) = action.xpath('td[3]/font/u/text()')
            else:
                amendment = None

            (action_text, ) = action.xpath('td[4]/font/text()')
            action_type = _categorize_action(action_text)

            # check for occasional extra last row
            if not action_chamber.strip():
                continue

            # The committee cell is just an abbreviation, so get its name
            actor = self.CHAMBERS[action_chamber]
            try:
                action_committee = re.search(
                    r'.*? referred to the .*? committee on (.*?)$',
                    action_text).group(1).strip()
            except AttributeError:
                action_committee = ''

            act = bill.add_action(
                action_text,
                TIMEZONE.localize(action_date),
                chamber=actor,
                classification=action_type,
            )
            if action_committee:
                act.add_related_entity(action_committee,
                                       entity_type='organization')

            try:
                vote_button = action.xpath('td[9]//text()')[0].strip()
            except IndexError:
                vote_button = ''

            if vote_button.startswith("Roll "):
                vote_id = vote_button.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=action_chamber,
                    bill_id=bill_id,
                    vote_id=vote_id,
                    vote_date=TIMEZONE.localize(action_date),
                    action_text=action_text)

            if amendment:
                amend_url = (
                    'http://alisondb.legislature.state.al.us/ALISON/'
                    'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.format(
                        self.session, amendment))

                amend_name = 'Amd/Sub {}'.format(amendment)
                bill.add_version_link(
                    amend_name,
                    amend_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

        yield bill
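# _categorize_action, used above, maps Alabama action text to openstates
# classifications. A minimal regex-table sketch; the patterns below are
# illustrative assumptions, not the scraper's full table.
_action_re = (
    ('Introduced', 'introduction'),
    ('(Forwarded|Delivered) to Governor', 'executive-receipt'),
    ('Assigned Act No', 'became-law'),
    ('Read for the first time', 'reading-1'),
    ('Read for the second time', 'reading-2'),
    ('Read for the third time', 'reading-3'),
    ('referred to the .* committee on', 'referral-committee'),
)


def _categorize_action(action):
    for pattern, classification in _action_re:
        if re.search(pattern, action, re.IGNORECASE):
            return classification
    return None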
def scrape(self):
    three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
    for matter in self.matters(three_days_ago):
        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        legistar_web = self.legislation_detail_url(matter_id)
        legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                if body_name != 'City Council':
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    raw_option = vote['VoteValueName'].lower()
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option, vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_version_link(attachment['MatterAttachmentName'],
                                      attachment['MatterAttachmentHyperlink'],
                                      media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        yield bill
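# BILL_TYPES and VOTE_OPTIONS, referenced above, are lookup tables defined at
# module level and on the scraper class respectively (VOTE_OPTIONS is read
# via self). Plausible sketches -- the exact Legistar type names and vote
# strings are assumptions based on common Legistar values, not the scraper's
# actual tables.
BILL_TYPES = {
    'Ordinance': 'ordinance',
    'Resolution': 'resolution',
    'Order': 'order',
    'Appointment': 'appointment',
}

# maps raw Legistar vote strings to openstates vote options; unknown values
# fall through unchanged via .get(raw_option, raw_option) above
VOTE_OPTIONS = {
    'yea': 'yes',
    'aye': 'yes',
    'nay': 'no',
    'absent': 'absent',
    'abstain': 'abstain',
}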