def scrape_bill(self, chamber, session, url):
    """Scrape a single MD bill detail page and yield the resulting Bill."""
    doc = lxml.html.fromstring(self.get(url).content)
    doc.make_links_absolute(self.BASE_URL)

    # The bill id lives in the page header, either as a link or as bare text.
    header_link = doc.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()')
    header_text = doc.xpath('//h2[@style="font-size:1.3rem;"]/text()')
    if header_link:
        bill_id = header_link[0].strip()
    elif header_text:
        bill_id = header_text[0].strip()
    else:
        self.warning("No bill id for {}".format(url))
        return

    title = doc.xpath(
        '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
    )[0].strip()

    # Classify from the id; checked in this order so "B"/"J" win over "HS".
    if "B" in bill_id:
        classification = ["bill"]
    elif "J" in bill_id:
        classification = ["joint resolution"]
    elif "HS" in bill_id:
        classification = ["resolution"]
    else:
        raise ValueError("unknown bill type " + bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=classification,
    )
    bill.add_source(url)

    self.scrape_bill_subjects(bill, doc)
    self.scrape_bill_sponsors(bill, doc)
    self.scrape_bill_actions(bill, doc)

    # fiscal note / analysis document, when one is linked
    analysis_links = doc.xpath(
        '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
    )
    if analysis_links:
        fiscal_note = analysis_links[0]
        bill.add_document_link(
            fiscal_note.text_content(),
            fiscal_note.get("href"),
            media_type="application/pdf",
        )

    # effective date, where available; the field may hold several
    # comma-separated dates, eg "July 1, 2020, July 1, 2022"
    if doc.xpath('//div[contains(text(), "Effective Date(s)")]'):
        eff_date = doc.xpath(
            '//div[contains(text(), "Effective Date(s)")]/text()'
        )[0].strip()
        eff_date = eff_date.replace('Effective Date(s):', '').strip()
        bill.extras['date_effective'] = eff_date

    # yield from self.parse_bill_votes_new(doc, bill)
    yield bill
def scrape_bill(self, chamber, session, url):
    """Build a Bill from a Puerto Rico bill detail page and yield it."""
    doc = lxml.html.fromstring(self.get(url).text)

    # search for Titulo; the accent over the 'i' messes up lxml, so
    # the span id is matched on 'tulo'
    title = doc.xpath('//span[@id="ctl00_CPHBody_txtTitulo"]/text()')[0].strip()
    bill_id = doc.xpath('//span[@id="ctl00_CPHBody_txt_Medida"]/text()')[0].strip()

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=self.classify_bill_type(bill_id),
    )

    # the author table is keyed by the session's starting year
    self.scrape_author_table(session[0:4], bill, bill_id)

    # the action table contains votes, hence the yield from
    yield from self.scrape_action_table(chamber, bill, doc, url)

    bill.add_source(url)
    yield bill
def parse_bill(self, chamber, session, special, link):
    """Parse one PA bill from its search-result link element and yield it.

    Fixes over the previous revision:
    - an unrecognized type abbreviation now raises a clear ValueError
      (the regex's trailing empty alternative can match "", which
      previously left ``btype`` unbound and crashed with NameError)
    - source deduplication no longer mutates the list while iterating
      it, which skipped elements and could leave duplicates behind
    """
    bill_num = link.text.strip()
    type_abbr = re.search("type=(B|R|)", link.attrib["href"]).group(1)

    if type_abbr == "B":
        btype = ["bill"]
    elif type_abbr == "R":
        btype = ["resolution"]
    else:
        raise ValueError(
            "unknown bill type in {}".format(link.attrib["href"])
        )

    bill_id = "%s%s %s" % (utils.bill_abbr(chamber), type_abbr, bill_num)

    url = utils.info_url(chamber, session, special, type_abbr, bill_num)
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    xpath = "/".join(
        [
            '//div[contains(@class, "BillInfo-ShortTitle")]',
            'div[@class="BillInfo-Section-Data"]',
        ]
    )
    if page.xpath(xpath):
        title = page.xpath(xpath).pop().text_content().strip()
    else:
        self.warning("Skipping {} {}, No title found".format(bill_id, url))
        return

    bill = Bill(
        bill_id,
        legislative_session=session,
        title=title,
        chamber=chamber,
        classification=btype,
    )
    bill.add_source(url)

    self.parse_bill_versions(bill, page)
    self.parse_history(
        bill,
        chamber,
        utils.history_url(chamber, session, special, type_abbr, bill_num),
    )

    # only fetch votes if votes were seen in history
    # if vote_count:
    yield from self.parse_votes(
        bill, utils.vote_url(chamber, session, special, type_abbr, bill_num)
    )

    # Dedupe sources in place, keeping first occurrences.  Iterate over a
    # snapshot so removals don't skip elements of the live list.
    sources = bill.sources
    seen = []
    for source in list(sources):
        if source in seen:
            sources.remove(source)
        else:
            seen.append(source)

    yield bill
def parse_bill(self, url):
    """Fetch a federal BILLSTATUS XML document and yield the Bill."""
    tree = ET.fromstring(self.get(url).content)

    bill_num = self.get_xpath(tree, "bill/billNumber")
    bill_type = self.get_xpath(tree, "bill/billType")
    bill_id = "{} {}".format(bill_type, bill_num)
    chamber_name = self.get_xpath(tree, "bill/originChamber")
    chamber = self.chambers[chamber_name]
    classification = self.classifications[bill_type]
    session = self.get_xpath(tree, "bill/congress")

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=self.get_xpath(tree, "bill/title"),
        classification=classification,
    )

    self.scrape_actions(bill, tree)
    self.scrape_amendments(bill, tree, session, chamber, bill_id)
    self.scrape_cbo(bill, tree)
    self.scrape_committee_reports(bill, tree)
    self.scrape_cosponsors(bill, tree)
    self.scrape_laws(bill, tree)
    self.scrape_related_bills(bill, tree)
    self.scrape_sponsors(bill, tree)
    self.scrape_subjects(bill, tree)
    self.scrape_summaries(bill, tree)
    self.scrape_titles(bill, tree)
    self.scrape_versions(bill, tree)

    # bulk-data source, e.g.
    # https://www.congress.gov/bill/116th-congress/house-bill/1
    bill.add_source(
        "https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/{type}/"
        "BILLSTATUS-{congress}{type}{num}.xml".format(
            congress=session, type=bill_type.lower(), num=bill_num
        )
    )
    bill.add_source(
        "https://congress.gov/bill/{congress}th-congress/{chamber}-{type}/{num}".format(
            congress=session,
            chamber=chamber_name.lower(),
            type=classification.lower(),
            num=bill_num,
        )
    )
    yield bill
def test_invalid_fields_related_item():
    # an unknown key on a related item (a source) must be rejected
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "2020", "Title")
    bill.add_source("http://example.com")
    payload = bill.as_dict()
    payload["sources"][0]["test"] = 3
    with pytest.raises(DataImportError):
        BillImporter("jid").import_data([payload])
def scrape_bill(self, bill_page_url):
    """Scrape one bill page; yields the Bill, or returns False when the
    page is missing a title or bill number."""
    doc = lxml.html.fromstring(self.get(bill_page_url).text)

    titles = doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_SubjectLabel"]/text()'
    )
    if not titles:
        self.warning("Missing bill title {}".format(bill_page_url))
        return False
    title = titles[0]

    # the number is usually a link, but may be plain text
    numbers = doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/a/text()'
    )
    if not numbers:
        numbers = doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/text()'
        )
    if not numbers:
        self.error("Missing bill number {}".format(bill_page_url))
        return False
    bill_no = numbers[0]

    bill = Bill(
        bill_no,
        legislative_session=self.session,
        chamber="legislature",
        title=title,
        classification="bill",
    )
    bill.add_source(bill_page_url)

    self.parse_versions(bill, doc, bill_no)
    self.parse_acts(bill, doc)

    sponsors = doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_SponsorsLabel"]/text()'
    )
    if sponsors:
        self.assign_sponsors(bill, sponsors[0], "primary")

    cosponsors = doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_CoSponsorsLabel"]/text()'
    )
    if cosponsors:
        self.assign_sponsors(bill, cosponsors[0], "cosponsor")

    self.parse_date_actions(bill, doc)
    self.parse_actions(bill, doc)

    yield bill
def scrape_bill(self, chamber, session):
    """Scrape AR bills for one chamber from the pipe-delimited FTP dump."""
    url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
    chamber_codes = {"H": "lower", "S": "upper"}
    type_names = {
        "B": "bill",
        "R": "resolution",
        "JR": "joint resolution",
        "CR": "concurrent resolution",
        "MR": "memorial",
        "CMR": "concurrent memorial",
    }

    rows = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter="|")
    for row in rows:
        if chamber_codes[row[0]] != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])
        bill_type = type_names[re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)]

        # the last column is the session slug; skip other sessions' rows
        if row[-1] != self.slug:
            continue

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=row[3],
            classification=bill_type,
        )
        bill.add_source(url)

        # primary sponsor is in column 11, with column 12 as a fallback
        primary = row[11] or row[12]
        if primary:
            bill.add_sponsorship(
                primary,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        version_url = (
            "ftp://www.arkleg.state.ar.us/Bills/"
            "%s/Public/Searchable/%s.pdf" % (self.slug, bill_id.replace(" ", ""))
        )
        bill.add_version_link(bill_id, version_url, media_type="application/pdf")

        yield from self.scrape_bill_page(bill)
        self.bills[bill_id] = bill
def get_bill(self, bill_id, **kwargs):
    """Test double: raises ContinueScraping for bill "1", returns a Bill for "2"."""
    if bill_id == "1":
        assert kwargs == {"extra": "param"}
        raise self.ContinueScraping
    assert bill_id == "2"
    assert kwargs == {}
    bill = Bill("1", self.legislative_session, "title")
    bill.add_source("http://example.com")
    return bill
def toy_bill():
    """Return a minimal Bill fixture for the tests."""
    bill = Bill(
        identifier="HB 2017",
        legislative_session="2012A",
        title="A bill for an act to raise the cookie budget by 200%",
        from_organization="Foo Senate",
        classification="bill",
    )
    bill.add_source("http://uri.example.com/", note="foo")
    return bill
def scrape_bill_info(self, session, chambers):
    """Yield CT bills for the requested session/chambers from bill_info.csv."""
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    rows = open_csv(self.get(info_url))

    for record in rows:
        if record["sess_year"] != session:
            continue

        bill_id = record["bill_num"]
        chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
        if chamber not in chambers:
            continue

        # classify by id prefix: SJ/HJ joint, SR/HR simple, otherwise a bill
        if re.match(r"^(S|H)J", bill_id):
            bill_type = "joint resolution"
        elif re.match(r"^(S|H)R", bill_id):
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            title=record["bill_title"],
            classification=bill_type,
            chamber=chamber,
        )
        bill.add_source(info_url)

        for introducer in self._introducers[bill_id]:
            name = string.capwords(
                introducer.decode("utf-8").replace("Rep. ", "").replace("Sen. ", "")
            )
            # drop a trailing "Dist. NN" from the display name
            if "Dist." in name:
                name = " ".join(name.split()[:-2])
            bill.add_sponsorship(
                name=name,
                classification="primary",
                primary=True,
                entity_type="person",
            )

        try:
            for subject in self._subjects[bill_id]:
                bill.subject.append(subject)
            self.bills[bill_id] = [bill, chamber]
            yield from self.scrape_bill_page(bill)
        except SkipBill:
            self.warning("no such bill: " + bill_id)
def _recursively_process_bills(
    self, request_session, chamber, session, first_item=1
):
    """
    Once a search has been initiated, yield a Bill for every Paper from
    the given chamber, then recurse onto the next page of results.
    Recursion ends when a page comes back with no bill links.
    """
    url = "http://legislature.maine.gov/LawMakerWeb/searchresults.asp"
    response = request_session.get(url, params={"StartWith": first_item})
    response.raise_for_status()

    links = lxml.html.fromstring(response.text).xpath("//tr/td/b/a")
    seen = set()
    if links:
        for link in links:
            slug = link.xpath("./@href")[0]
            if slug == "summary.asp?ID=280068396":
                continue
            bill_url = "http://legislature.maine.gov/LawMakerWeb/{}".format(slug)
            bill_id = link.text[:2] + " " + link.text[2:]

            if (
                session in BLACKLISTED_BILL_IDS
                and bill_id in BLACKLISTED_BILL_IDS[session]
            ):
                continue

            # avoid duplicates within this page of results
            if bill_id in seen:
                continue
            seen.add(bill_id)

            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                title="",
                chamber=chamber,
            )
            bill.add_source(bill_url)

            yield from self.scrape_bill(bill, chamber)
            yield bill

        # Make a recursive call to this function, for the next page
        PAGE_SIZE = 25
        yield from self._recursively_process_bills(
            request_session=request_session,
            chamber=chamber,
            session=session,
            first_item=first_item + PAGE_SIZE,
        )
def process_page(self):
    """Build a Bill from a NV bill overview page and yield the text-tab task."""
    chamber = "upper" if self.input.identifier.startswith("S") else "lower"
    short_title = self.get_column_div("Summary").text
    long_title = CSS("#title").match_one(self.root).text

    # carryover bills are marked with stars; rewrite them to "-<session>"
    if "*" in self.input.identifier:
        stars = re.search(r"\*+", self.input.identifier).group()
        if (
            self.input.session in CARRYOVERS
            and stars in CARRYOVERS[self.input.session]
        ):
            self.input.identifier = re.sub(
                r"\*+",
                "-" + CARRYOVERS[self.input.session][stars],
                self.input.identifier,
            )
        else:
            self.logger.error(
                f"Unidentified carryover bill {self.input.identifier}. Update CARRYOVERS dict in bills.py"
            )
            return

    bill = Bill(
        identifier=self.input.identifier,
        legislative_session=self.input.session,
        title=short_title,
        chamber=chamber,
    )
    bill.subject = self.input.subjects
    # use the pretty source URL
    bill.add_source(self.input.source_url)
    bill.add_title(long_title)

    # TODO: figure out cosponsor div name, can't find any as of Feb 2021
    for label, is_primary in (("Primary Sponsor", True), ("Co-Sponsor", False)):
        try:
            column = self.get_column_div(label)
            self.add_sponsors(bill, CSS("a").match(column), primary=is_primary)
        except SelectorError:
            pass

    self.add_actions(bill, chamber)

    bdr = extract_bdr(short_title)
    if bdr:
        bill.extras["BDR"] = bdr

    text_url = self.source.url.replace("Overview", "Text")
    yield BillTabText(bill, source=text_url)
def test_save_object_basics():
    # save_object should dump the bill out as a JSON file
    scraper = Scraper(juris, "/tmp/")
    bill = Bill("HB 1", "2021", "Test")
    bill.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(bill)

    # the object is recorded under the expected filename
    expected = "bill_" + bill._id + ".json"
    assert expected in scraper.output_names["bill"]
    json_dump.assert_called_once_with(bill.as_dict(), mock.ANY, cls=mock.ANY)
def test_whitespace_is_stripped():
    scraper = Scraper(juris, "/tmp/")
    bill = Bill(" HB 11", "2020", " a short title ")
    bill.subject = [" one", "two ", " three "]
    bill.add_source("https://example.com/ ")

    scraper.save_object(bill)

    # simple fields, plus nested lists / objects, are all trimmed
    assert bill.identifier == "HB 11"
    assert bill.title == "a short title"
    assert bill.sources[0]["url"] == "https://example.com/"
    assert bill.subject == ["one", "two", "three"]
def scrape_bill(self, session, session_slug, chamber, url):
    """Scrape one NELIS bill overview page and yield the Bill."""
    overview = lxml.html.fromstring(self.get(url).text)
    bill_no = overview.xpath('//*[@id="item-header"]/text()')[0].strip()
    # state bill id
    internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)

    # bill data gets filled in from another call
    bill_data_url = (
        "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
        "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}"
    ).format(session_slug, internal_id, time.time() * 1000)
    bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

    short_title = self.get_header_field(bill_page, "Summary:").text
    short_title = short_title.replace("\u00a0", " ")

    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber,
    )

    long_title = self.get_header_field(bill_page, "Title:").text
    if long_title is not None:
        bill.add_abstract(long_title, "Summary")

    sponsor_div = self.get_header_field(bill_page, "Primary Sponsor")
    if sponsor_div is not None:
        self.add_sponsors(sponsor_div, bill, "primary")

    cosponsor_div = self.get_header_field(bill_page, "Co-Sponsor")
    if cosponsor_div is not None:
        self.add_sponsors(cosponsor_div, bill, "cosponsor")

    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)

    # dedupe the accumulated subjects
    bill.subject = list(set(self.subject_mapping[bill_no]))

    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras["BDR"] = bdr
    bill.extras["NV_ID"] = internal_id

    bill.add_source(url)
    yield bill
def test_save_related():
    scraper = Scraper(juris, "/tmp/")
    parent = Bill("HB 1", "2021", "Test")
    parent.add_source("http://example.com")
    child = Bill("HB 2", "2021", "Test")
    child.add_source("http://example.com")
    parent._related.append(child)

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(parent)

    # both the object and its related object are dumped, parent first
    assert json_dump.mock_calls == [
        mock.call(parent.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(child.as_dict(), mock.ANY, cls=mock.ANY),
    ]
def handle_page(self):
    """Yield a Bill for each list entry, then follow the "More..." link."""
    for item in self.doc.xpath('//ul[@class="linkSect"]/li'):
        link = item.getchildren()[0]
        bill_id = str(link.text_content())
        if not bill_id.startswith(("S", "H")):
            continue

        # create a bill
        desc = item.xpath("text()")[0].strip()
        chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution",
        }[bill_id[1]]
        bill = Bill(
            bill_id,
            self.kwargs["session"],
            desc,
            chamber=chamber,
            classification=bill_type,
        )

        bill_url = link.get("href")
        sponsor_url = BASE_URL + URL_PATTERNS["sponsors"].format(
            self.kwargs["session_id"], bill_id.replace(" ", "")
        )

        # sponsors first (drained eagerly), then the detail page items
        list(self.scrape_page_items(BillSponsorPage, url=sponsor_url, obj=bill))
        yield from self.scrape_page_items(BillDetailPage, url=bill_url, obj=bill)

        bill.subject = self.kwargs["subjects"][bill_id]
        bill.add_source(bill_url)
        yield bill

    more = self.doc.xpath('//a/b[text()="More..."]/../@href')
    if more:
        yield from self.scrape_page_items(BillListPage, url=more[0], **self.kwargs)
def test_locked_field_subitem():
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "2020", "Title")
    bill.add_source("https://example.com")
    BillImporter("jid").import_data([bill.as_dict()])

    # lock the field
    db_bill = Bill.objects.get()
    db_bill.locked_fields = ["sources"]
    db_bill.save()

    # reimport (without source)
    bill = ScrapeBill("HB 1", "2020", "Title")
    BillImporter("jid").import_data([bill.as_dict()])

    # the locked source survives the source-less reimport
    db_bill = Bill.objects.get()
    assert db_bill.sources.count() == 1
    assert db_bill.locked_fields == ["sources"]
def process_item(self, item):
    """Turn one row of the FL bill list into a Bill wrapped in a BillDetail task."""
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib["href"] + "/ByCategory"

    # map the id prefix to a classification
    prefix_types = [
        (("SB ", "HB ", "SPB ", "HPB "), "bill"),
        (("HR ", "SR "), "resolution"),
        (("HJR ", "SJR "), "joint resolution"),
        (("SCR ", "HCR "), "concurrent resolution"),
        (("SM ", "HM "), "memorial"),
    ]
    for prefixes, type_name in prefix_types:
        if bill_id.startswith(prefixes):
            bill_type = type_name
            break
    else:
        raise ValueError("Failed to identify bill type.")

    bill = Bill(
        bill_id,
        self.input["session"],
        title,
        chamber="lower" if bill_id[0] == "H" else "upper",
        classification=bill_type,
    )
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4
    subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
    bill.subject = list(self.subjects[subj_bill_id])

    # strip title prefixes and normalize ", Jr."/", Sr." suffixes
    sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
    sponsor = re.sub(r",\s+(Jr|Sr)\.", r" \1.", sponsor)
    for name in sponsor.split(", "):
        name = name.strip()
        entity = "organization" if "committee" in name.lower() else "person"
        bill.add_sponsorship(name, "primary", entity, True)

    return BillDetail(bill)
def scrape_bill(self, chamber, session, bill_id, session_id):
    """Scrape one AZ bill via the JSON API; yields the bill and its votes."""
    bill_json_url = (
        "https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&"
        "legislativeBody={}".format(bill_id, session_id, self.chamber_map[chamber])
    )
    response = self.get(bill_json_url, timeout=80)
    page = json.loads(response.content.decode("utf-8"))
    if not page:
        self.warning("null page for %s", bill_id)
        return

    # prefer the API's canonical bill number over the one passed in
    bill_id = page["Number"]
    internal_id = page["BillId"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=page["ShortTitle"],
        classification=self.get_bill_type(bill_id),
    )

    self.scrape_actions(bill, page, chamber)
    self.scrape_versions_and_documents(bill, internal_id)
    self.scrape_sponsors(bill, internal_id)
    self.scrape_subjects(bill, internal_id)
    yield from self.scrape_votes(bill, page)

    bill.add_source(
        "https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}".format(
            internal_id, session_id
        )
    )

    # keep actions in chronological order
    bill.actions = sorted(bill.actions, key=lambda action: action["date"])
    yield bill
def handle_list_item(self, item):
    """Yield a Bill (plus its detail-page items) for one bill list row."""
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib["href"] + "/ByCategory"

    # classify from the id prefix
    id_to_type = [
        (("SB ", "HB ", "SPB ", "HPB "), "bill"),
        (("HR ", "SR "), "resolution"),
        (("HJR ", "SJR "), "joint resolution"),
        (("SCR ", "HCR "), "concurrent resolution"),
        (("SM ", "HM "), "memorial"),
    ]
    for prefixes, kind in id_to_type:
        if bill_id.startswith(prefixes):
            bill_type = kind
            break
    else:
        raise ValueError("Failed to identify bill type.")

    bill = Bill(
        bill_id,
        self.kwargs["session"],
        title,
        chamber="lower" if bill_id[0] == "H" else "upper",
        classification=bill_type,
    )
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4
    subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
    bill.subject = list(self.kwargs["subjects"][subj_bill_id])

    sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
    for name in sponsor.split(", "):
        bill.add_sponsorship(name.strip(), "primary", "person", True)

    yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)
    yield bill
def scrape_prefiles(self, session):
    """Yield a Bill for each row of the IA prefiled-bills tracking table.

    Fixes over the previous revision:
    - ``media_type`` was unbound (NameError) when a background-statement
      URL was neither .docx nor .pdf; it now falls back to None
    - the per-row document URL no longer clobbers the listing ``url``
      variable
    """
    url = 'https://www.legis.iowa.gov/legislation/billTracking/prefiledBills'
    page = lxml.html.fromstring(self.get(url).content)
    page.make_links_absolute(url)

    for row in page.xpath('//table[contains(@class, "sortable")]/tr[td]'):
        title = row.xpath('td[2]/a/text()')[0].strip()
        bill_url = row.xpath('td[2]/a/@href')[0]
        bill_id = self.extract_doc_id(title)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber='legislature',
            title=title,
            classification='proposed bill',
        )

        # optional background statement in the third column
        if row.xpath('td[3]/a'):
            document_url = row.xpath('td[3]/a/@href')[0]
            media_type = None  # unknown extensions get no media type
            if '.docx' in document_url:
                media_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
            elif '.pdf' in document_url:
                media_type = 'application/pdf'
            # NOTE(review): "Backround" is a long-standing typo in the note
            # text; left as-is so existing document notes stay stable.
            bill.add_document_link(
                note="Backround Statement",
                url=document_url,
                media_type=media_type,
            )

        bill.add_version_link(
            note="Prefiled",
            url=bill_url,
            media_type="application/pdf",
        )

        bill.add_source(bill_url)
        yield bill
def scrape_bill(self, session, chamber, bill_url):
    """Scrape one CO bill page; yields the Bill, then its votes."""
    full_url = "{}{}".format(CO_URL_BASE, bill_url)
    try:
        page = self.lxmlize(full_url)
    except scrapelib.HTTPError as e:
        # the site intermittently 503s; skip the bill rather than abort
        if e.response.status_code == 503:
            self.error("Skipping %s w/ 503", bill_url)
            return
        raise

    bill_number = page.xpath(
        '//div[contains(@class,"field-name-field-bill-number")]'
        '//div[contains(@class,"field-item even")][1]/text()'
    )[0].strip()
    bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

    bill_summary = page.xpath(
        'string(//div[contains(@class,"field-name-field-bill-summary")])'
    )
    bill_summary = bill_summary.replace("Read More", "").strip()

    bill = Bill(
        bill_number, legislative_session=session, chamber=chamber, title=bill_title
    )
    if bill_summary:
        bill.add_abstract(bill_summary, "summary")
    bill.add_source(full_url)

    self.scrape_sponsors(bill, page)
    self.scrape_actions(bill, page)
    self.scrape_versions(bill, page)
    self.scrape_research_notes(bill, page)
    self.scrape_fiscal_notes(bill, page)
    self.scrape_committee_report(bill, page)
    self.scrape_amendments(bill, page)

    yield bill
    yield from self.scrape_votes(session, bill, page)
def parse_bill(self, chamber, session, bill_id, url):
    """Parse a single KY bill page and yield the Bill (and its votes).

    Bails out early (yielding nothing) when the page 404s, the bill was
    withdrawn, or no versions remain.
    """
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        # page fetch failed; log and skip this bill
        self.logger.warning(e)
        return

    # a "WITHDRAWN" last action means the bill should be skipped entirely
    if self.parse_bill_field(page, "Last Action") != "":
        last_action = self.parse_bill_field(
            page, "Last Action").xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

    title = self.parse_bill_field(page, "Title").text_content()

    # classify by id substring; "CR"/"JR" must be tested before plain "R"
    if "CR" in bill_id:
        bill_type = "concurrent resolution"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    version_ct = self.parse_versions(page, bill)

    if version_ct < 1:
        # Bill withdrawn
        self.logger.warning("Bill withdrawn.")
        return

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    self.parse_proposed_amendments(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib["href"]
        mimetype = get_media_type(source_url)

        bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

    # each legislator-profile link in the table is a primary sponsor
    for link in page.xpath(
            "//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    # only follow the vote-history link when a Votes header is present
    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath(
            "//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    # Bill Request (BDR) number, stored in extras when available
    bdr_no = self.parse_bill_field(page, "Bill Request Number")
    if bdr_no != "" and bdr_no.xpath("text()"):
        bdr = bdr_no.xpath("text()")[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
def scrape_bill(self, chamber, session, bill_id, title, url):
    """Scrape one SD bill from the JSON API and yield the Bill.

    ``url`` is the API endpoint for the bill; versions, sponsors,
    keywords, and actions all come from the returned JSON document.
    """
    page = self.get(url).json()
    api_id = page["BillId"]

    # classify by id prefix; anything unrecognized defaults to "bill"
    if re.match(r"^(S|H)B ", bill_id):
        btype = ["bill"]
    elif re.match(r"(S|H)C ", bill_id):
        btype = ["commemoration"]
    elif re.match(r"(S|H)JR ", bill_id):
        btype = ["joint resolution"]
    elif re.match(r"(S|H)CR ", bill_id):
        btype = ["concurrent resolution"]
    else:
        btype = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=btype,
    )

    # both the public bill page and the API endpoint are sources
    bill.add_source(f"https://sdlegislature.gov/Session/Bill/{api_id}")
    bill.add_source(url)

    version_rows = page["Documents"]
    assert len(version_rows) > 0
    for version in version_rows:
        date = version["DocumentDate"]
        if date:
            # DocumentDate is an ISO timestamp; keep only the date part
            match = re.match(r"\d{4}-\d{2}-\d{2}", date)
            date = datetime.datetime.strptime(match.group(0), "%Y-%m-%d").date()

            html_link = f"https://sdlegislature.gov/Session/Bill/{api_id}/{version['DocumentId']}"
            pdf_link = f"https://mylrc.sdlegislature.gov/api/Documents/{version['DocumentId']}.pdf"

            note = version["BillVersion"]
            # each version is offered in both HTML and PDF form
            bill.add_version_link(
                note,
                html_link,
                date=date,
                media_type="text/html",
                on_duplicate="ignore",
            )
            bill.add_version_link(
                note,
                pdf_link,
                date=date,
                media_type="application/pdf",
                on_duplicate="ignore",
            )
        else:
            self.warning("Version listed but no date or documents")

    # person sponsors when present, otherwise a committee sponsor parsed
    # out of an HTML anchor in BillCommitteeSponsor
    sponsors = page["BillSponsor"]
    if sponsors:
        for sponsor in sponsors:
            sponsor_type = "person"
            member = sponsor["Member"]
            # first and last name are available, but UniqueName is the old link text
            # could change later? 
            bill.add_sponsorship(
                member["UniqueName"],
                classification="primary",
                primary=True,
                entity_type=sponsor_type,
            )
    else:
        sponsor_type = "organization"
        committee_sponsor = re.search(r">(.*)</a>", page["BillCommitteeSponsor"])[1]
        bill.add_sponsorship(
            committee_sponsor,
            classification="primary",
            primary=True,
            entity_type=sponsor_type,
        )

    for keyword in page["Keywords"]:
        bill.add_subject(keyword["Keyword"]["Keyword"])

    # actions live behind a separate API call and may yield votes
    actions_url = f"https://sdlegislature.gov/api/Bills/ActionLog/{api_id}"
    yield from self.scrape_action(bill, actions_url, chamber)

    yield bill
def scrape_bill(self, session, history_url):
    """Scrape one TX bill from its history XML document and yield the Bill.

    The XML carries the caption, subjects, versions, analyses, fiscal
    notes, actions, and sponsorship lists; witness lists come from
    ``self.witnesses`` collected elsewhere.
    """
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return
    # the "bill" attribute looks like "<session> <id>"; drop the session part
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    # classify from the characters following the chamber letter
    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    bill.add_source(history_url)
    bill_id_for_url = bill_id.replace(" ", "")
    bill.add_source(
        f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
    )

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    # bill text versions, each with an HTML and a PDF rendering
    for version in root.iterfind(
            "billtext/docTypes/bill/versions/version"):
        if not version:
            continue

        note = version.find("versionDescription").text
        html_url = version.find("WebHTMLURL").text
        bill.add_version_link(note=note, url=html_url, media_type="text/html")
        pdf_url = version.find("WebPDFURL").text
        bill.add_version_link(note=note, url=pdf_url, media_type="application/pdf")

    # bill analyses (documents, not versions)
    for analysis in root.iterfind(
            "billtext/docTypes/analysis/versions/version"):
        if not analysis:
            continue

        description = analysis.find("versionDescription").text
        html_url = analysis.find("WebHTMLURL").text
        bill.add_document_link(
            note="Analysis ({})".format(description),
            url=html_url,
            media_type="text/html",
        )

    for fiscal_note in root.iterfind(
            "billtext/docTypes/fiscalNote/versions/version"):
        if not fiscal_note:
            continue

        description = fiscal_note.find("versionDescription").text
        html_url = fiscal_note.find("WebHTMLURL").text
        bill.add_document_link(
            note="Fiscal Note ({})".format(description),
            url=html_url,
            media_type="text/html",
        )

    # witness lists gathered elsewhere, matched to this bill by id
    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(
                self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type="text/html",
        )

    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(action.findtext("date"),
                                              "%m/%d/%Y").date()

        action_number = action.find("actionNumber").text
        # first letter of the action number encodes the actor
        actor = {
            "H": "lower",
            "S": "upper",
            "E": "executive"
        }[action_number[0]]

        desc = action.findtext("description").strip()

        # placeholder hearing entries carry no usable date
        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        atype = _categorize_action(desc)

        act = bill.add_action(
            action.findtext("description"),
            act_date,
            chamber=actor,
            classification=atype,
        )

        if atype and "referral-committee" in atype:
            # recover the committee name by stripping the referral phrasing
            repls = ["Referred to", "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    # sponsorship lists are pipe-delimited strings; empty entries skipped
    for author in root.findtext("authors").split(" | "):
        if author != "":
            bill.add_sponsorship(author,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)
    for coauthor in root.findtext("coauthors").split(" | "):
        if coauthor != "":
            bill.add_sponsorship(
                coauthor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
    for sponsor in root.findtext("sponsors").split(" | "):
        if sponsor != "":
            bill.add_sponsorship(
                sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )
    for cosponsor in root.findtext("cosponsors").split(" | "):
        if cosponsor != "":
            bill.add_sponsorship(
                cosponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape a single HI measure page and yield the Bill."""
    doc = lxml.html.fromstring(self.get(url).text)

    # the bill id is encoded in the query string, e.g. ?billtype=HB&billnumber=1
    params = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = "{}{}".format(params["billtype"], params["billnumber"])

    versions = doc.xpath("//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = doc.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = doc.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    # "Report Title" is a semicolon-separated subject list
    subjects = [s.strip() for s in meta["Report Title"].split(";")]
    if "" in subjects:
        subjects.remove("")

    bill = Bill(
        bill_id,
        session,
        meta["Measure Title"],
        chamber=chamber,
        classification=bill_type,
    )
    if meta["Description"]:
        bill.add_abstract(meta["Description"], "description")
    for subject in subjects:
        bill.add_subject(subject)
    if url:
        bill.add_source(url)

    prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))

    companion = meta["Companion"].strip()
    if companion:
        bill.add_related_bill(
            identifier=companion.replace(u"\xa0", " "),
            legislative_session=prior_session,
            relation_type="companion",
        )

    # a "carried over" note in the status grid also links the prior session
    status_texts = doc.xpath(
        "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
    )
    if status_texts:
        if "carried over" in status_texts[-1].lower():
            bill.add_related_bill(
                identifier=bill_id.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )

    for sponsor in meta["Introducer(s)"]:
        if "(Introduced by request of another party)" in sponsor:
            sponsor = sponsor.replace(
                " (Introduced by request of another party)", "")
        bill.add_sponsorship(sponsor, "primary", "person", True)

    self.parse_bill_versions_table(bill, versions)
    self.parse_testimony(bill, doc)
    self.parse_cmte_reports(bill, doc)

    yield from self.parse_bill_actions_table(
        bill, action_table, bill_id, session, url, chamber)
    yield bill
def scrape(self, session=None):
    """Scrape DC Council legislation, actions, documents, and votes.

    Pulls bulk listings per category from the LIMS API, then fetches
    per-bill details for sponsors, hearing documents, and vote events.

    :param session: legislative session; defaults to the latest known one
    :yields: VoteEvent objects (as encountered) followed by each Bill
    """
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    for category in self._categories:
        leg_listing_url = (
            self._API_BASE_URL + f"BulkData/{category['categoryId']}/{session}"
        )
        resp = requests.post(leg_listing_url, headers=self._headers, verify=False,)
        resp.raise_for_status()
        leg_listing = resp.json()

        for leg in leg_listing:
            bill = Bill(
                leg["legislationNumber"],
                legislative_session=session,
                title=leg["title"],
                classification=category["name"],
            )
            bill.add_source(leg_listing_url)
            bill_url = (
                f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
            )
            bill.add_source(bill_url)

            if leg["lawNumber"]:
                bill.extras["lawNumber"] = leg["lawNumber"]

            # Actions
            for hist in leg["legislationHistory"]:
                hist_date = datetime.datetime.strptime(
                    hist["actionDate"], "%b %d, %Y"
                )
                hist_date = self._TZ.localize(hist_date)
                hist_action = hist["actionDescription"]
                # Drop the leading "Other" prefix from fused descriptions.
                if hist_action.split()[0] in ["OtherAmendment", "OtherMotion"]:
                    hist_action = hist_action[5:]
                hist_class = self.classify_action(hist_action)

                if "mayor" in hist_action.lower():
                    actor = "executive"
                else:
                    actor = "legislature"

                bill.add_action(
                    hist_action, hist_date, classification=hist_class, chamber=actor
                )

                # Documents with download links
                if hist["downloadURL"] and ("download" in hist["downloadURL"]):
                    download = hist["downloadURL"]
                    if not download.startswith("http"):
                        download = "https://lims.dccouncil.us/" + download
                    mimetype = (
                        "application/pdf" if download.endswith("pdf") else None
                    )
                    is_version = False
                    # figure out if it's a version from type/name
                    possible_version_types = [
                        "SignedAct",
                        "Introduction",
                        "Enrollment",
                        "Engrossment",
                    ]
                    for vt in possible_version_types:
                        if vt.lower() in download.lower():
                            is_version = True
                            doc_type = vt
                    if "amendment" in download.lower():
                        doc_type = "Amendment"

                    if is_version:
                        bill.add_version_link(
                            doc_type,
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )
                    else:
                        bill.add_document_link(
                            hist["actionDescription"],
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

            # Grabs Legislation details
            leg_details_url = (
                self._API_BASE_URL
                + f"LegislationDetails/{leg['legislationNumber']}"
            )
            details_resp = requests.get(
                leg_details_url, headers=self._headers, verify=False,
            )
            details_resp.raise_for_status()
            leg_details = details_resp.json()

            # Sponsors
            for i in leg_details["introducers"]:
                name = i["memberName"]
                bill.add_sponsorship(
                    name,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # Co-sponsor
            if leg_details["coSponsors"]:
                for cs in leg_details["coSponsors"]:
                    # BUGFIX: previously read i["memberName"] (the last
                    # introducer) for every co-sponsor.
                    name = cs["memberName"]
                    # NOTE(review): primary=True on a cosponsor looks
                    # inconsistent with sibling scrapers (they use False);
                    # preserved as-is pending confirmation.
                    bill.add_sponsorship(
                        name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=True,
                    )

            # Committee Hearing Doc
            for commHearing in leg_details["committeeHearing"]:
                if commHearing["hearingRecord"]:
                    bill.add_document_link(
                        commHearing["hearingType"],
                        commHearing["hearingRecord"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            for committeeMarkup in leg_details["committeeMarkup"]:
                if committeeMarkup["committeeReport"]:
                    bill.add_document_link(
                        "Committee Markup",
                        committeeMarkup["committeeReport"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            # Actions and Votes
            if leg_details["actions"]:
                # To prevent duplicate votes
                vote_ids = []
                for act in leg_details["actions"]:
                    action_name = act["action"]
                    action_date = datetime.datetime.strptime(
                        act["actionDate"][:10], "%Y-%m-%d"
                    )
                    action_date = self._TZ.localize(action_date)

                    if action_name.split()[0] == "Other":
                        action_name = " ".join(action_name.split()[1:])

                    if "mayor" in action_name.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"

                    # Documents and Versions
                    if act["attachment"]:
                        mimetype = (
                            "application/pdf"
                            if act["attachment"].endswith("pdf")
                            else None
                        )
                        is_version = False
                        # BUGFIX: default the note to the action name so a
                        # non-version, non-amendment attachment no longer
                        # raises NameError (or reuses a stale doc_type from a
                        # previous iteration). Mirrors the history branch,
                        # which labels plain documents with the description.
                        doc_type = action_name
                        # figure out if it's a version from type/name
                        possible_version_types = [
                            "SignedAct",
                            "Introduction",
                            "Enrollment",
                            "Engrossment",
                        ]
                        for vt in possible_version_types:
                            if vt.lower() in act["attachment"].lower():
                                is_version = True
                                doc_type = vt
                        if "amendment" in act["attachment"].lower():
                            doc_type = "Amendment"

                        if is_version:
                            bill.add_version_link(
                                doc_type,
                                act["attachment"],
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )
                        else:
                            bill.add_document_link(
                                doc_type,
                                act["attachment"],
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )

                    # Votes
                    if act["voteDetails"]:
                        result = act["voteDetails"]["voteResult"]
                        if result:
                            status = self._vote_statuses[result.lower()]
                            id_text = (
                                str(leg["legislationNumber"])
                                + "-"
                                + action_name
                                + "-"
                                + result
                            )
                            if id_text not in vote_ids:
                                vote_ids.append(id_text)
                                action_class = self.classify_action(action_name)
                                v = VoteEvent(
                                    identifier=id_text,
                                    chamber=actor,
                                    start_date=action_date,
                                    motion_text=action_name,
                                    result=status,
                                    classification=action_class,
                                    bill=bill,
                                )
                                v.add_source(leg_listing_url)

                                yes_count = (
                                    no_count
                                ) = absent_count = abstain_count = other_count = 0
                                for leg_vote in act["voteDetails"]["votes"]:
                                    mem_name = leg_vote["councilMember"]
                                    if leg_vote["vote"] == "Yes":
                                        yes_count += 1
                                        v.yes(mem_name)
                                    elif leg_vote["vote"] == "No":
                                        no_count += 1
                                        v.no(mem_name)
                                    elif leg_vote["vote"] == "Absent":
                                        absent_count += 1
                                        v.vote("absent", mem_name)
                                    elif leg_vote["vote"] == "Recused":
                                        v.vote("abstain", mem_name)
                                        abstain_count += 1
                                    elif leg_vote["vote"] == "Present":
                                        v.vote("other", mem_name)
                                        other_count += 1
                                    else:
                                        # Incase anything new pops up
                                        other_count += 1
                                        v.vote("other", mem_name)

                                v.set_count("yes", yes_count)
                                v.set_count("no", no_count)
                                v.set_count("absent", absent_count)
                                v.set_count("abstain", abstain_count)
                                v.set_count("other", other_count)
                                yield v

            yield bill
def scrape_bill(self, session, history_url):
    """Scrape one Texas bill from its TLO history XML document.

    :param session: legislative session identifier
    :param history_url: URL of the per-bill ``*.xml`` history file
    :yields: the populated Bill (nothing if the bill does not exist)
    """

    def categorize(desc):
        """Map a raw action description to openstates classification(s).

        Returns a string, a list of strings, or None for unrecognized
        actions. Behavior-identical to the previous inline if/elif chain
        ("Read & adopted" always gained "introduction" because the
        `introduced` flag was always False on that branch).
        """
        if desc == "Amended":
            return "amendment-passage"
        if desc == "Amendment(s) offered":
            return "amendment-introduction"
        if desc == "Amendment amended":
            return "amendment-amendment"
        if desc == "Amendment withdrawn":
            return "amendment-withdrawal"
        if desc == "Passed" or desc == "Adopted":
            return "passage"
        if re.match(r"^Received (by|from) the", desc):
            # Transfers to the Senate secretary are filings, not introductions.
            if "Secretary of the Senate" not in desc:
                return "introduction"
            return "filing"
        if desc.startswith("Sent to the Governor"):
            # But what if it gets lost in the mail?
            return "executive-receipt"
        if desc.startswith("Signed by the Governor"):
            return "executive-signature"
        if desc.startswith("Effective on"):
            return "became-law"
        if desc == "Vetoed by the Governor":
            return "executive-veto"
        if desc == "Read first time":
            return ["introduction", "reading-1"]
        if desc == "Read & adopted":
            return ["passage", "introduction"]
        if desc == "Passed as amended":
            return "passage"
        if desc.startswith("Referred to") or desc.startswith(
                "Recommended to be sent to "):
            return "referral-committee"
        if desc == "Reported favorably w/o amendment(s)":
            return "committee-passage"
        if desc == "Filed":
            return "filing"
        if desc == "Read 3rd time":
            return "reading-3"
        if desc == "Read 2nd time":
            return "reading-2"
        if desc.startswith("Reported favorably"):
            return "committee-passage-favorable"
        return None

    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return

    # e.g. attrib "86R HB 1" -> "HB 1"; first letter selects the chamber.
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])
    chamber = self.CHAMBERS[bill_id[0]]

    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )
    bill.add_source(history_url)

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    # Versions/analyses/fiscal notes/witness lists were collected elsewhere
    # and keyed by bill id; position -5 of the URL encodes the format slug.
    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version_link(
            note=self.NAME_SLUGS[version[1][-5]],
            url=version[1],
            media_type="text/html",
        )

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document_link(
            note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            media_type="text/html",
        )

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document_link(
            note="Fiscal Note ({})".format(
                self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            media_type="text/html",
        )

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(
                self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type="text/html",
        )

    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(action.findtext("date"),
                                              "%m/%d/%Y").date()
        action_number = action.find("actionNumber").text
        # The action number's first letter identifies the acting body.
        actor = {
            "H": "lower",
            "S": "upper",
            "E": "executive"
        }[action_number[0]]
        desc = action.findtext("description").strip()

        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        atype = categorize(desc)

        # Record the stripped description so the stored text matches the
        # text we classified (previously the raw, unstripped value).
        act = bill.add_action(
            desc,
            act_date,
            chamber=actor,
            classification=atype,
        )

        if atype and "referral-committee" in atype:
            repls = ["Referred to", "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    for author in root.findtext("authors").split(" | "):
        if author != "":
            bill.add_sponsorship(author,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)
    for coauthor in root.findtext("coauthors").split(" | "):
        if coauthor != "":
            bill.add_sponsorship(
                coauthor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
    for sponsor in root.findtext("sponsors").split(" | "):
        if sponsor != "":
            bill.add_sponsorship(
                sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )
    for cosponsor in root.findtext("cosponsors").split(" | "):
        if cosponsor != "":
            bill.add_sponsorship(
                cosponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape(self, session=None):
    """Scrape Vermont bills, resolutions, actions, votes, and documents.

    Loads listings from the legislature's JSON endpoints, then walks each
    bill's status page for sponsors/versions and its detailed-status and
    roll-call endpoints for actions and votes.

    :param session: legislative session; defaults to the latest known one
    :yields: VoteEvent objects followed by each Bill
    """
    HTML_TAGS_RE = r"<.*?>"

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = "http://legislature.vermont.gov/bill/loadBillsReleased/{}/".format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)["data"] or []

    bills_url = "http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/".format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)["data"] or [])

    resolutions_url = "http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both".format(
        year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)["data"] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}

        # Identify the bill type and chamber from the number prefix.
        # Order matters: longer prefixes (J.R.H.) must be tested before
        # shorter ones (H.).
        if info["BillNumber"].startswith("J.R.H."):
            bill_type = "joint resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("J.R.S."):
            bill_type = "joint resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("H.C.R."):
            bill_type = "concurrent resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S.C.R."):
            bill_type = "concurrent resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("H.R."):
            bill_type = "resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S.R."):
            bill_type = "resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("PR."):
            bill_type = "constitutional amendment"
            if info["Body"] == "H":
                bill_chamber = "lower"
            elif info["Body"] == "S":
                bill_chamber = "upper"
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info["BillNumber"].startswith("H."):
            bill_type = "bill"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S."):
            bill_type = "bill"
            bill_chamber = "upper"
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(
                info["BillNumber"]))

        bill_id_original_format = (info["BillNumber"].replace(".", "").replace(
            " ", ""))
        bill_id = bill_id_original_format
        # put one space back in between type and number
        bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info["Title"],
            classification=bill_type,
        )
        if "resolution" in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = "http://legislature.vermont.gov/bill/status/{0}/{1}".format(
            year_slug, info["BillNumber"])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors; everything after the "Additional Sponsors"
        # separator row is a cosponsor.
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            "following-sibling::dd[1]/ul/li")
        sponsor_type = "primary"
        for sponsor in sponsors:
            if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                sponsor_type = "cosponsor"
                continue

            sponsor_name = (sponsor.xpath("a/text()")[0].replace(
                "Rep.", "").replace("Sen.", "").strip())
            # Skip the bare "Less" expander link the page embeds.
            if sponsor_name and not (sponsor_name[:5] == "Less"
                                     and len(sponsor_name) == 5):
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type="person",
                    primary=(sponsor_type == "primary"),
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            "following-sibling::dd[1]/ul/li/a |"
            '//ul[@class="bill-path"]//a')
        for version in versions:
            if version.xpath("text()"):
                bill.add_version_link(
                    note=version.xpath("text()")[0],
                    url=version.xpath("@href")[0].replace(" ", "%20"),
                    media_type="application/pdf",
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode("utf-8"),
            ).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".format(
                info["BillNumber"]))
            yield bill
            continue

        # Capture actions
        actions_url = "http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}".format(
            year_slug, internal_bill_id)
        actions_json = self.get(actions_url)
        # Checks if page actually has json posted.
        # BUGFIX: default to "" — headers.get() returns None when the
        # Content-Type header is absent, and `"json" in None` raises.
        if "json" in actions_json.headers.get("Content-Type", ""):
            actions = json.loads(actions_json.text)["data"]
            # Checks to see if any data is actually there
            if actions == "":
                continue
        else:
            continue
        # NOTE(review): bills skipped by the two continues above are never
        # yielded, unlike the no-activity path — confirm this is intended.
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            action = {k: v for k, v in action.items() if v is not None}

            if "Signed by Governor" in action["FullStatus"]:
                actor = "executive"
            elif action["ChamberCode"] == "H":
                actor = "lower"
            elif action["ChamberCode"] == "S":
                actor = "upper"
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action["FullStatus"]:
                # assert chambers_passed == set("HS")
                action_type = "executive-signature"
            elif "Vetoed by the Governor" in action["FullStatus"]:
                action_type = "executive-veto"
            elif ("Read first time" in action["FullStatus"]
                  or "Read 1st time" in action["FullStatus"]):
                action_type = "introduction"
            elif "Reported favorably" in action["FullStatus"]:
                action_type = "committee-passage-favorable"
            elif actor == "lower" and any(
                    x.lower().startswith("aspassed")
                    for x in action["keywords"].split(";")):
                action_type = "passage"
                chambers_passed.add("H")
            elif actor == "upper" and any(
                    x.lower().startswith(" aspassed")
                    or x.lower().startswith("aspassed")
                    for x in action["keywords"].split(";")):
                action_type = "passage"
                chambers_passed.add("S")
            else:
                action_type = None

            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.511
            action["StatusDate"] = action["StatusDate"].replace(
                "/0209", "/2019")
            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.754
            if bill_id == "H 754" and session == "2019-2020":
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0202", "/2020")
            # https://legislature.vermont.gov/bill/status/2020/H.942
            if bill_id == "H 942" and session == "2019-2020":
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0200", "/2020")

            action_date = datetime.datetime.strftime(
                datetime.datetime.strptime(action["StatusDate"], "%m/%d/%Y"),
                "%Y-%m-%d",
            )
            # strftime doesn't always pad year value (%Y) (https://bugs.python.org/issue32195)
            # and sometimes this state has typos in year part of the StatusDate value
            # which can cause validation errors, so fix leading zeroes if they are missing
            if action_date.find("-") < 4:
                action_date = ("0" * (4 - action_date.find("-"))) + action_date

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", action["FullStatus"]),
                date=action_date,
                chamber=actor,
                classification=action_type,
            )

        # Capture votes
        votes_url = "http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}".format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)["data"]
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote["VoteHeaderID"]
            roll_call_url = ("http://legislature.vermont.gov/bill/"
                             "loadBillRollCallDetails/{0}/{1}".format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)["data"]

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # Member names arrive as "Name of District".
                (member_name, _district) = member["MemberName"].split(" of ")
                member_name = member_name.strip()

                if member["MemberVote"] == "Yea":
                    roll_call_yea.append(member_name)
                elif member["MemberVote"] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            if ("Passed -- " in vote["FullStatus"]
                    # seems like we've seen both
                    or "Governor overridden" in vote["FullStatus"]
                    or "Governor overriden" in vote["FullStatus"]):
                did_pass = True
            elif ("Failed -- " in vote["FullStatus"]
                  or "Veto of the Governor sustained" in vote["FullStatus"]):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear: " +
                                     vote["FullStatus"])

            # Check vote counts
            yea_count = int(
                re.search(r"Yeas = (\d+)", vote["FullStatus"]).group(1))
            nay_count = int(
                re.search(r"Nays = (\d+)", vote["FullStatus"]).group(1))

            vote_start_date = datetime.datetime.strftime(
                datetime.datetime.strptime(vote["StatusDate"], "%m/%d/%Y"),
                "%Y-%m-%d",
            )
            motion_text = re.sub(HTML_TAGS_RE, "", vote["FullStatus"]).strip()
            vote_identifer = (vote["StatusDate"] + "--" + motion_text + "--" +
                              roll_call_url)
            vote_to_add = VoteEvent(
                identifier=vote_identifer,
                bill=bill,
                chamber=("lower" if vote["ChamberCode"] == "H" else "upper"),
                start_date=vote_start_date,
                motion_text=motion_text,
                result="pass" if did_pass else "fail",
                classification="passage",
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)
            vote_to_add.set_count("yes", yea_count)
            vote_to_add.set_count("no", nay_count)
            vote_to_add.set_count("not voting", len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote("not voting", member)

            yield vote_to_add

        # Witnesses:
        # http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # BUGFIX: these three print URLs previously hard-coded the year 2020;
        # use the session's year_slug so other sessions link correctly.
        witnesses_doc_link_url = "https://legislature.vermont.gov/bill/print/{0}/{1}/witnesses".format(
            year_slug, bill_id_original_format)
        bill.add_document_link(note="Witness List",
                               url=witnesses_doc_link_url,
                               media_type="text/html")

        # Conference committee members:
        # http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        conferees_doc_link_url = "https://legislature.vermont.gov/bill/print/{0}/{1}/conference".format(
            year_slug, bill_id_original_format)
        page = self.lxmlize(conferees_doc_link_url)
        no_data = page.xpath('//div[@class="no-data"]/text()')
        if not no_data:
            bill.add_document_link(
                note="Conference Committee Members",
                url=conferees_doc_link_url,
                media_type="text/html",
            )

        # Committee meetings:
        # http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
        meetings_doc_link_url = "https://legislature.vermont.gov/bill/print/{0}/{1}/meetings".format(
            year_slug, bill_id_original_format)
        bill.add_document_link(
            note="Committee Meetings",
            url=meetings_doc_link_url,
            media_type="text/html",
        )

        yield bill