def scrape_bill(self, chamber, session, bill_id, title, url):
    page = self.get(url).json()
    api_id = page["BillId"]

    if re.match(r"^(S|H)B ", bill_id):
        btype = ["bill"]
    elif re.match(r"(S|H)C ", bill_id):
        btype = ["commemoration"]
    elif re.match(r"(S|H)JR ", bill_id):
        btype = ["joint resolution"]
    elif re.match(r"(S|H)CR ", bill_id):
        btype = ["concurrent resolution"]
    else:
        btype = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=btype,
    )
    bill.add_source(f"https://sdlegislature.gov/Session/Bill/{api_id}")
    bill.add_source(url)

    version_rows = page["Documents"]
    assert len(version_rows) > 0
    for version in version_rows:
        date = version["DocumentDate"]
        if date:
            match = re.match(r"\d{4}-\d{2}-\d{2}", date)
            date = datetime.datetime.strptime(match.group(0), "%Y-%m-%d").date()

            html_link = f"https://sdlegislature.gov/Session/Bill/{api_id}/{version['DocumentId']}"
            pdf_link = f"https://mylrc.sdlegislature.gov/api/Documents/{version['DocumentId']}.pdf"

            note = version["BillVersion"]
            bill.add_version_link(
                note, html_link, date=date, media_type="text/html", on_duplicate="ignore"
            )
            bill.add_version_link(
                note, pdf_link, date=date, media_type="application/pdf", on_duplicate="ignore"
            )
        else:
            self.warning("Version listed but no date or documents")

    sponsors = page["BillSponsor"]
    if sponsors:
        for sponsor in sponsors:
            sponsor_type = "person"
            member = sponsor["Member"]
            # first and last name are available, but UniqueName is the old link text
            # could change later?
            bill.add_sponsorship(
                member["UniqueName"],
                classification="primary",
                primary=True,
                entity_type=sponsor_type,
            )
    else:
        sponsor_type = "organization"
        committee_sponsor = re.search(r">(.*)</a>", page["BillCommitteeSponsor"])[1]
        bill.add_sponsorship(
            committee_sponsor,
            classification="primary",
            primary=True,
            entity_type=sponsor_type,
        )

    for keyword in page["Keywords"]:
        bill.add_subject(keyword["Keyword"]["Keyword"])

    actions_url = f"https://sdlegislature.gov/api/Bills/ActionLog/{api_id}"
    yield from self.scrape_action(bill, actions_url, chamber)

    yield bill
def scrape_bill(self, session, chamber, bill_type, url):
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)

    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])

    versions = bill_page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    subs = [s.strip() for s in meta["Report Title"].split(";")]
    if "" in subs:
        subs.remove("")

    b = Bill(
        bill_id,
        session,
        meta["Measure Title"],
        chamber=chamber,
        classification=bill_type,
    )
    if meta["Description"]:
        b.add_abstract(meta["Description"], "description")

    for subject in subs:
        b.add_subject(subject)

    if url:
        b.add_source(url)

    prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
    companion = meta["Companion"].strip()
    if companion:
        b.add_related_bill(
            identifier=companion.replace(u"\xa0", " "),
            legislative_session=prior_session,
            relation_type="companion",
        )

    if bill_page.xpath(
        "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
    ):
        prior = bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        )[-1]
        if "carried over" in prior.lower():
            b.add_related_bill(
                identifier=bill_id.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )

    for sponsor in meta["Introducer(s)"]:
        if "(Introduced by request of another party)" in sponsor:
            sponsor = sponsor.replace(" (Introduced by request of another party)", "")
        b.add_sponsorship(sponsor, "primary", "person", True)

    self.parse_bill_versions_table(b, versions)
    self.parse_testimony(b, bill_page)
    self.parse_cmte_reports(b, bill_page)

    yield from self.parse_bill_actions_table(b, action_table, bill_id, session, url, chamber)
    yield b
def scrape_bill(self, session, history_url):
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    bill.add_source(history_url)
    bill_id_for_url = bill_id.replace(" ", "")
    bill.add_source(
        f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
    )

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    for version in root.iterfind("billtext/docTypes/bill/versions/version"):
        if not version:
            continue
        note = version.find("versionDescription").text
        html_url = version.find("WebHTMLURL").text
        bill.add_version_link(note=note, url=html_url, media_type="text/html")
        pdf_url = version.find("WebPDFURL").text
        bill.add_version_link(note=note, url=pdf_url, media_type="application/pdf")

    for analysis in root.iterfind("billtext/docTypes/analysis/versions/version"):
        if not analysis:
            continue
        description = analysis.find("versionDescription").text
        html_url = analysis.find("WebHTMLURL").text
        bill.add_document_link(
            note="Analysis ({})".format(description),
            url=html_url,
            media_type="text/html",
        )

    for fiscal_note in root.iterfind("billtext/docTypes/fiscalNote/versions/version"):
        if not fiscal_note:
            continue
        description = fiscal_note.find("versionDescription").text
        html_url = fiscal_note.find("WebHTMLURL").text
        bill.add_document_link(
            note="Fiscal Note ({})".format(description),
            url=html_url,
            media_type="text/html",
        )

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type="text/html",
        )

    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(action.findtext("date"), "%m/%d/%Y").date()

        action_number = action.find("actionNumber").text
        actor = {"H": "lower", "S": "upper", "E": "executive"}[action_number[0]]

        desc = action.findtext("description").strip()

        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        atype = _categorize_action(desc)

        act = bill.add_action(
            action.findtext("description"),
            act_date,
            chamber=actor,
            classification=atype,
        )

        if atype and "referral-committee" in atype:
            repls = ["Referred to", "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    for author in root.findtext("authors").split(" | "):
        if author != "":
            bill.add_sponsorship(
                author, classification="primary", entity_type="person", primary=True
            )
    for coauthor in root.findtext("coauthors").split(" | "):
        if coauthor != "":
            bill.add_sponsorship(
                coauthor, classification="cosponsor", entity_type="person", primary=False
            )
    for sponsor in root.findtext("sponsors").split(" | "):
        if sponsor != "":
            bill.add_sponsorship(
                sponsor, classification="primary", entity_type="person", primary=True
            )
    for cosponsor in root.findtext("cosponsors").split(" | "):
        if cosponsor != "":
            bill.add_sponsorship(
                cosponsor, classification="cosponsor", entity_type="person", primary=False
            )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape_bill(self, chamber, session, bill_id):
    # there will be a space in bill_id if we're doing a one-off bill scrape
    # convert HB 102 into H102
    if " " in bill_id:
        bill_id = bill_id[0] + bill_id.split(" ")[-1]

    # if chamber comes in as House/Senate convert to lower/upper
    if chamber == "Senate":
        chamber = "upper"
    elif chamber == "House":
        chamber = "lower"

    bill_detail_url = (
        "http://www.ncleg.net/gascripts/"
        "BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all"
    ) % (session, bill_id)

    # parse the bill data page, finding the latest html text
    data = self.get(bill_detail_url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(bill_detail_url)

    title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
    if "Joint Resolution" in title_div_txt:
        bill_type = "joint resolution"
        bill_id = bill_id[0] + "JR " + bill_id[1:]
    elif "Resolution" in title_div_txt:
        bill_type = "resolution"
        bill_id = bill_id[0] + "R " + bill_id[1:]
    elif "Bill" in title_div_txt:
        bill_type = "bill"
        bill_id = bill_id[0] + "B " + bill_id[1:]

    bill_title = doc.xpath("//main//div[@class='col-12'][1]")[0]
    bill_title = bill_title.text_content().strip()

    # For special cases where bill title is blank, a new title is created using Bill ID
    if not bill_title:
        bill_title = bill_id.replace(" ", "")

    bill = Bill(
        bill_id,
        legislative_session=session,
        title=bill_title,
        chamber=chamber,
        classification=bill_type,
    )
    bill.add_source(bill_detail_url)

    # skip first PDF link (duplicate link to cur version)
    if chamber == "lower":
        link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
    else:
        link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
    for vlink in doc.xpath(link_xpath)[1:]:
        # get the name from the PDF link...
        version_name = vlink.text.replace("\xa0", " ")
        version_url = vlink.attrib["href"]
        media_type = "text/html"
        if version_url.lower().endswith(".pdf"):
            media_type = "application/pdf"
        bill.add_version_link(
            version_name, version_url, media_type=media_type, on_duplicate="ignore"
        )

    # rows with 'adopted' in the text and an amendment link, skip failed amds
    for row in doc.xpath(
        '//div[@class="card-body"]/div[contains(., "Adopted")'
        ' and contains(@class,"row")]//a[@title="Amendment"]'
    ):
        version_url = row.xpath("@href")[0]
        version_name = row.xpath("string(.)").strip()
        bill.add_version_link(
            version_name,
            version_url,
            media_type="application/pdf",
            on_duplicate="ignore",
        )

    # sponsors
    spon_row = doc.xpath('//div[contains(text(), "Sponsors")]/following-sibling::div')[0]
    # first sponsors are primary, until we see (Primary)
    spon_type = "primary"
    spon_lines = spon_row.text_content().replace("\r\n", ";").replace("\n", ";")
    for leg in spon_lines.split(";"):
        name = leg.replace("\xa0", " ").strip()
        if name.startswith("(Primary)") or name.endswith("(Primary)"):
            name = name.replace("(Primary)", "").strip()
            spon_type = "cosponsor"
        if not name:
            continue
        bill.add_sponsorship(
            name,
            classification=spon_type,
            entity_type="person",
            primary=(spon_type == "primary"),
        )

    # keywords
    kw_row = doc.xpath('//div[contains(text(), "Keywords:")]/following-sibling::div')[0]
    for subject in kw_row.text_content().split(", "):
        bill.add_subject(subject)

    # actions
    action_tr_xpath = (
        '//h6[contains(text(), "History")]'
        '/ancestor::div[contains(@class, "gray-card")]'
        '//div[contains(@class, "card-body")]'
        '/div[@class="row"]'
    )
    # skip two header rows
    for row in doc.xpath(action_tr_xpath):
        cols = row.xpath("div")
        act_date = cols[1].text
        actor = cols[3].text or ""
        # if text is blank, try diving in
        action = (cols[5].text or "").strip() or cols[5].text_content().strip()

        if act_date is None:
            search_action_date = action.split()
            for act in search_action_date:
                try:
                    if "/" in act:
                        act_date = dt.datetime.strptime(act, "%m/%d/%Y").strftime(
                            "%Y-%m-%d"
                        )
                except KeyError:
                    raise Exception("No Action Date Provided")
        else:
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y").strftime("%Y-%m-%d")

        if actor == "Senate":
            actor = "upper"
        elif actor == "House":
            actor = "lower"
        else:
            actor = "executive"

        for pattern, atype in self._action_classifiers.items():
            if action.startswith(pattern):
                break
        else:
            atype = None

        if act_date is not None:
            bill.add_action(action, act_date, chamber=actor, classification=atype)

    # TODO: Fix vote scraper
    for row in doc.xpath("//h6[@id='vote-header']"):
        yield from self.scrape_votes(bill, doc)

    # For archived votes
    if session in ["1997", "1999"]:
        yield from self.add_archived_votes(bill, bill_id)

    yield bill
def scrape_bill(self, session, history_url):
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    bill.add_source(history_url)

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version_link(
            note=self.NAME_SLUGS[version[1][-5]],
            url=version[1],
            media_type="text/html",
        )

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document_link(
            note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            media_type="text/html",
        )

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document_link(
            note="Fiscal Note ({})".format(self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            media_type="text/html",
        )

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type="text/html",
        )

    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(action.findtext("date"), "%m/%d/%Y").date()

        action_number = action.find("actionNumber").text
        actor = {"H": "lower", "S": "upper", "E": "executive"}[action_number[0]]

        desc = action.findtext("description").strip()

        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        introduced = False

        if desc == "Amended":
            atype = "amendment-passage"
        elif desc == "Amendment(s) offered":
            atype = "amendment-introduction"
        elif desc == "Amendment amended":
            atype = "amendment-amendment"
        elif desc == "Amendment withdrawn":
            atype = "amendment-withdrawal"
        elif desc == "Passed" or desc == "Adopted":
            atype = "passage"
        elif re.match(r"^Received (by|from) the", desc):
            if "Secretary of the Senate" not in desc:
                atype = "introduction"
            else:
                atype = "filing"
        elif desc.startswith("Sent to the Governor"):
            # But what if it gets lost in the mail?
            atype = "executive-receipt"
        elif desc.startswith("Signed by the Governor"):
            atype = "executive-signature"
        elif desc.startswith("Effective on"):
            atype = "became-law"
        elif desc == "Vetoed by the Governor":
            atype = "executive-veto"
        elif desc == "Read first time":
            atype = ["introduction", "reading-1"]
            introduced = True
        elif desc == "Read & adopted":
            atype = ["passage"]
            if not introduced:
                introduced = True
                atype.append("introduction")
        elif desc == "Passed as amended":
            atype = "passage"
        elif desc.startswith("Referred to") or desc.startswith(
            "Recommended to be sent to "
        ):
            atype = "referral-committee"
        elif desc == "Reported favorably w/o amendment(s)":
            atype = "committee-passage"
        elif desc == "Filed":
            atype = "filing"
        elif desc == "Read 3rd time":
            atype = "reading-3"
        elif desc == "Read 2nd time":
            atype = "reading-2"
        elif desc.startswith("Reported favorably"):
            atype = "committee-passage-favorable"
        else:
            atype = None

        act = bill.add_action(
            action.findtext("description"),
            act_date,
            chamber=actor,
            classification=atype,
        )

        if atype and "referral-committee" in atype:
            repls = ["Referred to", "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    for author in root.findtext("authors").split(" | "):
        if author != "":
            bill.add_sponsorship(
                author, classification="primary", entity_type="person", primary=True
            )
    for coauthor in root.findtext("coauthors").split(" | "):
        if coauthor != "":
            bill.add_sponsorship(
                coauthor, classification="cosponsor", entity_type="person", primary=False
            )
    for sponsor in root.findtext("sponsors").split(" | "):
        if sponsor != "":
            bill.add_sponsorship(
                sponsor, classification="primary", entity_type="person", primary=True
            )
    for cosponsor in root.findtext("cosponsors").split(" | "):
        if cosponsor != "":
            bill.add_sponsorship(
                cosponsor, classification="cosponsor", entity_type="person", primary=False
            )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape_details(self, bill_detail_url, session, chamber, bill_id):
    """
    Create the Bill, add the information obtained from the provided
    bill_detail_url, and then yield the bill object.

    :param bill_detail_url:
    :param session:
    :param chamber:
    :param bill_id:
    :return:
    """
    page = self.get(bill_detail_url).text

    if "INVALID BILL NUMBER" in page:
        self.warning("INVALID BILL %s" % bill_detail_url)
        return

    doc = lxml.html.fromstring(page)
    doc.make_links_absolute(bill_detail_url)

    bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

    bill_type = bill_div.xpath("span/text()")[0]

    if "General Bill" in bill_type:
        bill_type = "bill"
    elif "Concurrent Resolution" in bill_type:
        bill_type = "concurrent resolution"
    elif "Joint Resolution" in bill_type:
        bill_type = "joint resolution"
    elif "Resolution" in bill_type:
        bill_type = "resolution"
    else:
        raise ValueError("unknown bill type: %s" % bill_type)

    # this is fragile, but less fragile than it was
    b = bill_div.xpath('./b[text()="Summary:"]')[0]
    bill_summary = b.getnext().tail.strip()

    bill = Bill(
        bill_id,
        legislative_session=session,  # session name from metadata's `legislative_sessions`
        chamber=chamber,  # 'upper' or 'lower'
        title=bill_summary,
        classification=bill_type,
    )

    subjects = list(self._subjects[bill_id])
    for subject in subjects:
        bill.add_subject(subject)

    # sponsors
    for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
        bill.add_sponsorship(
            name=sponsor,
            classification="primary",
            primary=True,
            entity_type="person",
        )
    for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
        sponsor = sponsor.replace("\xa0", " ").strip()
        bill.add_sponsorship(
            name=sponsor,
            classification="primary",
            primary=True,
            entity_type="organization",
        )

    # find versions
    version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
    version_html = self.get(version_url).text
    version_doc = lxml.html.fromstring(version_html)
    version_doc.make_links_absolute(version_url)
    for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
        # duplicate versions with same date, use first appearance
        bill.add_version_link(
            note=version.text,  # Description of the version from the state;
            # eg, 'As introduced', 'Amended', etc.
            url=version.get("href"),
            on_duplicate="ignore",
            media_type="text/html",  # Still a MIME type
        )

    # actions
    for row in bill_div.xpath("table/tr"):
        date_td, chamber_td, action_td = row.xpath("td")

        date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")

        action_chamber = {"Senate": "upper", "House": "lower", None: "legislature"}[
            chamber_td.text
        ]

        action = action_td.text_content()
        action = action.split("(House Journal")[0]
        action = action.split("(Senate Journal")[0].strip()

        atype = action_type(action)

        bill.add_action(
            description=action,  # Action description, from the state
            date=date.strftime("%Y-%m-%d"),  # `YYYY-MM-DD` format
            chamber=action_chamber,  # 'upper' or 'lower'
            classification=atype,  # Options explained in the next section
        )

    # votes
    vurl = doc.xpath('//a[text()="View Vote History"]/@href')
    if vurl:
        vurl = vurl[0]
        yield from self.scrape_vote_history(bill, vurl)

    bill.add_source(bill_detail_url)

    yield bill
def scrape_bills(self, chamber_to_scrape, session):
    url = "http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml" % session

    bill_dir_page = self.get(url)
    root = lxml.etree.fromstring(bill_dir_page.content)
    for mr in root.xpath("//LASTACTION/MSRGROUP"):
        bill_id = mr.xpath("string(MEASURE)").replace(" ", "")
        if bill_id[0] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        bill_type = {
            "B": "bill",
            "C": "concurrent resolution",
            "R": "resolution",
            "N": "nomination",
        }[bill_id[1]]

        # just skip past bills that are of the wrong chamber
        if chamber != chamber_to_scrape:
            continue

        link = mr.xpath("string(ACTIONLINK)").replace("..", "")
        main_doc = mr.xpath("string(MEASURELINK)").replace("../../../", "")
        main_doc_url = "http://billstatus.ls.state.ms.us/%s" % main_doc
        bill_details_url = "http://billstatus.ls.state.ms.us/%s/pdf%s" % (session, link)
        try:
            details_page = self.get(bill_details_url)
        except scrapelib.HTTPError:
            self.warning("Bill page not loading for {}; skipping".format(bill_id))
            continue

        page = details_page.content
        # Some pages have the (invalid) byte 11 sitting around. Just drop
        # them out. Might as well.

        details_root = lxml.etree.fromstring(page)
        title = details_root.xpath("string(//SHORTTITLE)")
        longtitle = details_root.xpath("string(//LONGTITLE)")

        if title == "":
            self.warning(f"No title yet for {bill_id}, skipping")
            return

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.extras["summary"] = longtitle
        bill.add_source(main_doc_url)

        # sponsors
        main_sponsor = details_root.xpath("string(//P_NAME)").split()
        if main_sponsor:
            main_sponsor = main_sponsor[0]
            main_sponsor_link = details_root.xpath("string(//P_LINK)").replace(" ", "_")
            main_sponsor_url = "http://billstatus.ls.state.ms.us/%s/pdf/%s" % (
                session,
                main_sponsor_link.strip("../"),
            )
            type = "primary"
            bill.add_source(main_sponsor_url)
            bill.add_sponsorship(
                self.clean_voter_name(main_sponsor),
                classification=type,
                entity_type="person",
                primary=True,
            )
        for author in details_root.xpath("//AUTHORS/ADDITIONAL"):
            leg = author.xpath("string(CO_NAME)").replace(" ", "_")
            if leg:
                leg_url = "http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml" % (
                    session,
                    leg,
                )
                type = "cosponsor"
                bill.add_source(leg_url)
                bill.add_sponsorship(
                    self.clean_voter_name(leg),
                    classification=type,
                    entity_type="person",
                    primary=False,
                )

        # Versions
        curr_version = details_root.xpath("string(//CURRENT_OTHER)").replace(
            "../../../../", ""
        )
        if curr_version != "":
            curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
            bill.add_version_link(
                "Current version",
                curr_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            curr_pdf_url = re.sub("html?", "pdf", curr_version_url)
            bill.add_version_link(
                "Current version",
                curr_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        intro_version = details_root.xpath("string(//INTRO_OTHER)").replace(
            "../../../../", ""
        )
        if intro_version != "":
            intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
            bill.add_version_link(
                "As Introduced",
                intro_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            intro_pdf_url = re.sub("html?", "pdf", intro_version_url)
            bill.add_version_link(
                "As Introduced",
                intro_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        comm_version = details_root.xpath("string(//CMTESUB_OTHER)").replace(
            "../../../../", ""
        )
        if comm_version.find("documents") != -1:
            comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
            bill.add_version_link(
                "Committee Substitute",
                comm_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            comm_pdf_url = re.sub("html?", "pdf", comm_version_url)
            bill.add_version_link(
                "Committee Substitute",
                comm_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        passed_version = details_root.xpath("string(//PASSED_OTHER)").replace(
            "../../../../", ""
        )
        if passed_version.find("documents") != -1:
            passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
            title = "As Passed the " + chamber
            bill.add_version_link(
                title,
                passed_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            passed_pdf_url = re.sub("html?", "pdf", passed_version_url)
            bill.add_version_link(
                title,
                passed_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        asg_version = details_root.xpath("string(//ASG_OTHER)").replace("../../../../", "")
        if asg_version.find("documents") != -1:
            asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
            bill.add_version_link(
                "Approved by the Governor",
                asg_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            asg_pdf_url = re.sub("html?", "pdf", asg_version_url)
            bill.add_version_link(
                "Approved by the Governor",
                asg_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        # amendments
        # ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
        for amd in details_root.xpath("//AMENDMENTS/*"):
            if amd.tag == "HAM":
                name = amd.xpath("HAM_DESC[1]/text()")[0]
                name = append_parens(amd, "HAM_DISP", name)
                name = append_parens(amd, "HAM_VDESC", name)
                pdf_url = amd.xpath("string(HAM_PDF)").replace("../", "")
                html_url = amd.xpath("string(HAM_OTHER)").replace("../", "")
            elif amd.tag == "SAM":
                name = amd.xpath("SAM_DESC[1]/text()")[0]
                name = append_parens(amd, "SAM_DISP", name)
                name = append_parens(amd, "SAM_VDESC", name)
                pdf_url = amd.xpath("string(SAM_PDF)").replace("../", "")
                html_url = amd.xpath("string(SAM_OTHER)").replace("../", "")
            elif amd.tag == "AMRPT":
                name = amd.xpath("AMRPT_DESC[1]/text()")[0]
                pdf_url = amd.xpath("string(AMRPT_PDF)").replace("../", "")
                html_url = amd.xpath("string(AMRPT_OTHER)").replace("../", "")

            pdf_url = "http://billstatus.ls.state.ms.us/" + pdf_url
            html_url = "http://billstatus.ls.state.ms.us/" + html_url

            if "adopted" in name.lower() or "amendment report" in name.lower():
                bill.add_version_link(
                    name,
                    pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )
                bill.add_version_link(
                    name, html_url, on_duplicate="ignore", media_type="text/html"
                )

        # avoid duplicate votes
        seen_votes = set()

        # Actions
        for action in details_root.xpath("//HISTORY/ACTION"):
            # action_num = action.xpath('string(ACT_NUMBER)').strip()
            # action_num = int(action_num)
            act_vote = action.xpath("string(ACT_VOTE)").replace("../../../..", "")
            action_desc = action.xpath("string(ACT_DESC)")
            date, action_desc = action_desc.split(" ", 1)
            date = date + "/" + session[0:4]
            date = datetime.strptime(date, "%m/%d/%Y")

            if action_desc.startswith("(H)"):
                actor = "lower"
                action = action_desc[4:]
            elif action_desc.startswith("(S)"):
                actor = "upper"
                action = action_desc[4:]
            else:
                actor = "executive"
                action = action_desc

            if "Veto" in action and actor == "executive":
                version_path = details_root.xpath("string(//VETO_OTHER)")
                version_path = version_path.replace("../../../../", "")
                version_url = "http://billstatus.ls.state.ms.us/" + version_path
                bill.add_document_link("Veto", version_url)

            atype = "other"
            for prefix, prefix_type in self._action_types:
                if action.startswith(prefix):
                    atype = prefix_type
                    break

            bill.add_action(
                action,
                self._tz.localize(date),
                chamber=actor,
                classification=atype if atype != "other" else None,
            )

            # use committee names as scraped subjects
            subjects = details_root.xpath("//H_NAME/text()")
            subjects += details_root.xpath("//S_NAME/text()")

            for subject in subjects:
                if subject not in bill.subject:
                    bill.add_subject(subject)

            if act_vote:
                vote_url = "http://billstatus.ls.state.ms.us%s" % act_vote
                if vote_url not in seen_votes:
                    seen_votes.add(vote_url)
                    yield from self.scrape_votes(vote_url, action, date, actor, bill)

        bill.add_source(bill_details_url)
        yield bill
def scrape_bill(self, session, bill_url):
    page = self.get(bill_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(bill_url)

    try:
        bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
    except IndexError:
        self.logger.warning("Something is wrong with bill page, skipping.")
        return
    secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

    # checking if there is a matching bill
    if secondary_bill_id:
        secondary_bill_id = secondary_bill_id[0].text
        # swap ids if * is in secondary_bill_id
        if "*" in secondary_bill_id:
            bill_id, secondary_bill_id = secondary_bill_id, bill_id
        secondary_bill_id = secondary_bill_id.strip()
        secondary_bill_id = secondary_bill_id.replace("  ", " ")

    bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

    if "B" in bill_id:
        bill_type = "bill"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"

    primary_chamber = "lower" if "H" in bill_id else "upper"
    # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

    title = page.xpath("//span[@id='lblAbstract']")[0].text
    if title is None:
        msg = "%s detail page was missing title info."
        self.logger.warning(msg % bill_id)
        return

    # bill subject
    subject_pos = title.find("-")
    subjects = [s.strip() for s in title[: subject_pos - 1].split(",")]
    subjects = filter(None, subjects)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=primary_chamber,
        title=title,
        classification=bill_type,
    )
    for subject in subjects:
        bill.add_subject(subject)

    if secondary_bill_id:
        bill.add_identifier(secondary_bill_id)

    if page.xpath('//span[@id="lblCompNumber"]/a'):
        companion_id = (
            page.xpath('//span[@id="lblCompNumber"]/a')[0].text_content().strip()
        )
        bill.add_related_bill(
            identifier=companion_id,
            legislative_session=session,
            relation_type="companion",
        )

    bill.add_source(bill_url)

    # Primary Sponsor
    sponsor = (
        page.xpath("//span[@id='lblBillPrimeSponsor']")[0].text_content().split("by")[-1]
    )
    sponsor = sponsor.replace("*", "").strip()
    if sponsor:
        bill.add_sponsorship(
            sponsor, classification="primary", entity_type="person", primary=True
        )

    # bill text
    btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
    bill.add_version_link(
        "Current Version", btext.get("href"), media_type="application/pdf"
    )

    # documents
    summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
    if summary:
        bill.add_document_link("Summary", summary[0].get("href"))
    fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
    if fiscal:
        bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
    amendments = page.xpath('//a[contains(@href, "/Amend/")]')
    for amendment in amendments:
        bill.add_version_link(
            "Amendment " + amendment.text,
            amendment.get("href"),
            media_type="application/pdf",
        )
    # amendment notes in image with alt text describing doc inside <a>
    amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
    for afn in amend_fns:
        bill.add_document_link(
            afn.get("alt"), afn.getparent().get("href"), on_duplicate="ignore"
        )

    # actions
    atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
    actions_from_table(bill, atable)

    # if there is a matching bill
    if secondary_bill_id:
        # secondary sponsor
        secondary_sponsor = (
            page.xpath("//span[@id='lblCompPrimeSponsor']")[0]
            .text_content()
            .split("by")[-1]
        )
        secondary_sponsor = secondary_sponsor.replace("*", "").replace(")", "").strip()
        # Skip blank-name sponsors.
        if secondary_sponsor:
            bill.add_sponsorship(
                secondary_sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        # secondary actions
        if page.xpath("//table[@id='gvCoActionHistory']"):
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

    # votes
    yield from self.scrape_vote_events(bill, page, bill_url)

    bill.actions.sort(key=lambda a: a["date"])

    yield bill
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.
    """
    url = BILL_URL % (session, bill_id.replace(" ", ""))
    bill_page = self.get(url).text
    html = lxml.html.fromstring(bill_page)
    html.make_links_absolute("https://legislature.idaho.gov/legislation/%s/" % session)
    bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
    title = bill_tables[1].text_content().strip()
    bill_type = get_bill_type(bill_id)
    bill = Bill(
        legislative_session=session,
        chamber=chamber,
        identifier=bill_id,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)
    for subject in self._subjects[bill_id.replace(" ", "")]:
        bill.add_subject(subject)

    if short_title and title.lower() != short_title.lower():
        bill.add_title(short_title, "short title")

    # documents
    doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
    for link in doc_links:
        name = link.text_content().strip()
        href = link.get("href")
        if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
            bill.add_version_link(note=name, url=href, media_type="application/pdf")
        else:
            bill.add_document_link(note=name, url=href, media_type="application/pdf")

    def _split(string):
        return re.split(r"\w+[,|AND]\s+", string)

    # sponsors range from a committee to one legislator to a group of legs
    sponsor_lists = bill_tables[0].text_content().split("by")
    if len(sponsor_lists) > 1:
        for sponsors in sponsor_lists[1:]:
            if "COMMITTEE" in sponsors.upper():
                bill.add_sponsorship(
                    name=sponsors.strip(),
                    entity_type="organization",
                    primary=True,
                    classification="primary",
                )
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsorship(
                            classification="primary",
                            name=person,
                            entity_type="person",
                            primary=True,
                        )

    actor = chamber
    last_date = None
    # if a bill has passed a chamber or been 'received from'
    # then the next committee passage is in the opposite chamber
    has_moved_chambers = False
    for row in bill_tables[2]:
        # lots of empty rows
        if len(row) == 1:
            continue
        _, date, action, _ = [x.text_content().strip() for x in row]

        if date:
            last_date = date
        else:
            date = last_date
        date = datetime.datetime.strptime(
            date + "/" + session[0:4], "%m/%d/%Y"
        ).strftime("%Y-%m-%d")

        if action.startswith("House"):
            actor = "lower"
        elif action.startswith("Senate"):
            actor = "upper"

        # votes
        if "AYES" in action or "NAYS" in action:
            yield from self.parse_vote(
                actor, date, row[2], session, bill_id, chamber, url
            )
            # bill.add_vote_event(vote)
        # some td's text is separated by br elements
        if len(row[2]):
            action = "".join(row[2].itertext())
        action = action.replace("\xa0", " ").strip()
        atype = get_action(actor, action)
        if atype and "passage" in atype:
            has_moved_chambers = True
        if atype and "committee-passage" in atype and has_moved_chambers:
            actor = _OTHER_CHAMBERS[actor]
        bill.add_action(action, date, chamber=actor, classification=atype)
        # after voice vote/roll call and some actions the bill is sent
        # 'to House' or 'to Senate'
        if "to House" in action:
            actor = "lower"
        elif "to Senate" in action:
            actor = "upper"
    yield bill
def scrape(self, session=None, chambers=None):
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info("no session, using %s", session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))
    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)
    else:
        chamber_dict = {
            "Senate": "upper",
            "House": "lower",
            "House of Representatives": "lower",
            "house": "lower",
            "senate": "upper",
        }
        # so presumably not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {
            "approved": True,
            "passed": True,
            "adopted": True,
            "true": True,
            "false": False,
            "failed": False,
            True: True,
            False: False,
        }
        action_dict = {
            "ref_ctte_100": "referral-committee",
            "intro_100": "introduction",
            "intro_101": "introduction",
            "pass_300": "passage",
            "intro_110": "reading-1",
            "refer_210": "referral-committee",
            "crpt_301": None,
            "crpt_317": None,
            "concur_606": "passage",
            "pass_301": "passage",
            "refer_220": "referral-committee",
            "intro_102": ["introduction", "passage"],
            "intro_105": ["introduction", "passage"],
            "intro_ref_ctte_100": "referral-committee",
            "refer_209": None,
            "intro_108": ["introduction", "passage"],
            "intro_103": ["introduction", "passage"],
            "msg_reso_503": "passage",
            "intro_107": ["introduction", "passage"],
            "imm_consid_360": "passage",
            "refer_213": None,
            "adopt_reso_100": "passage",
            "adopt_reso_110": "passage",
            "msg_507": "amendment-passage",
            "confer_713": None,
            "concur_603": None,
            "confer_712": None,
            "msg_506": "amendment-failure",
            "receive_message_100": "passage",
            "motion_920": None,
            "concur_611": None,
            "confer_735": None,
            "third_429": None,
            "final_501": None,
            "concur_608": None,
            "infpass_217": "passage",
        }

        base_url = "https://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
        legislators = self.get_legislator_ids(first_page)
        all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

        for row in self.get_bill_rows(session):
            (
                spacer,
                number_link,
                _ga,
                title,
                primary_sponsor,
                status,
                spacer,
            ) = row.xpath("td")

            # S.R.No.1 -> SR1
            bill_id = number_link.text_content().replace("No.", "")
            bill_id = bill_id.replace(".", "").replace(" ", "")
            # put one space back in between type and number
            bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

            title = title.text_content().strip()
            title = re.sub(r"^Title", "", title)

            chamber = "lower" if "H" in bill_id else "upper"
            classification = "bill" if "B" in bill_id else "resolution"

            if not title and session == "134" and bill_id == "HR 35":
                # Exception for HR 35 which is a real bill
                title = "No title provided"
            elif not title:
                self.warning(f"no title for {bill_id}, skipping")
                continue

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=classification,
            )
            bill.add_source(number_link.xpath("a/@href")[0])

            if (session, bill_id) in BAD_BILLS:
                self.logger.warning(f"Skipping details for known bad bill {bill_id}")
                yield bill
                continue

            # get bill from API
            bill_api_url = (
                "https://search-prod.lis.state.oh.us/solarapi/v1/"
                "general_assembly_{}/{}/{}/".format(
                    session,
                    "bills" if "B" in bill_id else "resolutions",
                    bill_id.lower().replace(" ", ""),
                )
            )
            data = self.get(bill_api_url, verify=False).json()
            if len(data["items"]) == 0:
                self.logger.warning(
                    "Data for bill {bill_id} has empty 'items' array,"
                    " cannot process related information".format(
                        bill_id=bill_id.lower().replace(" ", "")
                    )
                )
                yield bill
                continue

            # add title if no short title
            if not bill.title:
                bill.title = data["items"][0]["longtitle"]
            bill.add_title(data["items"][0]["longtitle"], "long title")

            # this stuff is version-specific
            for version in data["items"]:
                version_name = version["version"]
                version_link = base_url + version["pdfDownloadLink"]
                bill.add_version_link(
                    version_name, version_link, media_type="application/pdf"
                )

            # we'll use latest bill_version for everything else
            bill_version = data["items"][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

            try:
                action_doc = self.get(base_url + bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning(
                            "Unknown action {desc} with code {code}."
                            " Add it to the action_dict"
                            ".".format(desc=action_desc, code=action["actioncode"])
                        )
                        action_type = None

                    date = self._tz.localize(
                        datetime.datetime.strptime(action["datetime"], "%Y-%m-%dT%H:%M:%S")
                    )
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(
                        action_desc, date, chamber=actor, classification=action_type
                    )

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

            # votes
            vote_url = base_url + bill_version["votes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning("Vote page not loading; skipping: {}".format(vote_url))
                yield bill
                continue
            votes = vote_doc.json()
            yield from self.process_vote(
                votes,
                vote_url,
                base_url,
                bill,
                legislators,
                chamber_dict,
                vote_results,
            )

            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning("Vote page not loading; skipping: {}".format(vote_url))
                yield bill
                continue
            votes = vote_doc.json()
            yield from self.process_vote(
                votes,
                vote_url,
                base_url,
                bill,
                legislators,
                chamber_dict,
                vote_results,
            )

            if data["items"][0]["effective_date"]:
                effective_date = datetime.datetime.strptime(
                    data["items"][0]["effective_date"], "%Y-%m-%d"
                )
                effective_date = self._tz.localize(effective_date)
                # the OH website adds an action that isn't in the action list JSON.
                # It looks like:
                # Effective 7/6/18
                effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                effective_action = "Effective {}".format(effective_date_oh)
                bill.add_action(
                    effective_action,
                    effective_date,
                    chamber="executive",
                    classification=["became-law"],
                )

            # we have never seen a veto or a disapprove, but they seem important.
            # so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url + bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError(
                        "Whoa, a veto! We've never gotten one before."
                        " Go write some code to deal with it: {}".format(veto_url)
                    )

            if "disapprove" in bill_version:
                disapprove_url = base_url + bill_version["disapprove"][0]["link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError(
                        "Whoa, a disapprove! We've never gotten one before."
                        " Go write some code to deal with it: {}".format(disapprove_url)
                    )

            yield bill
def parse_bill_status_page(self, url, page, list_sponsor, session):
    # list_sponsor passed in to support proposed bills (aka "unintroduced")
    # which have "LC XXXX" bill numbers
    # see 2007 HB 2... weird.
    parsed_url = urllib.parse.urlparse(url)
    parsed_query = dict(urllib.parse.parse_qsl(parsed_url.query))
    if "P_BLTP_BILL_TYP_CD" in parsed_query:
        # normal bill
        bill_id = "{0} {1}".format(
            parsed_query["P_BLTP_BILL_TYP_CD"], parsed_query["P_BILL_NO1"]
        )
    elif "P_BILL_DFT_NO5" in parsed_query:
        # proposed bill ("unintroduced")
        bill_id = "{0} {1}".format(
            parsed_query["P_BILL_DFT_NO5"][0:2],
            parsed_query["P_BILL_DFT_NO5"][2:6].lstrip("0"),
        )

    try:
        xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
        title = page.xpath(xp).pop()
    except IndexError:
        title = page.xpath("//tr[1]/td[2]")[0].text_content()

    # Add bill type.
    _bill_id = bill_id.lower()
    if "b" in _bill_id:
        classification = "bill"
    elif "j" in _bill_id or "jr" in _bill_id:
        classification = "joint resolution"
    elif "cr" in _bill_id:
        classification = "concurrent resolution"
    elif "r" in _bill_id:
        classification = "resolution"
    elif "lc" in _bill_id:
        classification = "proposed bill"

    # chamber
    if _bill_id[0] == "h":
        chamber = "lower"
    elif _bill_id[0] == "s":
        chamber = "upper"
    else:
        # fall back to using the sponsor's chamber
        # used for proposed bills aka unintroduced aka LC bills
        if " HD " in list_sponsor:
            chamber = "lower"
        elif " SD " in list_sponsor:
            chamber = "upper"
        else:
            # a true fallback: some sponsors are organizations
            # eg "Economic Affairs Interim Committee"
            chamber = "legislature"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=classification,
    )

    self.add_actions(bill, page)
    votes = self.add_votes(bill, page, url)

    tabledata = self._get_tabledata(page)

    # Add sponsor info.
    if "primary sponsor:" in tabledata and tabledata["primary sponsor:"][0]:
        bill.add_sponsorship(
            tabledata["primary sponsor:"][0],
            classification="primary",
            entity_type="person",
            primary=True,
        )
    elif "(" in list_sponsor:
        # use sponsor data from the bill listing, if it contains a party designation eg (R)
        # used for proposed bills aka unintroduced aka LC bills
        # grab everything before " (R) SD 30" in "John Esp (R) SD 30"
        sponsor_name_raw = re.search(r"(.+) \(", list_sponsor)[1]
        # eliminate extra whitespace in middle of name parts
        sponsor_name_raw = " ".join(sponsor_name_raw.split())
        if sponsor_name_raw:
            bill.add_sponsorship(
                sponsor_name_raw,
                classification="primary",
                entity_type="person",
                primary=True,
            )
    elif "lc" in _bill_id:
        # probably the sponsor is an organization eg a committee,
        # because LC bills can be sponsored by orgs,
        # so just use the sponsor as listed from the index page
        if list_sponsor:
            bill.add_sponsorship(
                list_sponsor,
                classification="primary",
                entity_type="organization",
                primary=True,
            )

    # Various plus fields MT provides.
    plus_fields = [
        "requester",
        ("chapter number:", "chapter"),
        "transmittal date:",
        "drafter",
        "fiscal note probable:",
        "bill draft number:",
        "preintroduction required:",
        "by request of",
        "category:",
    ]
    for x in plus_fields:
        if isinstance(x, tuple):
            _key, key = x
        else:
            _key = key = x
        key = key.replace(" ", "_")
        try:
            val = tabledata[_key]
        except KeyError:
            continue
        if len(val) == 1:
            val = val[0]
        bill.extras[key] = val

    # Add bill subjects.
    xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
    subjects = []
    for tr in page.xpath(xp):
        try:
            subj = tr.xpath("td")[0].text_content()
        except IndexError:
            continue
        subjects.append(subj)

    for s in subjects:
        bill.add_subject(s)

    self.add_fiscal_notes(page, bill)

    return bill, list(votes)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """
    Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    chamber = "lower" if chamber.lower() == "house" else chamber
    chamber = "upper" if chamber.lower() == "senate" else chamber

    # Get html and parse
    doc = self.lxmlize(bill_detail_url)

    # Check if bill hasn't been transmitted to the other chamber yet
    transmit_check = self.get_node(
        doc, '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
    )
    if (
        transmit_check is not None
        and "has not been transmitted" in transmit_check.strip()
    ):
        self.logger.debug(
            "Bill has not been transmitted to other chamber "
            "... skipping {0}".format(bill_detail_url)
        )
        return

    # Get the basic parts of the bill
    bill_id = self.get_node(
        doc, '//h1[contains(@class,"card-title float-left mr-4")]/text()'
    )
    self.logger.debug(bill_id)
    bill_title_text = self.get_node(
        doc, '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()'
    )
    if bill_title_text is not None:
        bill_title = bill_title_text.strip()
    else:
        long_desc_url = self.get_node(
            doc, '//a[text()[contains(.,"Long Description")]]/@href'
        )
        long_desc_page = self.lxmlize(long_desc_url)
        long_desc_text = self.get_node(
            long_desc_page, "//h1/following-sibling::p/text()"
        )
        if long_desc_text is not None:
            bill_title = long_desc_text.strip()
        else:
            bill_title = "No title found."
            self.logger.warning("No title found for {}.".format(bill_id))
    self.logger.debug(bill_title)

    bill_type = {"F": "bill", "R": "resolution", "C": "concurrent resolution"}[
        bill_id[1].upper()
    ]
    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    # Add source
    bill.add_source(bill_detail_url)

    for subject in self._subject_mapping[bill_id]:
        bill.add_subject(subject)

    # Get companion bill.
    companion = doc.xpath(
        '//table[@class="status_info"]//tr[1]/td[2]/a[starts-with(@href, "?")]/text()'
    )
    companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None
    companion_chamber = self.chamber_from_bill(companion)
    if companion is not None:
        bill.add_companion(companion, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add Actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    yield bill
def scrape(self, session=None):
    self._bill_prefix_map = {
        "HB": {"type": "bill", "url_segment": "bills/house"},
        "HR": {"type": "resolution", "url_segment": "resolutions/house/simple"},
        "HCR": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/house/concurrent",
        },
        "HJR": {
            "type": "joint resolution",
            "url_segment": "resolutions/house/joint",
        },
        "HC": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/house/concurrent",
        },
        "HJ": {
            "type": "joint resolution",
            "url_segment": "resolutions/house/joint",
        },
        "SB": {"type": "bill", "url_segment": "bills/senate"},
        "SR": {"type": "resolution", "url_segment": "resolutions/senate/simple"},
        "SCR": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/senate/concurrent",
        },
        "SJR": {
            "type": "joint resolution",
            "url_segment": "resolutions/senate/joint",
        },
        "SC": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/senate/concurrent",
        },
        "SJ": {
            "type": "joint resolution",
            "url_segment": "resolutions/senate/joint",
        },
    }

    api_base_url = "https://api.iga.in.gov"

    # ah, indiana. it's really, really hard to find
    # pdfs in their web interface. Super easy with
    # the api, but a key needs to be passed
    # in the headers. To make these documents
    # viewable to the public and our scrapers,
    # we've put up a proxy service at this link
    # using our api key for pdf document access.

    client = ApiClient(self)
    r = client.get("bills", session=session)
    all_pages = client.unpaginate(r)
    for b in all_pages:
        bill_id = b["billName"]
        disp_bill_id = b["displayName"]

        bill_link = b["link"]
        api_source = api_base_url + bill_link
        try:
            bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
        except scrapelib.HTTPError:
            self.logger.warning("Bill could not be accessed. Skipping.")
            continue

        title = bill_json["description"]
        if title == "NoneNone":
            title = None
        # sometimes description is blank
        # if that's the case, we can check to see if
        # the latest version has a short description
        if not title:
            title = bill_json["latestVersion"]["shortDescription"]
        # and if that doesn't work, use the bill_id but throw a warning
        if not title:
            title = bill_id
            self.logger.warning("Bill is missing a title, using bill id instead.")

        bill_prefix = self._get_bill_id_components(bill_id)[0]

        original_chamber = (
            "lower" if bill_json["originChamber"].lower() == "house" else "upper"
        )
        bill_type = self._bill_prefix_map[bill_prefix]["type"]
        bill = Bill(
            disp_bill_id,
            legislative_session=session,
            chamber=original_chamber,
            title=title,
            classification=bill_type,
        )

        bill.add_source(self._get_bill_url(session, bill_id))
        bill.add_source(api_source)

        # sponsors
        for s in bill_json["authors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="author")
        for s in bill_json["coauthors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="coauthor")
        for s in bill_json["sponsors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="sponsor")
        for s in bill_json["cosponsors"]:
            self._add_sponsor_if_not_blank(bill, s, classification="cosponsor")

        # actions
        action_link = bill_json["actions"]["link"]
        api_source = api_base_url + action_link

        try:
            actions = client.get(
                "bill_actions", session=session, bill_id=bill_id.lower()
            )
        except scrapelib.HTTPError:
            self.logger.warning("Could not find bill actions page")
            actions = {"items": []}

        for a in actions["items"]:
            action_desc = a["description"]
            if "governor" in action_desc.lower():
                action_chamber = "executive"
            elif a["chamber"]["name"].lower() == "house":
                action_chamber = "lower"
            else:
                action_chamber = "upper"
            date = a["date"]

            if not date:
                self.logger.warning("Action has no date, skipping")
                continue

            # convert time to pupa fuzzy time
            date = date.replace("T", " ")
            # TODO: if we update pupa to accept datetimes we can drop this line
            date = date.split()[0]

            action_type = []
            d = action_desc.lower()
            committee = None

            reading = False
            if "first reading" in d:
                action_type.append("reading-1")
                reading = True
            if "second reading" in d or "reread second time" in d:
                action_type.append("reading-2")
                reading = True
            if "third reading" in d or "reread third time" in d:
                action_type.append("reading-3")
                if "passed" in d:
                    action_type.append("passage")
                if "failed" in d:
                    action_type.append("failure")
                reading = True
            if "adopted" in d and reading:
                action_type.append("passage")
            if (
                "referred" in d
                and "committee on" in d
                or "reassigned" in d
                and "committee on" in d
            ):
                committee = d.split("committee on")[-1].strip()
                action_type.append("referral-committee")
            if "committee report" in d:
                if "pass" in d:
                    action_type.append("committee-passage")
                if "fail" in d:
                    action_type.append("committee-failure")
            if "amendment" in d and "without amendment" not in d:
                if "pass" in d or "prevail" in d or "adopted" in d:
                    action_type.append("amendment-passage")
                if "fail" in d or "out of order" in d:
                    action_type.append("amendment-failure")
                if "withdraw" in d:
                    action_type.append("amendment-withdrawal")
            if "signed by the governor" in d:
                action_type.append("executive-signature")
            if "vetoed by the governor" in d:
                action_type.append("executive-veto")

            if len(action_type) == 0:
                # calling it other and moving on with a warning
                self.logger.warning(
                    "Could not recognize an action in '{}'".format(action_desc)
                )
                action_type = None

            a = bill.add_action(
                chamber=action_chamber,
                description=action_desc,
                date=date,
                classification=action_type,
            )
            if committee:
                a.add_related_entity(committee, entity_type="organization")

        # subjects
        subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
        for subject in subjects:
            bill.add_subject(subject)

        # Abstract
        if bill_json["latestVersion"]["digest"]:
            bill.add_abstract(bill_json["latestVersion"]["digest"], note="Digest")

        # put this behind a flag 2021-03-18 (openstates/issues#291)
        if not SCRAPE_WEB_VERSIONS:
            # votes
            yield from self._process_votes(
                bill_json["latestVersion"]["rollcalls"],
                disp_bill_id,
                original_chamber,
                session,
            )
            # versions
            self.deal_with_version(
                bill_json["latestVersion"], bill, bill_id, original_chamber, session
            )
            for version in bill_json["versions"][::-1]:
                self.deal_with_version(
                    version,
                    bill,
                    bill_id,
                    original_chamber,
                    session,
                )
        else:
            self.scrape_web_versions(session, bill, bill_id)

        yield bill
def scrape_bill(self, chamber, session, bill_id, bill_type, url):
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    title = doc.xpath('//span[text()="Title"]')[0].getparent()
    short_title = doc.xpath('//span[text()="Short Title "]')[0].getparent()

    if len(title) > 1 and title[1].text:
        title = title[1].text.strip().strip('"')
    elif len(short_title) > 1 and short_title[1].text:
        self.warning("Falling back to short title on {}".format(url))
        title = short_title[1].text.strip().strip('"')
    else:
        self.warning("skipping bill {}, no Title".format(url))
        return

    bill = Bill(
        bill_id,
        title=title,
        chamber=chamber,
        classification=bill_type,
        legislative_session=session,
    )
    bill.add_source(url)

    # Get sponsors
    spons_str = (
        doc.xpath('//span[contains(text(), "Sponsor(S)")]')[0].getparent()[1].text
    )
    # Checks if there is a Sponsor string before matching
    if spons_str:
        sponsors_match = re.match(r"(SENATOR|REPRESENTATIVE)", spons_str)
        if sponsors_match:
            sponsors = spons_str.split(",")
            sponsor = sponsors[0].strip()

            if sponsor:
                bill.add_sponsorship(
                    sponsors[0].split()[1],
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsorship(
                        sponsor,
                        entity_type="person",
                        classification="cosponsor",
                        primary=False,
                    )
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            if re.match(r" BY REQUEST OF THE GOVERNOR$", spons_str):
                spons_str = re.sub(
                    r" BY REQUEST OF THE GOVERNOR$", "", spons_str
                ).title()
                spons_str = spons_str + " Committee (by request of the governor)"

            if spons_str:
                bill.add_sponsorship(
                    spons_str,
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )

    # Get actions
    self._current_comm = None
    act_rows = doc.xpath("//div[@id='tab6_4']//tr")[1:]
    for row in act_rows:
        date, journal, action = row.xpath("td")
        action = action.text_content().strip()
        raw_chamber = action[0:3]
        journal_entry_number = journal.text_content()
        act_date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y")
        if raw_chamber == "(H)":
            act_chamber = "lower"
        elif raw_chamber == "(S)":
            act_chamber = "upper"

        # Votes
        if re.search(r"Y(\d+)", action):
            vote_href = journal.xpath(".//a/@href")
            if vote_href:
                vote_href = vote_href[0].replace(" ", "")
                yield from self.parse_vote(
                    bill,
                    journal_entry_number,
                    action,
                    act_chamber,
                    act_date,
                    vote_href,
                )

        action, atype = self.clean_action(action)

        match = re.search(r"^Prefile released (\d+/\d+/\d+)$", action)
        if match:
            action = "Prefile released"
            act_date = datetime.datetime.strptime(match.group(1), "%m/%d/%y")

        bill.add_action(
            action,
            chamber=act_chamber,
            date=act_date.strftime("%Y-%m-%d"),
            classification=atype,
        )

    # Get subjects
    for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
        bill.add_subject(subj.strip())

    # Get versions - to do
    text_list_url = (
        f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab1_4"
    )
    bill.add_source(text_list_url)

    text_doc = lxml.html.fromstring(self.get(text_list_url).text)
    text_doc.make_links_absolute(text_list_url)
    for link in text_doc.xpath('//a[contains(@href, "/Text/")]'):
        name = link.text_content()
        text_url = link.get("href")
        bill.add_version_link(name, text_url, media_type="text/html")

    # Get documents - to do
    doc_list_url = (
        f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab5_4"
    )
    doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
    doc_list.make_links_absolute(doc_list_url)
    bill.add_source(doc_list_url)
    seen = set()
    for href in doc_list.xpath('//a[contains(@href, "get_documents")][@onclick]'):
        h_name = href.text_content()
        h_href = href.attrib["href"]
        if h_name.strip() and h_href not in seen:
            bill.add_document_link(h_name, h_href)
            seen.add(h_href)

    yield bill
def scrape_bill(self, chamber, session, bill_id, title, url):
    page = self.lxmlize(url)

    if re.match(r"^(S|H)B ", bill_id):
        btype = ["bill"]
    elif re.match(r"(S|H)C ", bill_id):
        btype = ["commemoration"]
    elif re.match(r"(S|H)JR ", bill_id):
        btype = ["joint resolution"]
    elif re.match(r"(S|H)CR ", bill_id):
        btype = ["concurrent resolution"]
    else:
        btype = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=btype,
    )
    bill.add_source(url)

    version_rows = page.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillVersions"]'
        + "/section/table/tbody/tr"
    )
    assert len(version_rows) > 0
    for row in version_rows:
        (date,) = row.xpath('./td[@data-title="Date"]/text()')
        date = date.strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        (html_note,) = row.xpath('./td[@data-title="HTML"]/a/text()')
        (html_link,) = row.xpath('./td[@data-title="HTML"]/a/@href')
        (pdf_note,) = row.xpath('./td[@data-title="PDF"]/a/text()')
        (pdf_link,) = row.xpath('./td[@data-title="PDF"]/a/@href')
        assert html_note == pdf_note
        note = html_note

        bill.add_version_link(
            note,
            html_link,
            date=date,
            media_type="text/html",
            on_duplicate="ignore",
        )
        bill.add_version_link(
            note,
            pdf_link,
            date=date,
            media_type="application/pdf",
            on_duplicate="ignore",
        )

    sponsor_links = page.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]'
        + '/label[contains(text(), "Sponsors:")]'
        + "/following-sibling::div[1]/p/a"
    )
    for link in sponsor_links:
        if link.attrib["href"].startswith("https://sdlegislature.gov/Legislators/"):
            sponsor_type = "person"
        elif link.attrib["href"].startswith(
            "https://sdlegislature.gov/Legislative_Session/Committees"
        ):
            sponsor_type = "organization"
        else:
            raise ScrapeError(
                "Found unexpected sponsor, URL: " + link.attrib["href"]
            )
        bill.add_sponsorship(
            link.text,
            classification="primary",
            primary=True,
            entity_type=sponsor_type,
        )

    actor = chamber
    use_row = False
    for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
        # Some tables have null rows, that are just `<tr></tr>`
        # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
        if row.text_content() == "":
            self.debug("Skipping action table row that is completely empty")
            continue

        if "Date" in row.text_content() and "Action" in row.text_content():
            use_row = True
            continue
        elif not use_row:
            continue

        action = row.xpath("string(td[2])").strip()

        atypes = []
        if action.startswith("First read"):
            atypes.append("introduction")
            atypes.append("reading-1")

        if re.match(r"Signed by (?:the\s)*Governor", action, re.IGNORECASE):
            atypes.append("executive-signature")
            actor = "executive"

        match = re.match(r"(.*) Do Pass( Amended)?, (Passed|Failed)", action)
        if match:
            if match.group(1) in ["Senate", "House of Representatives"]:
                first = ""
            else:
                first = "committee-"
            if match.group(3).lower() == "passed":
                second = "passage"
            elif match.group(3).lower() == "failed":
                second = "failure"
            atypes.append("%s%s" % (first, second))

        if "referred to" in action.lower():
            atypes.append("referral-committee")

        if "Motion to amend, Passed Amendment" in action:
            atypes.append("amendment-introduction")
            atypes.append("amendment-passage")

        if row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]'):
            amd = row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]')[0]
            version_name = amd.xpath("string(.)")
            version_url = amd.xpath("@href")[0]
            if "htm" in version_url:
                mimetype = "text/html"
            elif "pdf" in version_url:
                mimetype = "application/pdf"
            bill.add_version_link(
                version_name,
                version_url,
                media_type=mimetype,
                on_duplicate="ignore",
            )

        if "Veto override, Passed" in action:
            atypes.append("veto-override-passage")
        elif "Veto override, Failed" in action:
            atypes.append("veto-override-failure")

        if "Delivered to the Governor" in action:
            atypes.append("executive-receipt")

        match = re.match("First read in (Senate|House)", action)
        if match:
            if match.group(1) == "Senate":
                actor = "upper"
            else:
                actor = "lower"

        date = row.xpath("string(td[1])").strip()
        match = re.match(r"\d{2}/\d{2}/\d{4}", date)
        if not match:
            self.warning("Bad date: %s" % date)
            continue
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
            yield from self.scrape_vote(bill, date, link.attrib["href"])

        if action:
            bill.add_action(action, date, chamber=actor, classification=atypes)

    for link in page.xpath("//a[contains(@href, 'Keyword')]"):
        bill.add_subject(link.text.strip())

    yield bill
def parse_bill_status_page(self, url, page, session, chamber):
    # see 2007 HB 2... weird.
    parsed_url = urllib.parse.urlparse(url)
    parsed_query = dict(urllib.parse.parse_qsl(parsed_url.query))
    bill_id = "{0} {1}".format(
        parsed_query["P_BLTP_BILL_TYP_CD"], parsed_query["P_BILL_NO1"]
    )

    try:
        xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
        title = page.xpath(xp).pop()
    except IndexError:
        title = page.xpath("//tr[1]/td[2]")[0].text_content()

    # Add bill type.
    _bill_id = bill_id.lower()
    if "b" in _bill_id:
        classification = "bill"
    elif "j" in _bill_id or "jr" in _bill_id:
        classification = "joint resolution"
    elif "cr" in _bill_id:
        classification = "concurrent resolution"
    elif "r" in _bill_id:
        classification = "resolution"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=classification,
    )

    self.add_actions(bill, page)
    votes = self.add_votes(bill, page, url)

    tabledata = self._get_tabledata(page)

    # Add sponsor info.
    bill.add_sponsorship(
        tabledata["primary sponsor:"][0],
        classification="primary",
        entity_type="person",
        primary=True,
    )

    # Various "plus" fields that MT provides.
    plus_fields = [
        "requester",
        ("chapter number:", "chapter"),
        "transmittal date:",
        "drafter",
        "fiscal note probable:",
        "bill draft number:",
        "preintroduction required:",
        "by request of",
        "category:",
    ]
    for x in plus_fields:
        if isinstance(x, tuple):
            _key, key = x
        else:
            _key = key = x
        key = key.replace(" ", "_")

        try:
            val = tabledata[_key]
        except KeyError:
            continue

        if len(val) == 1:
            val = val[0]

        bill.extras[key] = val

    # Add bill subjects.
    xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
    subjects = []
    for tr in page.xpath(xp):
        try:
            subj = tr.xpath("td")[0].text_content()
        except IndexError:
            continue
        subjects.append(subj)

    for s in subjects:
        bill.add_subject(s)

    self.add_fiscal_notes(page, bill)

    return bill, list(votes)
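# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the scraper above): the bill identifier is
# rebuilt from the LAWS status-page query string and then classified by its
# type code. A slightly simplified, standalone version is shown below; the
# helper name `bill_id_from_url` is an assumption for this example.
import urllib.parse


def bill_id_from_url(url):
    """Return (bill_id, classification) derived from a LAWS status URL."""
    query = dict(urllib.parse.parse_qsl(urllib.parse.urlparse(url).query))
    bill_id = "{0} {1}".format(query["P_BLTP_BILL_TYP_CD"], query["P_BILL_NO1"])
    lowered = bill_id.lower()
    if "b" in lowered:
        classification = "bill"
    elif "j" in lowered:
        classification = "joint resolution"
    elif "cr" in lowered:
        classification = "concurrent resolution"
    else:
        classification = "resolution"
    return bill_id, classification


# Example: a URL whose query string contains
# "P_BLTP_BILL_TYP_CD=HB&P_BILL_NO1=2" yields ("HB 2", "bill").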