def scrape_bill(self, chamber, session, url):
    """Scrape one bill detail page and yield a populated Bill.

    Pulls the bill id from the page header, classifies it from markers in
    the id, then delegates subjects/sponsors/actions to sibling helpers
    and attaches the fiscal note and effective date when present.
    """
    page = lxml.html.fromstring(self.get(url).content)
    page.make_links_absolute(self.BASE_URL)

    # The bill id appears in the styled <h2> header, either inside a link
    # or as bare header text. (xpath on a parsed tree is side-effect free,
    # so evaluating both up front is equivalent to the lazy elif chain.)
    header = '//h2[@style="font-size:1.3rem;"]'
    linked_ids = page.xpath(header + "/a[1]/text()")
    bare_ids = page.xpath(header + "/text()")
    if linked_ids:
        bill_id = linked_ids[0].strip()
    elif bare_ids:
        bill_id = bare_ids[0].strip()
    else:
        self.warning("No bill id for {}".format(url))
        return

    title = page.xpath(
        '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
    )[0].strip()

    # Classify from the first marker found in the bill id, in priority order.
    for marker, kind in (
        ("B", "bill"),
        ("J", "joint resolution"),
        ("HS", "resolution"),
    ):
        if marker in bill_id:
            classification = [kind]
            break
    else:
        raise ValueError("unknown bill type " + bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=classification,
    )
    bill.add_source(url)

    self.scrape_bill_subjects(bill, page)
    self.scrape_bill_sponsors(bill, page)
    self.scrape_bill_actions(bill, page)

    # fiscal note
    analysis_links = page.xpath(
        '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
    )
    if analysis_links:
        fiscal_note = analysis_links[0]
        bill.add_document_link(
            fiscal_note.text_content(),
            fiscal_note.get("href"),
            media_type="application/pdf",
        )

    # effective date, where available
    if page.xpath('//div[contains(text(), "Effective Date(s)")]'):
        eff_date = page.xpath(
            '//div[contains(text(), "Effective Date(s)")]/text()'
        )[0].strip()
        eff_date = eff_date.replace("Effective Date(s):", "").strip()
        # this can contain multiple dates, eg "July 1, 2020, July 1, 2022"
        bill.extras["date_effective"] = eff_date

    # yield from self.parse_bill_votes_new(doc, bill)
    yield bill
def scrape_prefiles(self, session):
    """Scrape prefiled bills from the Iowa legislature's tracking page.

    Each table row yields one Bill with a "Prefiled" version link and,
    when present, a background-statement document link.
    """
    url = "https://www.legis.iowa.gov/legislation/billTracking/prefiledBills"
    page = lxml.html.fromstring(self.get(url).content)
    page.make_links_absolute(url)

    for row in page.xpath('//table[contains(@class, "sortable")]/tr[td]'):
        title = row.xpath("td[2]/a/text()")[0].strip()
        # Link target of the title cell: the prefiled bill text itself.
        version_url = row.xpath("td[2]/a/@href")[0]

        bill_id = self.extract_doc_id(title)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber="legislature",
            title=title,
            classification="proposed bill",
        )

        if row.xpath("td[3]/a"):
            document_url = row.xpath("td[3]/a/@href")[0]
            if ".docx" in document_url:
                media_type = (
                    "application/vnd.openxmlformats-officedocument"
                    ".wordprocessingml.document"
                )
            elif ".pdf" in document_url:
                media_type = "application/pdf"
            else:
                # BUGFIX: media_type was unbound (NameError) for any other
                # extension; fall back to an unspecified media type.
                media_type = ""
            bill.add_document_link(
                note="Background Statement",  # BUGFIX: was misspelled "Backround"
                url=document_url,
                media_type=media_type,
            )

        bill.add_version_link(
            note="Prefiled", url=version_url, media_type="application/pdf"
        )
        bill.add_source(version_url)

        yield bill
def scrape_bills(self, chamber_to_scrape, session):
    """Scrape Mississippi bills for one chamber of a session.

    Walks the all-measures XML index, then fetches each bill's detail XML
    for title, sponsors, versions, amendments, actions, subjects and votes.
    Yields Bill objects (and, via scrape_votes, VoteEvents).
    """
    url = (
        "http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml"
        % session
    )
    bill_dir_page = self.get(url)
    root = lxml.etree.fromstring(bill_dir_page.content)

    for mr in root.xpath("//LASTACTION/MSRGROUP"):
        bill_id = mr.xpath("string(MEASURE)").replace(" ", "")
        if bill_id[0] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        # Second character of the id encodes the measure type.
        bill_type = {
            "B": "bill",
            "C": "concurrent resolution",
            "R": "resolution",
            "N": "nomination",
        }[bill_id[1]]

        # just skip past bills that are of the wrong chamber
        if chamber != chamber_to_scrape:
            continue

        link = mr.xpath("string(ACTIONLINK)").replace("..", "")
        main_doc = mr.xpath("string(MEASURELINK)").replace("../../../", "")
        main_doc_url = "http://billstatus.ls.state.ms.us/%s" % main_doc
        bill_details_url = "http://billstatus.ls.state.ms.us/%s/pdf%s" % (
            session,
            link,
        )
        try:
            details_page = self.get(bill_details_url)
        except scrapelib.HTTPError:
            self.warning("Bill page not loading for {}; skipping".format(bill_id))
            continue

        page = details_page.content
        # Some pages have the (invalid) byte 11 sitting around. Just drop
        # them out. Might as well.
        # BUGFIX: the comment promised this but no code did it; lxml rejects
        # documents containing the control byte.
        page = page.replace(b"\x0b", b"")

        details_root = lxml.etree.fromstring(page)
        title = details_root.xpath("string(//SHORTTITLE)")
        longtitle = details_root.xpath("string(//LONGTITLE)")

        if title == "":
            # BUGFIX: was `return`, which silently aborted the whole
            # generator on the first title-less bill; skip just this one.
            self.warning(f"No title yet for {bill_id}, skipping")
            continue

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.extras["summary"] = longtitle
        bill.add_source(main_doc_url)

        # sponsors
        main_sponsor = details_root.xpath("string(//P_NAME)").split()
        if main_sponsor:
            main_sponsor = main_sponsor[0]
            main_sponsor_link = details_root.xpath(
                "string(//P_LINK)").replace(" ", "_")
            main_sponsor_url = ("http://billstatus.ls.state.ms.us/%s/"
                                "pdf/%s") % (
                session,
                main_sponsor_link.strip("../"),
            )
            bill.add_source(main_sponsor_url)
            bill.add_sponsorship(
                self.clean_voter_name(main_sponsor),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        for author in details_root.xpath("//AUTHORS/ADDITIONAL"):
            leg = author.xpath("string(CO_NAME)").replace(" ", "_")
            if leg:
                leg_url = ("http://billstatus.ls.state.ms.us/%s/"
                           "pdf/House_authors/%s.xml") % (session, leg)
                bill.add_source(leg_url)
                bill.add_sponsorship(
                    self.clean_voter_name(leg),
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

        # Versions: each flavor gets an HTML link plus the PDF twin.
        curr_version = details_root.xpath(
            "string(//CURRENT_OTHER)").replace("../../../../", "")
        if curr_version != "":
            curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
            bill.add_version_link(
                "Current version",
                curr_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            curr_pdf_url = re.sub("html?", "pdf", curr_version_url)
            bill.add_version_link(
                "Current version",
                curr_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        intro_version = details_root.xpath(
            "string(//INTRO_OTHER)").replace("../../../../", "")
        if intro_version != "":
            intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
            bill.add_version_link(
                "As Introduced",
                intro_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            intro_pdf_url = re.sub("html?", "pdf", intro_version_url)
            bill.add_version_link(
                "As Introduced",
                intro_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        comm_version = details_root.xpath(
            "string(//CMTESUB_OTHER)").replace("../../../../", "")
        if comm_version.find("documents") != -1:
            comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
            bill.add_version_link(
                "Committee Substitute",
                comm_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            comm_pdf_url = re.sub("html?", "pdf", comm_version_url)
            bill.add_version_link(
                "Committee Substitute",
                comm_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        passed_version = details_root.xpath(
            "string(//PASSED_OTHER)").replace("../../../../", "")
        if passed_version.find("documents") != -1:
            passed_version_url = ("http://billstatus.ls.state.ms.us/" +
                                  passed_version)
            title = "As Passed the " + chamber
            bill.add_version_link(
                title,
                passed_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            passed_pdf_url = re.sub("html?", "pdf", passed_version_url)
            bill.add_version_link(
                title,
                passed_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        asg_version = details_root.xpath("string(//ASG_OTHER)").replace(
            "../../../../", "")
        if asg_version.find("documents") != -1:
            asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
            bill.add_version_link(
                "Approved by the Governor",
                asg_version_url,
                on_duplicate="ignore",
                media_type="text/html",
            )
            asg_pdf_url = re.sub("html?", "pdf", asg_version_url)
            bill.add_version_link(
                "Approved by the Governor",
                asg_pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        # amendments
        # ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
        for amd in details_root.xpath("//AMENDMENTS/*"):
            if amd.tag == "HAM":
                name = amd.xpath("HAM_DESC[1]/text()")[0]
                name = append_parens(amd, "HAM_DISP", name)
                name = append_parens(amd, "HAM_VDESC", name)
                pdf_url = amd.xpath("string(HAM_PDF)").replace("../", "")
                html_url = amd.xpath("string(HAM_OTHER)").replace("../", "")
            elif amd.tag == "SAM":
                name = amd.xpath("SAM_DESC[1]/text()")[0]
                name = append_parens(amd, "SAM_DISP", name)
                name = append_parens(amd, "SAM_VDESC", name)
                pdf_url = amd.xpath("string(SAM_PDF)").replace("../", "")
                html_url = amd.xpath("string(SAM_OTHER)").replace("../", "")
            elif amd.tag == "AMRPT":
                name = amd.xpath("AMRPT_DESC[1]/text()")[0]
                pdf_url = amd.xpath("string(AMRPT_PDF)").replace("../", "")
                html_url = amd.xpath("string(AMRPT_OTHER)").replace("../", "")
            else:
                # BUGFIX: unexpected tags previously reused stale/unbound
                # name/pdf_url/html_url from the prior iteration.
                continue

            pdf_url = "http://billstatus.ls.state.ms.us/" + pdf_url
            html_url = "http://billstatus.ls.state.ms.us/" + html_url

            if "adopted" in name.lower() or "amendment report" in name.lower():
                bill.add_version_link(
                    name,
                    pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )
                bill.add_version_link(name,
                                      html_url,
                                      on_duplicate="ignore",
                                      media_type="text/html")

        # avoid duplicate votes
        seen_votes = set()

        # Actions
        for action in details_root.xpath("//HISTORY/ACTION"):
            # action_num = action.xpath('string(ACT_NUMBER)').strip()
            # action_num = int(action_num)
            act_vote = action.xpath("string(ACT_VOTE)").replace(
                "../../../..", "")
            action_desc = action.xpath("string(ACT_DESC)")
            # Leading token is a mm/dd date; year comes from the session.
            date, action_desc = action_desc.split(" ", 1)
            date = date + "/" + session[0:4]
            date = datetime.strptime(date, "%m/%d/%Y")

            if action_desc.startswith("(H)"):
                actor = "lower"
                action = action_desc[4:]
            elif action_desc.startswith("(S)"):
                actor = "upper"
                action = action_desc[4:]
            else:
                actor = "executive"
                action = action_desc

            if "Veto" in action and actor == "executive":
                version_path = details_root.xpath("string(//VETO_OTHER)")
                version_path = version_path.replace("../../../../", "")
                version_url = "http://billstatus.ls.state.ms.us/" + version_path
                bill.add_document_link("Veto", version_url)

            atype = "other"
            for prefix, prefix_type in self._action_types:
                if action.startswith(prefix):
                    atype = prefix_type
                    break

            bill.add_action(
                action,
                self._tz.localize(date),
                chamber=actor,
                classification=atype if atype != "other" else None,
            )

            # use committee names as scraped subjects
            subjects = details_root.xpath("//H_NAME/text()")
            subjects += details_root.xpath("//S_NAME/text()")

            for subject in subjects:
                if subject not in bill.subject:
                    bill.add_subject(subject)

            if act_vote:
                vote_url = "http://billstatus.ls.state.ms.us%s" % act_vote
                if vote_url not in seen_votes:
                    seen_votes.add(vote_url)
                    yield from self.scrape_votes(vote_url, action, date,
                                                 actor, bill)

        bill.add_source(bill_details_url)
        yield bill
def scrape_bills(self, session, year_abr):
    """Scrape New Jersey bills for a session from the legislature's bulk files.

    Reads the MAINBILL/BILLSPON/BILLWP/BILLHIST/BILLSUBJ CSV dumps plus the
    per-chamber vote zip files, and yields VoteEvents followed by Bills.
    """
    # Main Bill information
    main_bill_csv = self.to_csv("MAINBILL.TXT")

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        if bill_type[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            legislative_session=session,
            classification=self._bill_types[bill_type[1:]],
        )
        if rec["IdenticalBillNumber"].strip():
            bill.add_related_bill(
                rec["IdenticalBillNumber"].split()[0],
                legislative_session=session,
                relation_type="companion",
            )

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_csv = self.to_csv("BILLSPON.TXT")
    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in sponsor database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == "P":
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsorship(
            name,
            classification=sponsor_type,
            entity_type="person",
            primary=sponsor_type == "primary",
        )

    # Documents
    bill_document_csv = self.to_csv("BILLWP.TXT")
    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in document database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        document = document.split("\\")
        # last two path components: subdirectory / filename
        document = document[-2] + "/" + document[-1]

        htm_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
            year_abr, document.replace(".DOC", ".HTM"))
        pdf_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
            year_abr, document.replace(".DOC", ".PDF"))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec["DocType"]]
        except KeyError:
            raise Exception("unknown doctype %s on %s" %
                            (rec["DocType"], bill_id))
        if rec["Comment"]:
            doc_name += " " + rec["Comment"]

        # Clean links.
        if htm_url.endswith("HTMX"):
            htm_url = re.sub("X$", "", htm_url)
        if pdf_url.endswith("PDFX"):
            pdf_url = re.sub("X$", "", pdf_url)

        if rec["DocType"] in self._version_types:
            # BUGFIX: mimetype was unbound (or stale from the previous row)
            # for version files that are neither .htm nor .wpd.
            mimetype = ""
            if htm_url.lower().endswith("htm"):
                mimetype = "text/html"
            elif htm_url.lower().endswith("wpd"):
                mimetype = "application/vnd.wordperfect"
            try:
                bill.add_version_link(doc_name, htm_url, media_type=mimetype)
                bill.add_version_link(doc_name, pdf_url,
                                      media_type="application/pdf")
            except ValueError:
                self.warning(
                    "Couldn't find a document for bill {}".format(bill_id))
        else:
            bill.add_document_link(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        "A%s" % year_abr,
        "A%s" % next_year,
        "S%s" % year_abr,
        "S%s" % next_year,
        "CA%s-%s" % (year_abr, next_year),
        "CS%s-%s" % (year_abr, next_year),
    ]

    # keep votes clean globally, a few votes show up in multiple files
    votes = {}

    for filename in vote_info_list:
        # BUGFIX: the f-string contained a literal "(unknown)" placeholder
        # instead of interpolating the vote file name, so no zip was ever
        # found.
        s_vote_url = f"https://www.njleg.state.nj.us/votes/{filename}.zip"
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.HTTPError:
            self.warning("could not find %s" % s_vote_url)
            continue
        zippedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                vote_file = io.TextIOWrapper(zippedfile.open(vfile, "r"),
                                             encoding="latin-1")
            except KeyError:
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                self.warning("No such file: %s" % (vfile))
                continue

            vdict_file = csv.DictReader(vote_file)

            if filename.startswith("A") or filename.startswith("CA"):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith("C"):
                vote_file_type = "committee"
            else:
                vote_file_type = "chamber"

            for rec in vdict_file:
                if vote_file_type == "chamber":
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]
                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                    vote_parts = (bill_id, chamber, action)
                else:
                    bill_id = "%s%s" % (rec["Bill_Type"], rec["Bill_Number"])
                    leg = rec["Name"]
                    # drop time portion
                    date = rec["Agenda_Date"].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec["BillAction"]]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec["LegislatorVote"][0:1]
                    committee = rec["Committee_House"]
                    vote_parts = (bill_id, chamber, action, committee)

                # guard against votes referencing bills we never built,
                # consistent with the other sections above
                if bill_id not in bill_dict:
                    self.warning("unknown bill %s in vote database" % bill_id)
                    continue

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = "_".join(vote_parts).replace(" ", "_")

                if vote_id not in votes:
                    votes[vote_id] = VoteEvent(
                        start_date=TIMEZONE.localize(date),
                        chamber=chamber,
                        motion_text=action,
                        classification="passage",
                        result=None,
                        bill=bill_dict[bill_id],
                    )
                    votes[vote_id].dedupe_key = vote_id
                if leg_vote == "Y":
                    votes[vote_id].vote("yes", leg)
                elif leg_vote == "N":
                    votes[vote_id].vote("no", leg)
                else:
                    votes[vote_id].vote("other", leg)

        # remove temp file
        os.remove(s_vote_zip)

    # Counts yes/no/other votes and saves overall vote
    for vote in votes.values():
        counts = collections.defaultdict(int)
        for count in vote.votes:
            counts[count["option"]] += 1
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])

        # Veto override.
        if vote.motion_text == "OVERRIDE":
            # Per the NJ leg's glossary, a veto override requires
            # 2/3ds of each chamber. 27 in the senate, 54 in the house.
            # http://www.njleg.state.nj.us/legislativepub/glossary.asp
            if "lower" in vote.bill:
                vote.result = "pass" if counts["yes"] >= 54 else "fail"
            elif "upper" in vote.bill:
                vote.result = "pass" if counts["yes"] >= 27 else "fail"
        else:
            # Regular vote.
            vote.result = "pass" if counts["yes"] > counts["no"] else "fail"

        vote.add_source("http://www.njleg.state.nj.us/downloads.asp")
        yield vote

    # Actions
    bill_action_csv = self.to_csv("BILLHIST.TXT")
    actor_map = {"A": "lower", "G": "executive", "S": "upper"}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in action database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = dateutil.parser.parse(date)
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += " " + comment
        bill.add_action(
            action,
            date=TIMEZONE.localize(date),
            classification=atype,
            chamber=actor,
        )

    # Subjects
    subject_csv = self.to_csv("BILLSUBJ.TXT")
    for rec in subject_csv:
        bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in subject database" % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.subject.append(rec["SubjectKey"])
        else:
            self.warning("invalid bill id in BillSubj: %s" % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.values():
        # add sources
        if not bill.actions and not bill.versions:
            self.warning("probable phony bill detected %s", bill.identifier)
            phony_bill_count += 1
        else:
            bill.add_source("http://www.njleg.state.nj.us/downloads.asp")
            yield bill

    if phony_bill_count:
        self.warning("%s total phony bills detected", phony_bill_count)
def scrape_bill(self, bill_num, session):
    """Scrape one Wyoming bill from the LSO JSON API.

    Yields VoteEvents (via scrape_vote) followed by the Bill itself;
    returns None when the API 404s for this bill number.
    """
    chamber_map = {"House": "lower", "Senate": "upper", "LSO": "executive"}

    # Sample with all keys:
    # https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = (
        "http://wyoleg.gov/LsoService/api/BillInformation/{}/"
        "{}?calendarDate=".format(session, bill_num)
    )

    if self.is_special:
        bill_json_url = (
            "http://wyoleg.gov/LsoService/api/BillInformation/{}/"
            "{}?specialSessionValue=1&calendarDate=".format(session[0:4], bill_num)
        )

    try:
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode("utf-8"))
    except scrapelib.HTTPError:
        return None

    # BUGFIX: bill ids look like "HB0001"/"SF0050"; the first character is a
    # non-empty string and therefore always truthy, so the previous
    # `if bill_json["bill"][0]` test classified EVERY bill as "lower".
    chamber = "lower" if bill_json["bill"][0] == "H" else "upper"

    bill = Bill(
        identifier=bill_json["bill"],
        legislative_session=session,
        title=bill_json["catchTitle"],
        chamber=chamber,
        classification="bill",
    )

    bill.add_title(bill_json["billTitle"])

    source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format(
        session, bill_json["bill"]
    )
    if self.is_special:
        source_url = (
            "http://lso.wyoleg.gov/Legislation/{}/{}?specialSessionValue=1".format(
                session[0:4], bill_json["bill"]
            )
        )
    bill.add_source(source_url)

    for action_json in bill_json["billActions"]:
        utc_action_date = self.parse_local_date(action_json["statusDate"])

        actor = None
        if action_json["location"] and action_json["location"] in chamber_map:
            actor = chamber_map[action_json["location"]]

        action = bill.add_action(
            chamber=actor,
            description=action_json["statusMessage"],
            date=utc_action_date,
            classification=categorize_action(action_json["statusMessage"]),
        )

        action.extras = {"billInformationID": action_json["billInformationID"]}

    if bill_json["introduced"]:
        url = "http://wyoleg.gov/{}".format(bill_json["introduced"])
        bill.add_version_link(
            note="Introduced",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )

    if bill_json["enrolledAct"]:
        url = "http://wyoleg.gov/{}".format(bill_json["enrolledAct"])
        bill.add_version_link(
            note="Enrolled",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )

    if bill_json["fiscalNote"]:
        url = "http://wyoleg.gov/{}".format(bill_json["fiscalNote"])
        bill.add_document_link(
            note="Fiscal Note",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )

    if bill_json["digest"]:
        url = "http://wyoleg.gov/{}".format(bill_json["digest"])
        bill.add_document_link(
            note="Bill Digest",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )

    if bill_json["vetoes"]:
        for veto in bill_json["vetoes"]:
            url = "http://wyoleg.gov/{}".format(veto["vetoLinkPath"])
            bill.add_version_link(
                note=veto["vetoLinkText"],
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

    for amendment in bill_json["amendments"]:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        # TODO: There are no special session amendments yet,
        # but check this url format for specials
        url = "http://wyoleg.gov/{}/Amends/{}.pdf".format(
            session[0:4], amendment["amendmentNumber"]
        )

        if amendment["sponsor"] and amendment["status"]:
            title = "Amendment {} ({}) - {} ({})".format(
                amendment["amendmentNumber"],
                amendment["order"],
                amendment["sponsor"],
                amendment["status"],
            )
        else:
            title = "Amendment {} ({})".format(
                amendment["amendmentNumber"], amendment["order"]
            )

        # add versions of the bill text
        version = bill.add_version_link(
            note=title, url=url, media_type="application/pdf"
        )
        version["extras"] = {
            "amendmentNumber": amendment["amendmentNumber"],
            "sponsor": amendment["sponsor"],
        }

    for sponsor in bill_json["sponsors"]:
        status = "primary" if sponsor["primarySponsor"] else "cosponsor"
        sponsor_type = "person" if sponsor["sponsorTitle"] else "organization"

        bill.add_sponsorship(
            name=sponsor["name"],
            classification=status,
            entity_type=sponsor_type,
            primary=sponsor["primarySponsor"],
        )

    if bill_json["summary"]:
        bill.add_abstract(note="summary", abstract=bill_json["summary"])

    if bill_json["enrolledNumber"]:
        bill.extras["wy_enrolled_number"] = bill_json["enrolledNumber"]

    if bill_json["chapter"]:
        bill.extras["chapter"] = bill_json["chapter"]

    if bill_json["effectiveDate"]:
        eff = datetime.datetime.strptime(bill_json["effectiveDate"], "%m/%d/%Y")
        bill.extras["effective_date"] = eff.strftime("%Y-%m-%d")

    bill.extras["wy_bill_id"] = bill_json["id"]

    for vote_json in bill_json["rollCalls"]:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
def scrape(self, session=None):
    """Scrape DC Council legislation from the LIMS API.

    Walks each configured category's bulk listing, then the per-bill
    detail endpoint for sponsors, hearing documents, actions and votes.
    Yields VoteEvents and Bills.
    """
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    for category in self._categories:
        leg_listing_url = (
            self._API_BASE_URL + f"BulkData/{category['categoryId']}/{session}"
        )
        # NOTE(review): verify=False preserved from original — the LIMS
        # endpoint apparently has certificate issues.
        resp = requests.post(leg_listing_url, headers=self._headers, verify=False)
        resp.raise_for_status()
        leg_listing = resp.json()

        for leg in leg_listing:
            bill = Bill(
                leg["legislationNumber"],
                legislative_session=session,
                title=leg["title"],
                classification=category["name"],
            )
            bill.add_source(leg_listing_url)
            bill_url = (
                f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
            )
            bill.add_source(bill_url)

            if leg["lawNumber"]:
                bill.extras["lawNumber"] = leg["lawNumber"]

            # Actions
            for hist in leg["legislationHistory"]:
                hist_date = datetime.datetime.strptime(
                    hist["actionDate"], "%b %d, %Y"
                )
                hist_date = self._TZ.localize(hist_date)
                hist_action = hist["actionDescription"]
                # strip the 5-char "Other" prefix from combined action names
                if hist_action.split()[0] in ["OtherAmendment", "OtherMotion"]:
                    hist_action = hist_action[5:]
                hist_class = self.classify_action(hist_action)

                if "mayor" in hist_action.lower():
                    actor = "executive"
                else:
                    actor = "legislature"
                bill.add_action(
                    hist_action, hist_date, classification=hist_class, chamber=actor
                )

                # Documents with download links
                if hist["downloadURL"] and ("download" in hist["downloadURL"]):
                    download = hist["downloadURL"]
                    if not download.startswith("http"):
                        download = "https://lims.dccouncil.us/" + download

                    mimetype = (
                        "application/pdf" if download.endswith("pdf") else None
                    )

                    is_version = False
                    # figure out if it's a version from type/name
                    possible_version_types = [
                        "SignedAct",
                        "Introduction",
                        "Enrollment",
                        "Engrossment",
                    ]
                    for vt in possible_version_types:
                        if vt.lower() in download.lower():
                            is_version = True
                            doc_type = vt

                    if "amendment" in download.lower():
                        doc_type = "Amendment"

                    if is_version:
                        bill.add_version_link(
                            doc_type,
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )
                    else:
                        bill.add_document_link(
                            hist["actionDescription"],
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

            # Grabs Legislation details
            leg_details_url = (
                self._API_BASE_URL + f"LegislationDetails/{leg['legislationNumber']}"
            )
            details_resp = requests.get(
                leg_details_url, headers=self._headers, verify=False
            )
            details_resp.raise_for_status()
            leg_details = details_resp.json()

            # Sponsors
            for i in leg_details["introducers"]:
                name = i["memberName"]
                bill.add_sponsorship(
                    name,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # Co-sponsor
            if leg_details["coSponsors"]:
                for cs in leg_details["coSponsors"]:
                    # BUGFIX: previously read `i["memberName"]` (the stale
                    # variable from the introducers loop), attributing every
                    # co-sponsorship to the last introducer; also co-sponsors
                    # were incorrectly flagged primary=True.
                    name = cs["memberName"]
                    bill.add_sponsorship(
                        name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

            # Committee Hearing Doc
            for commHearing in leg_details["committeeHearing"]:
                if commHearing["hearingRecord"]:
                    bill.add_document_link(
                        commHearing["hearingType"],
                        commHearing["hearingRecord"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            for committeeMarkup in leg_details["committeeMarkup"]:
                if committeeMarkup["committeeReport"]:
                    bill.add_document_link(
                        "Committee Markup",
                        committeeMarkup["committeeReport"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            # Actions and Votes
            if leg_details["actions"]:
                # To prevent duplicate votes
                vote_ids = []
                for act in leg_details["actions"]:
                    action_name = act["action"]
                    action_date = datetime.datetime.strptime(
                        act["actionDate"][:10], "%Y-%m-%d"
                    )
                    action_date = self._TZ.localize(action_date)

                    if action_name.split()[0] == "Other":
                        action_name = " ".join(action_name.split()[1:])

                    if "mayor" in action_name.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"

                    # Documents and Versions
                    if act["attachment"]:
                        mimetype = (
                            "application/pdf"
                            if act["attachment"].endswith("pdf")
                            else None
                        )

                        is_version = False
                        # BUGFIX: doc_type could be unbound (NameError) below
                        # when the attachment matched neither a version type
                        # nor "amendment"; fall back to the action name.
                        doc_type = action_name
                        # figure out if it's a version from type/name
                        possible_version_types = [
                            "SignedAct",
                            "Introduction",
                            "Enrollment",
                            "Engrossment",
                        ]
                        for vt in possible_version_types:
                            if vt.lower() in act["attachment"].lower():
                                is_version = True
                                doc_type = vt

                        if "amendment" in act["attachment"].lower():
                            doc_type = "Amendment"

                        if is_version:
                            bill.add_version_link(
                                doc_type,
                                act["attachment"],
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )
                        else:
                            bill.add_document_link(
                                doc_type,
                                act["attachment"],
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )

                    # Votes
                    if act["voteDetails"]:
                        result = act["voteDetails"]["voteResult"]
                        if result:
                            status = self._vote_statuses[result.lower()]
                            id_text = (
                                str(leg["legislationNumber"])
                                + "-"
                                + action_name
                                + "-"
                                + result
                            )
                            if id_text not in vote_ids:
                                vote_ids.append(id_text)
                                action_class = self.classify_action(action_name)
                                v = VoteEvent(
                                    identifier=id_text,
                                    chamber=actor,
                                    start_date=action_date,
                                    motion_text=action_name,
                                    result=status,
                                    classification=action_class,
                                    bill=bill,
                                )
                                v.add_source(leg_listing_url)

                                yes_count = (
                                    no_count
                                ) = absent_count = abstain_count = other_count = 0
                                for leg_vote in act["voteDetails"]["votes"]:
                                    mem_name = leg_vote["councilMember"]
                                    if leg_vote["vote"] == "Yes":
                                        yes_count += 1
                                        v.yes(mem_name)
                                    elif leg_vote["vote"] == "No":
                                        no_count += 1
                                        v.no(mem_name)
                                    elif leg_vote["vote"] == "Absent":
                                        absent_count += 1
                                        v.vote("absent", mem_name)
                                    elif leg_vote["vote"] == "Recused":
                                        v.vote("abstain", mem_name)
                                        abstain_count += 1
                                    elif leg_vote["vote"] == "Present":
                                        v.vote("other", mem_name)
                                        other_count += 1
                                    else:
                                        # Incase anything new pops up
                                        other_count += 1
                                        v.vote("other", mem_name)

                                v.set_count("yes", yes_count)
                                v.set_count("no", no_count)
                                v.set_count("absent", absent_count)
                                v.set_count("abstain", abstain_count)
                                v.set_count("other", other_count)
                                yield v

            yield bill
def scrape(self, session=None):
    """Scrape all Vermont bills and resolutions for a session.

    Pulls bill lists from the legislature's private JSON API, then for each
    bill scrapes its status page for sponsors and versions, and further JSON
    endpoints for actions and roll-call votes.  Yields VoteEvent objects as
    they are built, and each Bill after its data is complete.

    :param session: legislative session identifier; defaults to the latest.
    """
    # Regex used to strip inline HTML markup from action/vote status text.
    HTML_TAGS_RE = r"<.*?>"

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = "http://legislature.vermont.gov/bill/loadBillsReleased/{}/".format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)["data"] or []

    bills_url = "http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/".format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)["data"] or [])

    resolutions_url = "http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both".format(
        year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)["data"] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}

        # Identify the bill type and chamber from the BillNumber prefix.
        # Longer prefixes must be tested before their shorter substrings
        # (e.g. "J.R.H." before "H."), so the order here matters.
        if info["BillNumber"].startswith("J.R.H."):
            bill_type = "joint resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("J.R.S."):
            bill_type = "joint resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("H.C.R."):
            bill_type = "concurrent resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S.C.R."):
            bill_type = "concurrent resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("H.R."):
            bill_type = "resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S.R."):
            bill_type = "resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("PR."):
            bill_type = "constitutional amendment"
            if info["Body"] == "H":
                bill_chamber = "lower"
            elif info["Body"] == "S":
                bill_chamber = "upper"
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info["BillNumber"].startswith("H."):
            bill_type = "bill"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S."):
            bill_type = "bill"
            bill_chamber = "upper"
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(
                info["BillNumber"]))

        # e.g. "H. 42" -> "H42" (used for document print URLs below)
        bill_id_original_format = (info["BillNumber"].replace(".", "").replace(
            " ", ""))
        bill_id = bill_id_original_format
        # put one space back in between type and number
        bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info["Title"],
            classification=bill_type,
        )
        if "resolution" in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = "http://legislature.vermont.gov/bill/status/{0}/{1}".format(
            year_slug, info["BillNumber"])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors.  Everything after an "Additional Sponsors"
        # marker row is treated as a cosponsor.
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            "following-sibling::dd[1]/ul/li")
        sponsor_type = "primary"
        for sponsor in sponsors:
            if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                sponsor_type = "cosponsor"
                continue
            sponsor_name = (sponsor.xpath("a/text()")[0].replace(
                "Rep.", "").replace("Sen.", "").strip())
            # Skip the page's "Less" show/hide toggle pseudo-entry.
            if sponsor_name and not (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type="person",
                    primary=(sponsor_type == "primary"),
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            "following-sibling::dd[1]/ul/li/a |"
            '//ul[@class="bill-path"]//a')
        for version in versions:
            if version.xpath("text()"):
                bill.add_version_link(
                    note=version.xpath("text()")[0],
                    url=version.xpath("@href")[0].replace(" ", "%20"),
                    media_type="application/pdf",
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode("utf-8"),
            ).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".format(
                info["BillNumber"]))
            yield bill
            continue

        # Capture actions
        actions_url = "http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}".format(
            year_slug, internal_bill_id)
        actions_json = self.get(actions_url)
        # Checks if page actually has json posted
        if "json" in actions_json.headers.get("Content-Type"):
            actions = json.loads(actions_json.text)["data"]
            # Checks to see if any data is actually there
            if actions == "":
                continue
        else:
            continue
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            # Drop null-valued keys so "in" / "==" tests below are safe.
            action = {k: v for k, v in action.items() if v is not None}

            if "Signed by Governor" in action["FullStatus"]:
                actor = "executive"
            elif action["ChamberCode"] == "H":
                actor = "lower"
            elif action["ChamberCode"] == "S":
                actor = "upper"
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action["FullStatus"]:
                # assert chambers_passed == set("HS")
                action_type = "executive-signature"
            elif "Vetoed by the Governor" in action["FullStatus"]:
                action_type = "executive-veto"
            elif ("Read first time" in action["FullStatus"]
                  or "Read 1st time" in action["FullStatus"]):
                action_type = "introduction"
            elif "Reported favorably" in action["FullStatus"]:
                action_type = "committee-passage-favorable"
            elif actor == "lower" and any(
                    x.lower().startswith("aspassed")
                    for x in action["keywords"].split(";")):
                action_type = "passage"
                chambers_passed.add("H")
            elif actor == "upper" and any(
                    x.lower().startswith(" aspassed")
                    or x.lower().startswith("aspassed")
                    for x in action["keywords"].split(";")):
                action_type = "passage"
                chambers_passed.add("S")
            else:
                action_type = None

            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.511
            action["StatusDate"] = action["StatusDate"].replace(
                "/0209", "/2019")
            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.754
            if bill_id == "H 754" and session == "2019-2020":
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0202", "/2020")
            # https://legislature.vermont.gov/bill/status/2020/H.942
            if bill_id == "H 942" and session == "2019-2020":
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0200", "/2020")

            action_date = datetime.datetime.strftime(
                datetime.datetime.strptime(action["StatusDate"], "%m/%d/%Y"),
                "%Y-%m-%d",
            )
            # strftime doesn't always pad year value (%Y) (https://bugs.python.org/issue32195)
            # and sometimes this state has typos in year part of the StatusDate value
            # which can cause validation errors, so fix leading zeroes if they are missing
            if action_date.find("-") < 4:
                action_date = ("0" * (4 - action_date.find("-"))) + action_date

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", action["FullStatus"]),
                date=action_date,
                chamber=actor,
                classification=action_type,
            )

        # Capture votes
        votes_url = "http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}".format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)["data"]
        bill.add_source(votes_url)
        for vote in votes:
            roll_call_id = vote["VoteHeaderID"]
            roll_call_url = ("http://legislature.vermont.gov/bill/"
                             "loadBillRollCallDetails/{0}/{1}".format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)["data"]

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # MemberName looks like "Smith of Burlington"; keep the name.
                (member_name, _district) = member["MemberName"].split(" of ")
                member_name = member_name.strip()

                if member["MemberVote"] == "Yea":
                    roll_call_yea.append(member_name)
                elif member["MemberVote"] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            if ("Passed -- " in vote["FullStatus"]
                    # seems like we've seen both
                    or "Governor overridden" in vote["FullStatus"]
                    or "Governor overriden" in vote["FullStatus"]):
                did_pass = True
            elif ("Failed -- " in vote["FullStatus"]
                  or "Veto of the Governor sustained" in vote["FullStatus"]):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear: " +
                                     vote["FullStatus"])

            # Check vote counts
            yea_count = int(
                re.search(r"Yeas = (\d+)", vote["FullStatus"]).group(1))
            nay_count = int(
                re.search(r"Nays = (\d+)", vote["FullStatus"]).group(1))

            vote_start_date = datetime.datetime.strftime(
                datetime.datetime.strptime(vote["StatusDate"], "%m/%d/%Y"),
                "%Y-%m-%d",
            )
            motion_text = re.sub(HTML_TAGS_RE, "", vote["FullStatus"]).strip()
            vote_identifer = (vote["StatusDate"] + "--" + motion_text + "--" +
                              roll_call_url)
            vote_to_add = VoteEvent(
                identifier=vote_identifer,
                bill=bill,
                chamber=("lower" if vote["ChamberCode"] == "H" else "upper"),
                start_date=vote_start_date,
                motion_text=motion_text,
                result="pass" if did_pass else "fail",
                classification="passage",
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)
            vote_to_add.set_count("yes", yea_count)
            vote_to_add.set_count("no", nay_count)
            vote_to_add.set_count("not voting", len(roll_call_not_voting))
            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote("not voting", member)
            yield vote_to_add

        # NOTE(review): the three print URLs below hard-code "2020" where the
        # rest of this method uses year_slug -- looks wrong for other
        # sessions; confirm against the site before changing.
        # Witnesses:
        # http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        witnesses_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/witnesses".format(
            bill_id_original_format)
        bill.add_document_link(note="Witness List",
                               url=witnesses_doc_link_url,
                               media_type="text/html")

        # Conference committee members:
        # http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        conferees_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/conference".format(
            bill_id_original_format)
        page = self.lxmlize(conferees_doc_link_url)
        no_data = page.xpath('//div[@class="no-data"]/text()')
        if not no_data:
            bill.add_document_link(
                note="Conference Committee Members",
                url=conferees_doc_link_url,
                media_type="text/html",
            )

        # Committee meetings:
        # http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
        meetings_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/meetings".format(
            bill_id_original_format)
        bill.add_document_link(
            note="Committee Meetings",
            url=meetings_doc_link_url,
            media_type="text/html",
        )

        yield bill
def _parse_house_bill(self, url, session):
    """Parse one Missouri House bill summary page and yield the Bill.

    Also yields any VoteEvents produced by ``_parse_house_actions``.
    Blank summary pages are recorded in ``self._bad_urls`` and skipped.

    :param url: relative bill summary URL from the bill listing.
    :param session: legislative session (unused directly here;
        ``self._session_id`` is used on the Bill).
    """
    # using the print page makes the page simpler, and also *drastically* smaller
    # (8k rather than 100k)
    url = re.sub("billsummary", "billsummaryprn", url)
    url = "%s/%s" % (self._house_base_url, url)

    # the URL is an iframed version now, so swap in for the actual bill page
    url = url.replace("Bill.aspx", "BillContent.aspx")
    url = url.replace("&code=R", "&code=R&style=new")

    # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
    # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

    bill_page = self.get(url).text
    bill_page = lxml.html.fromstring(bill_page)
    bill_page.make_links_absolute(url)

    bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
    if len(bill_id) == 0:
        self.info("WARNING: bill summary page is blank! (%s)" % url)
        self._bad_urls.append(url)
        return
    bill_id = bill_id[0].text_content()
    bill_id = clean_text(bill_id)

    bill_desc = bill_page.xpath(
        '//*[@class="BillDescription"]')[0].text_content()
    bill_desc = clean_text(bill_desc)

    table_rows = bill_page.xpath("//table/tr")
    # if there is a cosponsor all the rows are pushed down one for the extra row
    # for the cosponsor:
    cosponsorOffset = 0
    if table_rows[2][0].text_content().strip() == "Co-Sponsor:":
        cosponsorOffset = 1

    # Sanity-check the row layout before reading positional cells; the
    # asserts fail loudly if the page structure shifts again.
    lr_label_tag = table_rows[3 + cosponsorOffset]
    assert lr_label_tag[0].text_content().strip() == "LR Number:"
    # bill_lr = lr_label_tag[1].text_content()

    lastActionOffset = 0
    if (table_rows[4 + cosponsorOffset][0].text_content().strip() ==
            "Governor Action:"):
        lastActionOffset = 1

    official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
    assert official_title_tag[0].text_content().strip() == "Bill String:"
    official_title = official_title_tag[1].text_content()

    # could substitute the description for the name,
    # but keeping it separate for now.
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]
        bill_number = int(bill_id[3:].strip())
    else:
        bill_number = int(bill_id[3:])

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
    self.info(bid)

    if bill_desc == "":
        if bill_number <= 20:
            # blank bill titles early in session are approp. bills
            bill_desc = "Appropriations Bill"
        else:
            self.error("Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title))
            return

    bill = Bill(
        bill_id,
        chamber="lower",
        title=bill_desc,
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_title(official_title, note="official")
    bill.add_source(url)

    bill_sponsor = clean_text(table_rows[0][1].text_content())
    # try:
    #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
    # except IndexError:
    #     return
    bill.add_sponsorship(bill_sponsor,
                         entity_type="person",
                         classification="primary",
                         primary=True)

    # check for cosponsors; the one-element unpack asserts exactly one link
    (sponsors_url, ) = bill_page.xpath(
        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)

    # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
    # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
    # actions_link = re.sub("content", "print", actions_link)
    (actions_link, ) = bill_page.xpath(
        "//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)

    # get bill versions
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc = clean_text(doc_tag.text_content())
        text_url = "%s%s" % (self._house_base_url, doc_tag[0].attrib["href"])
        bill.add_document_link(doc, text_url, media_type="text/html")

    # get bill versions
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            if vurl.text == "PDF":
                mimetype = "application/pdf"
            else:
                mimetype = "text/html"
            bill.add_version_link(
                version,
                vurl.attrib["href"],
                media_type=mimetype,
                on_duplicate="ignore",
            )

    # house bill versions
    # everything between the row containing "Bill Text" in an h2 and the next div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[h2[contains(text(),"Bill Text")]]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )
    for row in version_rows:
        # some rows are just broken links, not real versions
        if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            if ".pdf" in path:
                mimetype = "application/pdf"
            else:
                mimetype = "text/html"
            bill.add_version_link(version,
                                  path,
                                  media_type=mimetype,
                                  on_duplicate="ignore")

    # house bill summaries
    # everything between the row containing "Bill Summary" in an h2
    # and the next div.DocHeaderRow
    summary_rows = bill_page.xpath(
        '//div[h2[contains(text(),"Bill Summary")]]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )
    # if there are no amedments, we need a different xpath for summaries
    if not summary_rows:
        summary_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Summary")]]/'
            'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(summary_rows):
        version = row.xpath(
            './/div[contains(@class,"textType")]/a/text()')[0].strip()
        if version:
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            summary_name = "Bill Summary ({})".format(version)
            if ".pdf" in path:
                mimetype = "application/pdf"
            else:
                mimetype = "text/html"
            bill.add_document_link(summary_name,
                                   path,
                                   media_type=mimetype,
                                   on_duplicate="ignore")

    # house bill amendments
    amendment_rows = bill_page.xpath(
        '//div[h2[contains(text(),"Amendment")]]/'
        'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(amendment_rows):
        version = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
        path = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
        summary_name = "Amendment {}".format(version)

        # Tag the amendment's disposition from the status icons, if any.
        defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
        if defeated_icon:
            summary_name = "{} (Defeated)".format(summary_name)

        adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
        if adopted_icon:
            summary_name = "{} (Adopted)".format(summary_name)

        distributed_icon = row.xpath(
            './/img[contains(@title,"Distributed")]')
        if distributed_icon:
            summary_name = "{} (Distributed)".format(summary_name)

        if ".pdf" in path:
            mimetype = "application/pdf"
        else:
            mimetype = "text/html"
        bill.add_version_link(summary_name,
                              path,
                              media_type=mimetype,
                              on_duplicate="ignore")

    yield bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrape a single Michigan bill, yielding VoteEvents then the Bill.

    The bill page is fetched under the first year of the session biennium,
    then the second year; if neither loads, the bill is skipped with a
    warning.

    Fixes over the previous revision:
    - ``mimetype`` now defaults to ``None`` instead of being conditionally
      bound, so an unexpected file extension cannot raise ``NameError`` or
      silently reuse a stale value from a prior loop iteration.
    - In the 2017-2018 tab-split voter-name branch, each split name ``n``
      is recorded (previously the whole unsplit ``name`` string was added
      once per split part).
    """
    # try and get bill for the first year of the session biennium
    url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
        session[:4],
        bill_id.replace(" ", "-"),
    )
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if (
        "Page Not Found" in html
        or "The bill you are looking for is not available yet" in html
    ):
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[-4:],
            bill_id.replace(" ", "-"),
        )
        html = self.get(url).text
        if (
            "Page Not Found" in html
            or "The bill you are looking for is not available yet" in html
        ):
            self.warning("Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute("http://legislature.mi.gov")

    title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[
        0
    ].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(" ")[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber, classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u"\xa0", " ")
        # sometimes district gets added as a link
        if name.isnumeric():
            continue

        if len(sponsors) > 1:
            classification = (
                "primary"
                if sponsor.tail and "primary" in sponsor.tail
                else "cosponsor"
            )
        else:
            classification = "primary"
        bill.add_sponsorship(
            name=name.strip(),
            chamber=chamber,
            entity_type="person",
            primary=classification == "primary",
            classification=classification,
        )

    bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath("td")  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()

        # Dates appear with both 2- and 4-digit years; try both formats.
        try:
            date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%y"))
        except ValueError:
            try:
                date = TIMEZONE.localize(
                    datetime.datetime.strptime(date, "%m/%d/%Y")
                )
            except ValueError:
                self.warning(
                    "{} has action with invalid date. Skipping Action".format(
                        bill_id
                    )
                )
                continue

        # use journal for actor
        # then fall back to upper/lower case
        # Journal entries are often posted with 'Expected Soon' as the cite,
        # then changed to the journal entry.
        if "SJ" in journal.upper():
            actor = "upper"
        elif "HJ" in journal.upper():
            actor = "lower"
        elif action.split()[0].islower():
            actor = "lower"
        elif action.split()[0].isupper():
            actor = "upper"
        else:
            actor = "legislature"

        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor, classification=classification)

        # check if action mentions a sub
        submatch = re.search(
            r"WITH SUBSTITUTE\s+([\w\-\d]+)", action, re.IGNORECASE
        )
        if submatch and tds[2].xpath("a"):
            version_url = tds[2].xpath("a/@href")[0]
            version_name = tds[2].xpath("a/text()")[0].strip()
            version_name = "Substitute {}".format(version_name)
            self.info("Found Substitute {}".format(version_url))
            # Default to None so unknown extensions cannot leave mimetype
            # unbound (or carrying the previous iteration's value).
            mimetype = None
            if version_url.lower().endswith(".pdf"):
                mimetype = "application/pdf"
            elif version_url.lower().endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(version_name, version_url, media_type=mimetype)

        # check if action mentions a vote
        rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath("a/@href")
            if journal_link:
                objectname = journal_link[0].rsplit("=", 1)[-1]
                chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                    session,
                    chamber_name,
                    objectname,
                )
                results = self.parse_roll_call(vote_url, rc_num, session)

                if results is not None:
                    vote_passed = len(results["yes"]) > len(results["no"])
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result="pass" if vote_passed else "fail",
                        classification="passage",
                    )

                    # check the expected counts vs actual
                    count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["yes"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d"
                            % (bill_id, action, count, len(results["yes"]))
                        )
                    count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["no"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d"
                            % (bill_id, action, count, len(results["no"]))
                        )

                    vote.set_count("yes", len(results["yes"]))
                    vote.set_count("no", len(results["no"]))
                    vote.set_count("other", len(results["other"]))
                    possible_vote_results = ["yes", "no", "other"]
                    for pvr in possible_vote_results:
                        for name in results[pvr]:
                            if session == "2017-2018":
                                # 2017-2018 journals pack several voters into
                                # one tab-separated cell; record each one.
                                names = name.split("\t")
                                for n in names:
                                    # FIX: was name.strip(), which re-added
                                    # the whole unsplit string per part.
                                    vote.vote(pvr, n.strip())
                            else:
                                # Prevents voter names like
                                # "House Bill No. 4451, entitled" and other
                                # sentences
                                if len(name.split()) < 5:
                                    vote.vote(pvr, name.strip())
                    vote.add_source(vote_url)
                    yield vote
            else:
                self.warning("missing journal link for %s %s" % (bill_id, journal))

    # versions
    for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            # Same unbound-mimetype guard as above.
            mimetype = None
            if url.endswith(".pdf"):
                mimetype = "application/pdf"
            elif url.endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(name, url, media_type=mimetype)

    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape a single Utah bill page and yield the Bill.

    Also yields any VoteEvents produced by ``parse_status``.

    Fix over the previous revision: an unrecognized ``bill_id`` pattern now
    raises an explicit ``ValueError`` (matching the other scrapers in this
    file) instead of hitting a ``NameError`` on the unbound ``bill_type``
    when constructing the Bill.
    """
    page = self.lxmlize(url)

    # Page heading is "<bill id> <title>"; strip the id to get the title.
    (header, ) = page.xpath('//h3[@class="heading"]/text()')
    title = header.replace(bill_id, "").strip()

    if ".B. " in bill_id:
        bill_type = "bill"
    elif bill_id.startswith("H.R. ") or bill_id.startswith("S.R. "):
        bill_type = "resolution"
    elif ".C.R. " in bill_id:
        bill_type = "concurrent resolution"
    elif ".J.R. " in bill_id:
        bill_type = "joint resolution"
    else:
        # Previously fell through, leaving bill_type unbound.
        raise ValueError("unknown bill type " + bill_id)

    # Strip substitute markers and normalize "H.B. 12" -> "HB 12".
    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip().replace(".", "")

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)

    primary_info = page.xpath('//div[@id="billsponsordiv"]')
    for info in primary_info:
        try:
            (title, name) = [
                x.strip() for x in info.xpath(".//text()") if x.strip()
            ]
        except ValueError:
            self.warning(
                "Could not find sponsor's name for {}".format(bill_id))
            continue
        assert title == "Bill Sponsor:"
        name = name.replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(name,
                             classification="primary",
                             entity_type="person",
                             primary=True)

    floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
    floor_info = [x.strip() for x in floor_info if x.strip()]
    if len(floor_info) in (0, 1):
        # This indicates that no floor sponsor was found
        pass
    elif len(floor_info) == 2:
        assert floor_info[0] == "Floor Sponsor:"
        floor_sponsor = floor_info[1].replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(
            floor_sponsor,
            classification="cosponsor",
            entity_type="person",
            primary=False,
        )
    else:
        self.warning("Unexpected floor sponsor HTML found")

    versions = page.xpath(
        '//b[text()="Bill Text"]/following-sibling::ul/li/'
        'a[text() and not(text()=" ")]')
    for version in versions:
        # sometimes the href is on the following <a> tag and the tag we
        # have has an onclick
        url = version.get("href")
        if not url:
            url = version.xpath("following-sibling::a[1]/@href")[0]
        bill.add_version_link(version.xpath("text()")[0].strip(),
                              url,
                              media_type="application/pdf")

    for related in page.xpath(
            '//b[text()="Related Documents "]/following-sibling::ul/li/'
            'a[contains(@class,"nlink")]'):
        href = related.xpath("@href")[0]
        if ".fn.pdf" in href:
            bill.add_document_link("Fiscal Note",
                                   href,
                                   media_type="application/pdf")
        else:
            text = related.xpath("text()")[0]
            bill.add_document_link(text, href, media_type="application/pdf")

    # Related-bill links double as the bill's subject list on this site.
    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill.subject = subjects

    if page.xpath('//div[@id="billStatus"]//table'):
        status_table = page.xpath('//div[@id="billStatus"]//table')[0]
        yield from self.parse_status(bill, status_table, chamber)

    yield bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Washington bill via the legislature's XML web service.

    Yields each VoteEvent for the bill, then the Bill itself.
    Gubernatorial appointments are not bills and are skipped entirely.
    """
    bill_num = bill_id.split()[1]

    # Fetch and parse the GetLegislation payload for this bill number.
    url = "%s/GetLegislation?biennium=%s&billNumber" "=%s" % (
        self._base_url,
        self.biennium,
        bill_num,
    )
    response = self.get(url)
    root = lxml.etree.fromstring(response.content)
    leg_node = xpath(root, "//wa:Legislation")[0]

    origin_agency = xpath(leg_node, "string(wa:OriginalAgency)")
    chamber = self._chamber_map[origin_agency]
    title = xpath(leg_node, "string(wa:LongDescription)")

    bill_type = xpath(
        leg_node, "string(wa:ShortLegislationType/wa:LongLegislationType)"
    ).lower()
    if bill_type == "gubernatorial appointment":
        return

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=[bill_type],
    )

    # The XML service has no citable human-facing page, so point at the
    # legislature's bill summary page instead.
    fake_source = (
        "http://apps.leg.wa.gov/billinfo/"
        "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4])
    )
    bill.add_source(fake_source)

    # Versions were pre-scraped into self.versions, keyed by full bill id.
    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(
                note=version["note"],
                url=version["url"],
                media_type=version["media_type"],
            )
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))

    # Documents are keyed by bare bill number; absence is not an error.
    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(
                note=document["note"],
                url=document["url"],
                media_type=document["media_type"],
            )
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, chamber, fake_source)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
def test_full_bill():
    """End-to-end import test: build a fully-populated scraped bill (plus a
    prior-session bill it relates to), import both, and verify that every
    field round-trips through the database correctly.
    """
    create_jurisdiction()
    sponsor = Person.objects.create(name="Adam Smith")
    house = Organization.objects.create(
        jurisdiction_id="jid", name="House", classification="lower"
    )
    Membership.objects.create(person_id=sponsor.id, organization_id=house.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=house,
    )

    prior_bill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )

    new_bill = ScrapeBill(
        "HB 1",
        "1900",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )
    new_bill.subject = ["taxes", "axes"]
    new_bill.add_identifier("SB 9")
    new_bill.add_title("Tack & Axe Tax Act")
    new_bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    committee_action = new_bill.add_action(
        "sent to arbitrary committee", "1900-04-04", chamber="lower"
    )
    committee_action.add_related_entity(
        "arbitrary committee",
        "organization",
        _make_pseudo_id(name="Arbitrary Committee"),
    )
    new_bill.add_related_bill(
        "HB 99", legislative_session="1899", relation_type="prior-session"
    )
    new_bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    new_bill.add_sponsorship(
        "Jane Smith",
        classification="lead sponsor",
        entity_type="person",
        primary=True,
    )
    new_bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    new_bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.pdf", media_type="application/pdf"
    )
    new_bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.html", media_type="text/html"
    )
    new_bill.add_version_link(
        "Fiscal Note", "http://example.com/v/1", media_type="text/html"
    )
    new_bill.add_source("http://example.com/source")

    # import both bills
    BillImporter("jid").import_data([prior_bill.as_dict(), new_bill.as_dict()])

    # get bill from db and assert it imported correctly
    stored = Bill.objects.get(identifier="HB 1")
    assert stored.from_organization.classification == "lower"
    assert stored.identifier == new_bill.identifier
    assert stored.title == new_bill.title
    assert stored.classification == new_bill.classification
    assert stored.subject == ["taxes", "axes"]
    assert stored.abstracts.get().note == "official"
    assert stored.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert stored.other_titles.get().title == "Tack & Axe Tax Act"
    assert stored.other_identifiers.get().identifier == "SB 9"

    # actions
    stored_actions = list(stored.actions.all())
    assert len(stored_actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert stored_actions[0].organization == Organization.objects.get(
        classification="lower"
    )
    assert stored_actions[0].description == "introduced in house"
    assert stored_actions[1].description == "sent to arbitrary committee"
    assert stored_actions[1].related_entities.get(
    ).organization == Organization.objects.get(classification="committee")

    # action computed fields
    assert stored.first_action_date == "1900-04-01"
    assert stored.latest_action_date == "1900-04-04"
    assert stored.latest_action_description == "sent to arbitrary committee"

    # related_bills were added and the relation resolved to the real bill
    related = stored.related_bills.get()
    assert related.identifier == "HB 99"
    assert related.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    stored_sponsorships = stored.sponsorships.all()
    assert len(stored_sponsorships) == 2
    adam = Person.objects.get(name="Adam Smith")
    for sponsorship in stored_sponsorships:
        if sponsorship.primary:
            # "Jane Smith" has no matching Person/Organization record
            assert sponsorship.person is None
            assert sponsorship.organization is None
        else:
            # "Adam Smith" resolved via pseudo-id to the created Person
            assert sponsorship.person == adam

    # versions & documents with their links
    stored_versions = stored.versions.all()
    assert len(stored_versions) == 1
    assert stored_versions[0].links.count() == 1
    stored_documents = stored.documents.all()
    assert len(stored_documents) == 1
    assert stored_documents[0].links.count() == 2

    # sources
    assert stored.sources.count() == 1
def scrape(self, session=None):
    """Scrape all Virginia bills for a session from pre-loaded LIS data.

    The ``load_*`` calls populate instance caches (``self._bills``,
    ``self._sponsors``, ``self._history``, ``self._votes``, etc.) from the
    LIS bulk files; this method then assembles Bill and VoteEvent objects
    from those caches.  Yields VoteEvents as they are built, then each Bill.

    :param session: session identifier; defaults to the newest session
        declared on the jurisdiction.
    """
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]["identifier"]
        self.info("no session specified, using %s", session)
    chamber_types = {
        "H": "lower",
        "S": "upper",
        "G": "executive",
        "C": "legislature",
    }
    session_id = SESSION_SITE_IDS[session]
    self._url_base += session_id + "/"
    bill_url_base = "https://lis.virginia.gov/cgi-bin/"

    # Populate the instance caches consumed below.
    self.load_members()
    self.load_sponsors()
    self.load_amendments()
    self.load_history()
    self.load_summaries()
    self.load_votes()
    self.load_bills()

    for bill in self._bills:
        # Each cache entry is a list; the first element holds the bill row.
        bill = self._bills[bill][0]
        bill_id = bill["bill_id"]
        # First char of the id is the chamber, second is the bill type.
        chamber = chamber_types[bill_id[0]]
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution"
        }[bill_id[1]]
        b = Bill(
            bill_id,
            session,
            bill["bill_description"],
            chamber=chamber,
            classification=bill_type,
        )
        bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}"
        b.add_source(bill_url)

        # Long Bill ID needs to have 6 characters to work with vote urls, sponsors, and summaries.
        # Fill in blanks with 0s
        long_bill_id = bill_id
        if len(bill_id) == 3:
            long_bill_id = bill_id[0:2] + "000" + bill_id[-1]
        elif len(bill_id) == 4:
            long_bill_id = bill_id[0:2] + "00" + bill_id[-2:]
        elif len(bill_id) == 5:
            long_bill_id = bill_id[0:2] + "0" + bill_id[-3:]

        # Sponsors
        for spon in self._sponsors[long_bill_id]:
            sponsor_type = spon["patron_type"]
            if sponsor_type.endswith("Chief Patron"):
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            b.add_sponsorship(
                spon["member_name"],
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )

        # Summary
        summary_texts = self._summaries[long_bill_id]
        for sum_text in summary_texts:
            b.add_abstract(sum_text["summary_text"], sum_text["summary_type"])

        # Amendment docs
        amendments = self._amendments[bill_id]
        for amend in amendments:
            doc_link = (
                bill_url_base +
                f"legp604.exe?{session_id}+amd+{amend['txt_docid']}")
            b.add_document_link("Amendment: " + amend["txt_docid"],
                                doc_link,
                                media_type="text/html")

        # Action text is used to improve version text
        actions_text = []
        # History and then votes
        # NOTE: `chamber` is reassigned per-action here, shadowing the
        # bill-level chamber computed above (the Bill is already built).
        for hist in self._history[bill_id]:
            action = hist["history_description"]
            action_date = hist["history_date"]
            date = datetime.datetime.strptime(action_date, "%m/%d/%y").date()
            chamber = chamber_types[action[0]]
            vote_id = hist["history_refid"]
            # Strip the leading "H " / "S " actor prefix from the text.
            cleaned_action = action[2:]
            actions_text.append(cleaned_action)

            # categorize actions; for-else leaves atype=None when no
            # classifier pattern matches
            for pattern, atype in ACTION_CLASSIFIERS:
                if re.match(pattern, cleaned_action):
                    break
            else:
                atype = None

            if atype != SKIP:
                b.add_action(cleaned_action,
                             date,
                             chamber=chamber,
                             classification=atype)

            # A non-empty history_refid means a recorded vote exists.
            if len(vote_id) > 0:
                total_yes = 0
                total_no = 0
                total_not_voting = 0
                total_abstain = 0
                for v in self._votes[vote_id]:
                    if v["vote_result"] == "yes":
                        total_yes += 1
                    elif v["vote_result"] == "no":
                        total_no += 1
                    elif v["vote_result"] == "not voting":
                        total_not_voting += 1
                    elif v["vote_result"] == "abstain":
                        total_abstain += 1

                vote = VoteEvent(
                    identifier=vote_id,
                    start_date=date,
                    chamber=chamber,
                    motion_text=cleaned_action,
                    result="pass" if total_yes > total_no else "fail",
                    classification="passage",
                    bill=b,
                )
                vote.set_count("yes", total_yes)
                vote.set_count("no", total_no)
                vote.set_count("not voting", total_not_voting)
                vote.set_count("abstain", total_abstain)
                vote_url = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}")
                vote.add_source(vote_url)
                for v in self._votes[vote_id]:
                    vote.vote(v["vote_result"], v["member_id"])

                yield vote

        # Versions
        for version in bill["text_docs"]:
            # Checks if abbr is blank as not every bill has multiple versions
            if len(version["doc_abbr"]) > 0:
                version_url = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+ful+{version['doc_abbr']}")
                version_date = datetime.datetime.strptime(
                    version["doc_date"], "%m/%d/%y").date()
                version_text = version["doc_abbr"]
                # Prefer the fuller action text mentioning this version.
                for act in actions_text:
                    if version_text in act:
                        version_text = act
                b.add_version_link(
                    version_text,
                    version_url,
                    date=version_date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )

        yield b
def scrape_bill(self, chamber, session, bill_id, bill_type, url):
    """Scrape a single Alaska bill detail page.

    Parses title, sponsors, actions (yielding vote events via
    ``parse_vote``), subjects, versions, and documents, then yields the
    assembled ``Bill``.

    :param chamber: origin chamber classification for the bill
    :param session: legislative session identifier
    :param bill_id: bill identifier, e.g. "HB 1"
    :param bill_type: Bill classification value(s)
    :param url: bill detail page URL
    """
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    title = doc.xpath('//span[text()="Title"]')[0].getparent()
    short_title = doc.xpath('//span[text()="Short Title "]')[0].getparent()

    if len(title) > 1 and title[1].text:
        title = title[1].text.strip().strip('"')
    elif len(short_title) > 1 and short_title[1].text:
        self.warning("Falling back to short title on {}".format(url))
        title = short_title[1].text.strip().strip('"')
    else:
        # No usable title at all — skip the bill entirely.
        self.warning("skipping bill {}, no Title".format(url))
        return

    bill = Bill(
        bill_id,
        title=title,
        chamber=chamber,
        classification=bill_type,
        legislative_session=session,
    )
    bill.add_source(url)

    # Get sponsors
    spons_str = (
        doc.xpath('//span[contains(text(), "Sponsor(S)")]')[0].getparent()[1].text
    )
    # Checks if there is a Sponsor string before matching
    if spons_str:
        sponsors_match = re.match(r"(SENATOR|REPRESENTATIVE)", spons_str)
        if sponsors_match:
            # Individual legislators: first is primary, rest are cosponsors.
            sponsors = spons_str.split(",")
            sponsor = sponsors[0].strip()
            if sponsor:
                bill.add_sponsorship(
                    sponsors[0].split()[1],
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )
            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsorship(
                        sponsor,
                        entity_type="person",
                        classification="cosponsor",
                        primary=False,
                    )
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()
            if re.match(r" BY REQUEST OF THE GOVERNOR$", spons_str):
                spons_str = re.sub(
                    r" BY REQUEST OF THE GOVERNOR$", "", spons_str
                ).title()
                spons_str = spons_str + " Committee (by request of the governor)"
            if spons_str:
                bill.add_sponsorship(
                    spons_str,
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )

    # Get actions
    self._current_comm = None
    act_rows = doc.xpath("//div[@id='tab6_4']//tr")[1:]
    for row in act_rows:
        date, journal, action = row.xpath("td")
        action = action.text_content().strip()
        raw_chamber = action[0:3]
        journal_entry_number = journal.text_content()
        act_date = datetime.datetime.strptime(
            date.text_content().strip(), "%m/%d/%Y"
        )
        if raw_chamber == "(H)":
            act_chamber = "lower"
        elif raw_chamber == "(S)":
            act_chamber = "upper"
        else:
            # FIX: previously act_chamber was left unbound (NameError) when
            # the action text did not start with "(H)" or "(S)"; fall back
            # to the bill's own chamber.
            act_chamber = chamber

        # Votes: a "Y<number>" token marks an action row carrying a vote.
        if re.search(r"Y(\d+)", action):
            vote_href = journal.xpath(".//a/@href")
            if vote_href:
                vote_href = vote_href[0].replace(" ", "")
                yield from self.parse_vote(
                    bill,
                    journal_entry_number,
                    action,
                    act_chamber,
                    act_date,
                    vote_href,
                )

        action, atype = self.clean_action(action)

        # Prefile actions embed their real date in the text.
        match = re.search(r"^Prefile released (\d+/\d+/\d+)$", action)
        if match:
            action = "Prefile released"
            act_date = datetime.datetime.strptime(match.group(1), "%m/%d/%y")

        bill.add_action(
            action,
            chamber=act_chamber,
            date=act_date.strftime("%Y-%m-%d"),
            classification=atype,
        )

    # Get subjects
    for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
        bill.add_subject(subj.strip())

    # Get versions - to do
    text_list_url = (
        f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab1_4"
    )
    bill.add_source(text_list_url)
    text_doc = lxml.html.fromstring(self.get(text_list_url).text)
    text_doc.make_links_absolute(text_list_url)
    for link in text_doc.xpath('//a[contains(@href, "/Text/")]'):
        name = link.text_content()
        text_url = link.get("href")
        bill.add_version_link(name, text_url, media_type="text/html")

    # Get documents - to do
    doc_list_url = (
        f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab5_4"
    )
    doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
    doc_list.make_links_absolute(doc_list_url)
    bill.add_source(doc_list_url)
    seen = set()  # dedupe document hrefs
    for href in doc_list.xpath('//a[contains(@href, "get_documents")][@onclick]'):
        h_name = href.text_content()
        h_href = href.attrib["href"]
        if h_name.strip() and h_href not in seen:
            bill.add_document_link(h_name, h_href)
            seen.add(h_href)

    yield bill
def scrape(self, session=None):
    """Scrape all Virginia bills for a session from the LIS sftp dumps.

    Loads members/sponsors/fiscal-notes/summaries/history/votes/bills
    (and amendments) into ``self._*`` mappings via the ``load_*`` helpers,
    then yields a ``Bill`` per entry (plus ``VoteEvent`` objects for any
    recorded votes). For "special" sessions only the bill file is loaded.
    """
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]["identifier"]
        self.info("no session specified, using %s", session)
    # First letter of a bill id / action code maps to its chamber.
    chamber_types = {
        "H": "lower",
        "S": "upper",
        "G": "executive",
        "C": "legislature",
    }
    # pull the current session's details to tell if it's a special
    session_details = next(
        each
        for each in self.jurisdiction.legislative_sessions
        if each["identifier"] == session
    )
    is_special = False
    if (
        "classification" in session_details
        and session_details["classification"] == "special"
    ):
        is_special = True
    session_id = SESSION_SITE_IDS[session]
    self.init_sftp(session_id)
    bill_url_base = "https://lis.virginia.gov/cgi-bin/"
    # Supplementary data files are not published for special sessions.
    if not is_special:
        self.load_members()
        self.load_sponsors()
        self.load_fiscal_notes()
        self.load_summaries()
        self.load_history()
        self.load_votes()
    self.load_bills()
    if not is_special:
        self.load_amendments()
    for bill in self._bills:
        bill = self._bills[bill][0]
        bill_id = bill["bill_id"]
        chamber = chamber_types[bill_id[0]]
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution"
        }[bill_id[1]]
        b = Bill(
            bill_id,
            session,
            bill["bill_description"],
            chamber=chamber,
            classification=bill_type,
        )
        bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}"
        b.add_source(bill_url)
        # Long Bill ID needs to have 6 characters to work with vote urls, sponsors, and summaries.
        # Fill in blanks with 0s
        long_bill_id = bill_id
        if len(bill_id) == 3:
            long_bill_id = bill_id[0:2] + "000" + bill_id[-1]
        elif len(bill_id) == 4:
            long_bill_id = bill_id[0:2] + "00" + bill_id[-2:]
        elif len(bill_id) == 5:
            long_bill_id = bill_id[0:2] + "0" + bill_id[-3:]
        # Sponsors: fall back to the bill file's patron_name when the
        # sponsors file has no entry for this bill.
        if long_bill_id not in self._sponsors:
            if "patron_name" in bill and bill["patron_name"].strip() != "":
                b.add_sponsorship(
                    bill["patron_name"],
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        for spon in self._sponsors[long_bill_id]:
            if spon["member_name"].strip() == "":
                continue
            sponsor_type = spon["patron_type"]
            if sponsor_type.endswith("Chief Patron"):
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            b.add_sponsorship(
                spon["member_name"],
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )
        # Summary
        summary_texts = self._summaries[long_bill_id]
        for sum_text in summary_texts:
            b.add_abstract(sum_text["summary_text"], sum_text["summary_type"])
        # Amendment docs
        amendments = self._amendments[bill_id]
        for amend in amendments:
            doc_link = (
                bill_url_base + f"legp604.exe?{session_id}+amd+{amend['txt_docid']}"
            )
            b.add_document_link(
                "Amendment: " + amend["txt_docid"], doc_link,
                media_type="text/html"
            )
        # fiscal notes
        for fn in self._fiscal_notes[long_bill_id]:
            doc_link = bill_url_base + f"legp604.exe?{session_id}+oth+{fn['refid']}"
            b.add_document_link(
                "Fiscal Impact Statement: " + fn["refid"],
                doc_link.replace(".PDF", "+PDF"),
                media_type="application/pdf",
            )
        # actions with 8-digit number followed by D are version titles too
        doc_actions = defaultdict(list)
        # History and then votes
        for hist in self._history[bill_id]:
            action = hist["history_description"]
            action_date = hist["history_date"]
            date = datetime.datetime.strptime(action_date, "%m/%d/%y").date()
            # NOTE: this rebinds `chamber` (the bill's chamber) to the
            # acting chamber for the rest of the loop iteration.
            chamber = chamber_types[action[0]]
            vote_id = hist["history_refid"]
            cleaned_action = action[2:]
            if re.findall(r"\d{8}D", cleaned_action):
                doc_actions[action_date].append(cleaned_action)
            # categorize actions
            for pattern, atype in ACTION_CLASSIFIERS:
                if re.match(pattern, cleaned_action):
                    break
            else:
                # no classifier matched
                atype = None
            if atype != SKIP:
                b.add_action(cleaned_action, date, chamber=chamber,
                             classification=atype)
            # A non-empty history_refid means a recorded vote exists.
            if len(vote_id) > 0:
                total_yes = 0
                total_no = 0
                total_not_voting = 0
                total_abstain = 0
                for v in self._votes[vote_id]:
                    if v["vote_result"] == "yes":
                        total_yes += 1
                    elif v["vote_result"] == "no":
                        total_no += 1
                    elif v["vote_result"] == "not voting":
                        total_not_voting += 1
                    elif v["vote_result"] == "abstain":
                        total_abstain += 1
                vote = VoteEvent(
                    identifier=vote_id,
                    start_date=date,
                    chamber=chamber,
                    motion_text=cleaned_action,
                    result="pass" if total_yes > total_no else "fail",
                    classification="passage",
                    bill=b,
                )
                vote.set_count("yes", total_yes)
                vote.set_count("no", total_no)
                vote.set_count("not voting", total_not_voting)
                vote.set_count("abstain", total_abstain)
                vote_url = (
                    bill_url_base
                    + f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}"
                )
                vote.add_source(vote_url)
                for v in self._votes[vote_id]:
                    vote.vote(v["vote_result"], v["member_id"])
                yield vote
        # Versions
        for version in bill["text_docs"]:
            # Checks if abbr is blank as not every bill has multiple versions
            if version["doc_abbr"]:
                version_url = (
                    bill_url_base
                    + f"legp604.exe?{session_id}+ful+{version['doc_abbr']}"
                )
                version_date = datetime.datetime.strptime(
                    version["doc_date"], "%m/%d/%y").date()
                # version text will default to abbreviation provided in CSV
                # but if there is an unambiguous action from that date with
                # a version, we'll use that as the document title
                version_text = version["doc_abbr"]
                if len(doc_actions[version["doc_date"]]) == 1:
                    version_text = doc_actions[version["doc_date"]][0]
                b.add_version_link(
                    version_text,
                    version_url,
                    date=version_date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )
        yield b
def bill_info(self, bill_link, session, main_url):
    """Scrape one Nebraska bill page: sponsor, actions, versions, docs.

    Yields the ``Bill`` and then delegates to ``scrape_votes``.

    :param bill_link: URL of the bill detail page
    :param session: legislative session identifier
    :param main_url: listing page URL (added as a source)
    """
    bill_page = self.lxmlize(bill_link)
    long_title = self.get_node(
        bill_page, '//div[@class="main-content"]//h2'
    ).text.split()
    bill_number = long_title[0]
    # Title is everything after the number and the following token.
    # (join replaces the old quadratic += loop; result is identical.)
    title = " ".join(long_title[2:])
    if not title:
        self.error("no title, skipping %s", bill_number)
        return
    bill_type = "resolution" if "LR" in bill_number else "bill"
    bill = Bill(bill_number, session, title, classification=bill_type)
    bill.add_source(main_url)
    bill.add_source(bill_link)

    introduced_by = self.get_node(
        bill_page,
        "//body/div[3]/div[2]/div[2]/div/div[3]/div[1]/ul/li[1]/a[1]/text()",
    )
    if not introduced_by:
        # Fallback layout: sponsor is plain text "Introduced By: <name>".
        introduced_by = self.get_node(
            bill_page,
            "//body/div[3]/div[2]/div[2]/div/div[2]/div[1]/ul/li[1]/text()",
        )
        introduced_by = introduced_by.split("Introduced By:")[1].strip()
    introduced_by = introduced_by.strip()
    bill.add_sponsorship(
        name=introduced_by,
        entity_type="person",
        primary=True,
        classification="primary",
    )

    # FIX: actor was previously only assigned inside the loop, causing a
    # NameError in the final scrape_votes call for bills with no actions.
    actor = "legislature"
    action_nodes = self.get_nodes(
        bill_page, '//div[@class="main-content"]/div[5]//table/tbody/tr'
    )
    for action_node in action_nodes:
        date = self.get_node(action_node, "./td[1]").text
        date = datetime.strptime(date, "%b %d, %Y")

        # The action node may have an anchor element within it, so
        # we grab all the text within.
        action = self.get_node(action_node, "./td[2]").text_content()

        if "Governor" in action:
            actor = "executive"
        elif "Speaker" in action:
            actor = "legislature"
        else:
            actor = "legislature"

        action_type = self.action_types(action)
        bill.add_action(
            action,
            date.strftime("%Y-%m-%d"),
            chamber=actor,
            classification=action_type,
        )

    # Grabs bill version documents.
    version_links = self.get_nodes(
        bill_page,
        "/html/body/div[3]/div[2]/div[2]/div/"
        "div[3]/div[2]/ul/li/a",
    )
    for version_link in version_links:
        version_name = version_link.text
        version_url = version_link.attrib["href"]
        # replace Current w/ session number
        version_url = version_url.replace("Current", session)
        bill.add_version_link(
            version_name, version_url, media_type="application/pdf"
        )

    soi = self.get_nodes(bill_page, ".//a[contains(text(), 'Statement of Intent')]")
    if soi:
        bill.add_document_link(
            "Statement of Intent", soi[0].get("href"),
            media_type="application/pdf"
        )

    comstmt = self.get_nodes(
        bill_page, ".//a[contains(text(), 'Committee Statement')]"
    )
    if comstmt:
        bill.add_document_link(
            "Committee Statement",
            comstmt[0].get("href"),
            media_type="application/pdf",
        )

    fn = self.get_nodes(bill_page, ".//a[contains(text(), 'Fiscal Note')]")
    if fn:
        bill.add_document_link(
            "Fiscal Note", fn[0].get("href"), media_type="application/pdf"
        )

    # Adds any documents related to amendments.
    amendment_links = self.get_nodes(
        bill_page, ".//div[contains(@class, 'amend-link')]/a"
    )
    for amendment_link in amendment_links:
        amendment_name = amendment_link.text
        amendment_url = amendment_link.attrib["href"]
        # skip over transcripts
        if "/AM/" not in amendment_url:
            continue
        bill.add_document_link(
            amendment_name, amendment_url, media_type="application/pdf"
        )

    yield bill
    yield from self.scrape_votes(bill, bill_page, actor)
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape one Kentucky bill detail page and yield the ``Bill``.

    Withdrawn bills (detected via the "Last Action" field) get the title
    "Withdrawn." and an extra withdrawal action; otherwise the page's
    Title field is used. Versions/actions/subjects/amendments are parsed
    by the ``parse_*`` helpers; votes are delegated to ``scrape_votes``.
    """
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        # Unfetchable page — log and skip this bill.
        self.logger.warning(e)
        return

    withdrawn = False

    if self.parse_bill_field(page, "Last Action") != "":
        last_action = self.parse_bill_field(page, "Last Action").xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            withdrawn = True

    if withdrawn:
        title = "Withdrawn."
    else:
        title = self.parse_bill_field(page, "Title").text_content()

    # Order matters: "CR" and "JR" both contain "R", so check them first.
    if "CR" in bill_id:
        bill_type = "concurrent resolution"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    self.parse_versions(page, bill)
    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    self.parse_proposed_amendments(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib["href"]
        mimetype = get_media_type(source_url)
        bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

    # only grab links in the first table, because proposed amendments
    # have sponsors that are not bill sponsors.
    for link in page.xpath(
        "//div[contains(@class,'bill-table')][1]//td/span/a[contains(@href, 'Legislator-Profile')]"
    ):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    # BDR = Bill Draft Request number, stored in extras.
    bdr_no = self.parse_bill_field(page, "Bill Request Number")
    if bdr_no != "" and bdr_no.xpath("text()"):
        bdr = bdr_no.xpath("text()")[0].strip()
        bill.extras["BDR"] = bdr

    if self.parse_bill_field(page, "Summary of Original Version") != "":
        summary = (
            self.parse_bill_field(page, "Summary of Original Version")
            .text_content()
            .strip()
        )
        bill.add_abstract(summary, note="Summary of Original Version")

    if withdrawn:
        # Record the withdrawal itself as an action, dated from the
        # "Last Action" text.
        action = self.parse_bill_field(page, "Last Action").text_content().strip()
        wd_date = re.findall(r"\d{2}\/\d{2}\/\d+", action)[0]
        wd_date = dateutil.parser.parse(wd_date).date()
        bill.add_action(
            action, wd_date, chamber=chamber, classification="withdrawal"
        )

    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one Oklahoma bill page: sponsors, actions, versions, votes.

    Yields the ``Bill`` unless the page is a placeholder (blank title or
    "Short Title Not Found.").
    """
    try:
        page = lxml.html.fromstring(self.get(url).text)
    except scrapelib.HTTPError as e:
        self.warning("error (%s) fetching %s, skipping" % (e, url))
        return

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])"
    ).strip()
    if not title:
        self.warning("blank bill on %s - skipping", url)
        return

    # Order matters: "JR"/"CR" contain "R".
    if "JR" in bill_id:
        bill_type = ["joint resolution"]
    elif "CR" in bill_id:
        bill_type = ["concurrent resolution"]
    elif "R" in bill_id:
        bill_type = ["resolution"]
    else:
        bill_type = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)
    bill.subject = self.subject_map[bill_id]

    # Sponsors: "otherAuth" links are cosponsors, the rest are primary.
    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        if "author not found" in name.lower():
            continue
        # a colon in the name indicates a new-style vote — fail loudly
        if ":" in name:
            raise Exception(name)
        if "otherAuth" in link.attrib["id"]:
            bill.add_sponsorship(
                name,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
        else:
            bill.add_sponsorship(
                name, classification="primary", entity_type="person", primary=True
            )

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == "None":
            continue
        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        actor = tr.xpath("string(td[4])").strip()
        if actor == "H":
            actor = "lower"
        elif actor == "S":
            actor = "upper"
        attrs = self.categorizer.categorize(action)
        related_entities = []
        for item in attrs["committees"]:
            related_entities.append({"type": "committee", "name": item})
        for item in attrs["legislators"]:
            related_entities.append({"type": "legislator", "name": item})
        bill.add_action(
            description=action,
            date=date.strftime("%Y-%m-%d"),
            chamber=actor,
            classification=attrs["classification"],
            related_entities=related_entities,
        )

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    # Keep track of already seen versions to prevent processing duplicates.
    # FIX: use a set for O(1) membership instead of the old O(n) list scan.
    version_urls = set()
    for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        version_url = link.attrib["href"]
        if version_url in version_urls:
            self.warning("Skipping duplicate version URL.")
            continue
        version_urls.add(version_url)

        name = link.text.strip()
        if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url, re.IGNORECASE):
            # Committee reports / CCRs are documents, not bill text.
            bill.add_document_link(
                note=name, url=version_url, media_type="application/pdf"
            )
            continue
        bill.add_version_link(
            note=name, url=version_url, media_type="application/pdf"
        )

    self.scrape_amendments(bill, page)

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        if "HT_" not in link.attrib["href"]:
            yield from self.scrape_votes(
                bill, self.urlescape(link.attrib["href"])
            )

    # If the bill has no actions and no versions, it's a bogus bill on
    # their website, which appears to happen occasionally. Skip.
    has_no_title = bill.title == "Short Title Not Found."
    if has_no_title:
        # If there's no title, this is an empty page. Skip!
        return
    else:
        # Otherwise, save the bills.
        yield bill
def scrape_bill_page(self, chamber, session, bill_url, bill_abbreviation):
    """Scrape one Louisiana bill page: sponsors, docs, versions, actions.

    Sub-pages (Authors/Digests/Text/Amendments) are fetched through the
    ``scrape_bare_page`` helper; vote details are delegated to
    ``scrape_votes``.
    """
    page = self.lxmlize(bill_url)
    author = self.get_one_xpath(
        page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()"
    )

    # Follow the named link on the bill page to its listing sub-page.
    def sbp(x):
        return self.scrape_bare_page(
            page.xpath("//a[contains(text(), '%s')]" % (x))[0].attrib["href"]
        )

    authors = [x.text for x in sbp("Authors")]
    try:
        digests = sbp("Digests")
    except IndexError:
        digests = []
    try:
        versions = sbp("Text")
    except IndexError:
        versions = []
    try:
        amendments = sbp("Amendments")
    except IndexError:
        amendments = []

    title = page.xpath(
        "//span[@id='ctl00_PageBody_LabelShortTitle']/text()"
    )[0]
    title = title.replace("\u00a0\u00a0", " ")
    actions = page.xpath(
        "//div[@id='ctl00_PageBody_PanelBillInfo']/"
        "/table[@style='font-size:small']/tr"
    )

    bill_id = page.xpath("//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]
    bill_type = self._bill_types[bill_abbreviation[1:]]
    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(bill_url)

    # The primary author appears in the Authors list too; remove the
    # duplicate before adding cosponsors.
    authors.remove(author)
    bill.add_sponsorship(
        author, classification="primary", entity_type="person", primary=True
    )
    for coauthor in authors:
        bill.add_sponsorship(
            coauthor, classification="cosponsor", entity_type="person",
            primary=False
        )

    for digest in digests:
        bill.add_document_link(
            note=digest.text,
            url=digest.attrib["href"],
            media_type="application/pdf",
        )

    for version in versions:
        bill.add_version_link(
            note=version.text,
            url=version.attrib["href"],
            media_type="application/pdf",
        )

    for amendment in amendments:
        if "href" in amendment.attrib:
            bill.add_version_link(
                note=amendment.text,
                url=amendment.attrib["href"],
                media_type="application/pdf",
            )

    # Substring -> action classification(s).
    flags = {
        "prefiled": ["filing"],
        "referred to the committee": ["referral-committee"],
        "sent to the house": ["passage"],
        "ordered returned to the house": ["passage"],
        "ordered to the senate": ["passage"],
        "signed by the governor": ["executive-signature"],
        "sent to the governor": ["executive-receipt"],
    }

    try:
        votes_link = page.xpath("//a[text() = 'Votes']")[0]
        yield from self.scrape_votes(bill, votes_link.attrib["href"])
    except IndexError:
        # Some bills don't have any votes
        pass

    # FIX: hoisted out of the loop — this is invariant per scrape.
    # Session is April -> June. Prefiles look like they're in
    # January at earliest.
    session_year = self.jurisdiction.legislative_sessions[-1]["start_date"][0:4]

    for action_row in actions:
        # FIX: renamed loop locals; the old code unpacked into `chamber`
        # and `page`, shadowing the method parameter and the parsed page.
        date, act_chamber, _journal_page, text = [
            x.text for x in action_row.xpath(".//td")
        ]
        # Action dates on the page lack a year; append the session's year.
        date += "/{}".format(session_year)
        date = dt.datetime.strptime(date, "%m/%d/%Y")
        act_chamber = self._chambers[act_chamber]

        cat = []
        for flag in flags:
            if flag in text.lower():
                cat += flags[flag]

        bill.add_action(
            description=text,
            date=date.strftime("%Y-%m-%d"),
            chamber=act_chamber,
            classification=cat,
        )

    yield bill
def scrape_bill(self, session, history_url):
    """Scrape one Texas bill from its history XML on the FTP mirror.

    Pulls subjects, pre-indexed versions/analyses/fiscal-notes/witness
    lists (from ``self.versions`` etc.), classifies each history action
    with a hard-coded matcher chain, and adds sponsorships from the
    pipe-delimited author/sponsor fields. Yields the ``Bill``.
    """
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    # Second character (or chars 2-3) of the id encodes the bill type.
    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )
    bill.add_source(history_url)

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    # Versions/analyses/etc. were indexed elsewhere as (bill_id, url)
    # pairs; url[-5] selects the version-letter slug before ".htm".
    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version_link(
            note=self.NAME_SLUGS[version[1][-5]],
            url=version[1],
            media_type="text/html",
        )

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document_link(
            note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            media_type="text/html",
        )

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document_link(
            note="Fiscal Note ({})".format(
                self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            media_type="text/html",
        )

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(
                self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type="text/html",
        )

    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(action.findtext("date"),
                                              "%m/%d/%Y").date()

        action_number = action.find("actionNumber").text
        # First char of the action number encodes the acting chamber.
        actor = {
            "H": "lower",
            "S": "upper",
            "E": "executive"
        }[action_number[0]]

        desc = action.findtext("description").strip()

        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        # Classify the action description; order of checks matters
        # (more specific prefixes are tested before general ones).
        introduced = False

        if desc == "Amended":
            atype = "amendment-passage"
        elif desc == "Amendment(s) offered":
            atype = "amendment-introduction"
        elif desc == "Amendment amended":
            atype = "amendment-amendment"
        elif desc == "Amendment withdrawn":
            atype = "amendment-withdrawal"
        elif desc == "Passed" or desc == "Adopted":
            atype = "passage"
        elif re.match(r"^Received (by|from) the", desc):
            if "Secretary of the Senate" not in desc:
                atype = "introduction"
            else:
                atype = "filing"
        elif desc.startswith("Sent to the Governor"):
            # But what if it gets lost in the mail?
            atype = "executive-receipt"
        elif desc.startswith("Signed by the Governor"):
            atype = "executive-signature"
        elif desc.startswith("Effective on"):
            atype = "became-law"
        elif desc == "Vetoed by the Governor":
            atype = "executive-veto"
        elif desc == "Read first time":
            atype = ["introduction", "reading-1"]
            introduced = True
        elif desc == "Read & adopted":
            atype = ["passage"]
            if not introduced:
                introduced = True
                atype.append("introduction")
        elif desc == "Passed as amended":
            atype = "passage"
        elif desc.startswith("Referred to") or desc.startswith(
                "Recommended to be sent to "):
            atype = "referral-committee"
        elif desc == "Reported favorably w/o amendment(s)":
            atype = "committee-passage"
        elif desc == "Filed":
            atype = "filing"
        elif desc == "Read 3rd time":
            atype = "reading-3"
        elif desc == "Read 2nd time":
            atype = "reading-2"
        elif desc.startswith("Reported favorably"):
            atype = "committee-passage-favorable"
        else:
            atype = None

        act = bill.add_action(
            action.findtext("description"),
            act_date,
            chamber=actor,
            classification=atype,
        )

        if atype and "referral-committee" in atype:
            # Strip the referral prefix to recover the committee name.
            repls = ["Referred to", "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    # Sponsor fields are pipe-delimited lists of names.
    for author in root.findtext("authors").split(" | "):
        if author != "":
            bill.add_sponsorship(author, classification="primary",
                                 entity_type="person", primary=True)
    for coauthor in root.findtext("coauthors").split(" | "):
        if coauthor != "":
            bill.add_sponsorship(
                coauthor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
    for sponsor in root.findtext("sponsors").split(" | "):
        if sponsor != "":
            bill.add_sponsorship(
                sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )
    for cosponsor in root.findtext("cosponsors").split(" | "):
        if cosponsor != "":
            bill.add_sponsorship(
                cosponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape_bill(self, session, bill_url):
    """Scrape one Tennessee bill page, including its companion data.

    Handles the companion-bill swap (the starred id is the primary),
    subjects derived from the title, sponsors for both chambers,
    versions/documents/amendments, and actions from both action tables.
    Yields vote events (via ``scrape_vote_events``) and then the bill.
    """
    page = self.get(bill_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(bill_url)

    try:
        bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
    except IndexError:
        self.logger.warning("Something is wrong with bill page, skipping.")
        return
    secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

    # checking if there is a matching bill
    if secondary_bill_id:
        secondary_bill_id = secondary_bill_id[0].text
        # swap ids if * is in secondary_bill_id (the starred id is primary)
        if "*" in secondary_bill_id:
            bill_id, secondary_bill_id = secondary_bill_id, bill_id
        secondary_bill_id = secondary_bill_id.strip()
        # FIX: collapse double spaces (was a no-op single-space replace)
        secondary_bill_id = secondary_bill_id.replace("  ", " ")

    # FIX: collapse double spaces (was a no-op single-space replace)
    bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

    if "B" in bill_id:
        bill_type = "bill"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        # FIX: previously bill_type was left unbound (NameError) for ids
        # matching none of the patterns; default to a plain bill.
        bill_type = "bill"

    primary_chamber = "lower" if "H" in bill_id else "upper"
    # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

    title = page.xpath("//span[@id='lblAbstract']")[0].text
    if title is None:
        msg = "%s detail page was missing title info."
        self.logger.warning(msg % bill_id)
        return

    # bill subject: comma-separated list preceding the "-" in the title
    subject_pos = title.find("-")
    subjects = [s.strip() for s in title[:subject_pos - 1].split(",")]
    subjects = filter(None, subjects)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=primary_chamber,
        title=title,
        classification=bill_type,
    )
    for subject in subjects:
        bill.add_subject(subject)

    if secondary_bill_id:
        bill.add_identifier(secondary_bill_id)

    if page.xpath('//span[@id="lblCompNumber"]/a'):
        companion_id = (
            page.xpath('//span[@id="lblCompNumber"]/a')[0].text_content().strip()
        )
        bill.add_related_bill(
            identifier=companion_id,
            legislative_session=session,
            relation_type="companion",
        )

    bill.add_source(bill_url)

    # Primary Sponsor
    sponsor = (
        page.xpath("//span[@id='lblBillPrimeSponsor']")[0]
        .text_content()
        .split("by")[-1]
    )
    sponsor = sponsor.replace("*", "").strip()
    if sponsor:
        bill.add_sponsorship(
            sponsor, classification="primary", entity_type="person", primary=True
        )

    # bill text
    btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
    bill.add_version_link(
        "Current Version", btext.get("href"), media_type="application/pdf"
    )

    # documents
    summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
    if summary:
        bill.add_document_link("Summary", summary[0].get("href"))
    fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
    if fiscal:
        bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
    amendments = page.xpath('//a[contains(@href, "/Amend/")]')
    for amendment in amendments:
        bill.add_version_link(
            "Amendment " + amendment.text,
            amendment.get("href"),
            media_type="application/pdf",
        )
    # amendment notes in image with alt text describing doc inside <a>
    amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
    for afn in amend_fns:
        bill.add_document_link(
            afn.get("alt"), afn.getparent().get("href"), on_duplicate="ignore"
        )

    # actions
    atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
    actions_from_table(bill, atable)

    # if there is a matching bill
    if secondary_bill_id:
        # secondary sponsor
        secondary_sponsor = (
            page.xpath("//span[@id='lblCompPrimeSponsor']")[0]
            .text_content()
            .split("by")[-1]
        )
        secondary_sponsor = (
            secondary_sponsor.replace("*", "").replace(")", "").strip()
        )
        # Skip blank-name sponsors.
        if secondary_sponsor:
            bill.add_sponsorship(
                secondary_sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        # secondary actions
        if page.xpath("//table[@id='gvCoActionHistory']"):
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

    # votes
    yield from self.scrape_vote_events(bill, page, bill_url)

    bill.actions.sort(key=lambda a: a["date"])
    yield bill
def scrape_bill(self, session, history_url):
    """Scrape one Texas bill from its history XML (version-XML variant).

    Unlike the older scraper, versions/analyses/fiscal notes are read
    directly from the ``billtext/docTypes`` subtree of the history XML,
    and actions are classified by ``_categorize_action``. Yields the
    ``Bill``.
    """
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    # Second character (or chars 2-3) of the id encodes the bill type.
    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    bill.add_source(history_url)
    # Also link the human-readable history page on capitol.texas.gov.
    bill_id_for_url = bill_id.replace(" ", "")
    bill.add_source(
        f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
    )

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    # Each version gets both its HTML and PDF link under the same note.
    for version in root.iterfind(
            "billtext/docTypes/bill/versions/version"):
        if not version:
            continue
        note = version.find("versionDescription").text
        html_url = version.find("WebHTMLURL").text
        bill.add_version_link(note=note, url=html_url,
                              media_type="text/html")
        pdf_url = version.find("WebPDFURL").text
        bill.add_version_link(note=note, url=pdf_url,
                              media_type="application/pdf")

    for analysis in root.iterfind(
            "billtext/docTypes/analysis/versions/version"):
        if not analysis:
            continue
        description = analysis.find("versionDescription").text
        html_url = analysis.find("WebHTMLURL").text
        bill.add_document_link(
            note="Analysis ({})".format(description),
            url=html_url,
            media_type="text/html",
        )

    for fiscal_note in root.iterfind(
            "billtext/docTypes/fiscalNote/versions/version"):
        if not fiscal_note:
            continue
        description = fiscal_note.find("versionDescription").text
        html_url = fiscal_note.find("WebHTMLURL").text
        bill.add_document_link(
            note="Fiscal Note ({})".format(description),
            url=html_url,
            media_type="text/html",
        )

    # Witness lists are indexed elsewhere as (bill_id, url) pairs;
    # url[-5] selects the version-letter slug before ".htm".
    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(
                self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type="text/html",
        )

    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(action.findtext("date"),
                                              "%m/%d/%Y").date()

        action_number = action.find("actionNumber").text
        # First char of the action number encodes the acting chamber.
        actor = {
            "H": "lower",
            "S": "upper",
            "E": "executive"
        }[action_number[0]]

        desc = action.findtext("description").strip()

        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        atype = _categorize_action(desc)

        act = bill.add_action(
            action.findtext("description"),
            act_date,
            chamber=actor,
            classification=atype,
        )

        if atype and "referral-committee" in atype:
            # Strip the referral prefix to recover the committee name.
            repls = ["Referred to", "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    # Sponsor fields are pipe-delimited lists of names.
    for author in root.findtext("authors").split(" | "):
        if author != "":
            bill.add_sponsorship(author, classification="primary",
                                 entity_type="person", primary=True)
    for coauthor in root.findtext("coauthors").split(" | "):
        if coauthor != "":
            bill.add_sponsorship(
                coauthor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
    for sponsor in root.findtext("sponsors").split(" | "):
        if sponsor != "":
            bill.add_sponsorship(
                sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )
    for cosponsor in root.findtext("cosponsors").split(" | "):
        if cosponsor != "":
            bill.add_sponsorship(
                cosponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.

    Yields vote events (via parse_vote) as they are encountered in the
    action table, then the Bill itself.
    """
    url = BILL_URL % (session, bill_id.replace(" ", ""))
    bill_page = self.get(url).text
    html = lxml.html.fromstring(bill_page)
    html.make_links_absolute(
        "https://legislature.idaho.gov/legislation/%s/" % session)
    # bill-table[0] = sponsors, [1] = title, [2] = action history.
    bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
    title = bill_tables[1].text_content().strip()
    bill_type = get_bill_type(bill_id)
    bill = Bill(
        legislative_session=session,
        chamber=chamber,
        identifier=bill_id,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)
    for subject in self._subjects[bill_id.replace(" ", "")]:
        bill.add_subject(subject)

    # Record the short title only when it adds information.
    if short_title and title.lower() != short_title.lower():
        bill.add_title(short_title, "short title")

    # documents
    doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
    for link in doc_links:
        name = link.text_content().strip()
        href = link.get("href")
        # Bill text and amendments are versions; everything else is a
        # supporting document.
        if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
            bill.add_version_link(note=name,
                                  url=href,
                                  media_type="application/pdf")
        else:
            bill.add_document_link(note=name,
                                   url=href,
                                   media_type="application/pdf")

    def _split(string):
        # Split a sponsor blob on separators.
        # NOTE(review): the class [,|AND] matches any single one of
        # ',', '|', 'A', 'N', 'D' — a group (?:,|AND) may have been
        # intended; confirm against real sponsor strings.
        return re.split(r"\w+[,|AND]\s+", string)

    # sponsors range from a committee to one legislator to a group of legs
    sponsor_lists = bill_tables[0].text_content().split("by")
    if len(sponsor_lists) > 1:
        for sponsors in sponsor_lists[1:]:
            if "COMMITTEE" in sponsors.upper():
                bill.add_sponsorship(
                    name=sponsors.strip(),
                    entity_type="organization",
                    primary=True,
                    classification="primary",
                )
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsorship(
                            classification="primary",
                            name=person,
                            entity_type="person",
                            primary=True,
                        )

    actor = chamber
    last_date = None  # actions without a date inherit the previous row's
    # if a bill has passed a chamber or been 'received from'
    # then the next committee passage is in the opposite chamber
    has_moved_chambers = False
    for row in bill_tables[2]:
        # lots of empty rows
        if len(row) == 1:
            continue
        _, date, action, _ = [x.text_content().strip() for x in row]

        if date:
            last_date = date
        else:
            date = last_date
        # Table dates are month/day only; append the session's year.
        date = datetime.datetime.strptime(date + "/" + session[0:4],
                                          "%m/%d/%Y").strftime("%Y-%m-%d")
        if action.startswith("House"):
            actor = "lower"
        elif action.startswith("Senate"):
            actor = "upper"

        # votes
        if "AYES" in action or "NAYS" in action:
            yield from self.parse_vote(actor, date, row[2], session,
                                       bill_id, chamber, url)
            # bill.add_vote_event(vote)
        # some td's text is seperated by br elements
        if len(row[2]):
            action = "".join(row[2].itertext())
        action = action.replace("\xa0", " ").strip()
        atype = get_action(actor, action)
        if atype and "passage" in atype:
            has_moved_chambers = True
        # Once the bill has moved, committee passages belong to the
        # opposite chamber from the current actor.
        if atype and "committee-passage" in atype and has_moved_chambers:
            actor = _OTHER_CHAMBERS[actor]
        bill.add_action(action, date, chamber=actor, classification=atype)
        # after voice vote/roll call and some actions the bill is sent
        # 'to House' or 'to Senate'
        if "to House" in action:
            actor = "lower"
        elif "to Senate" in action:
            actor = "upper"
    yield bill
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape one Kentucky bill page and yield the Bill (plus votes).

    Yields nothing when the page is unreachable, the bill was
    withdrawn (per its last action), or no versions remain.
    """
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        # Missing/unreachable page: log and skip this bill.
        self.logger.warning(e)
        return

    # Skip bills whose most recent action withdrew them.
    last_action_field = self.parse_bill_field(page, "Last Action")
    if last_action_field != "":
        last_action = last_action_field.xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

    title = self.parse_bill_field(page, "Title").text_content()

    # Classify from the identifier; longer markers are checked first so
    # "CR"/"JR" ids are not misread as plain resolutions.
    bill_type = "bill"
    for marker, classification in (
        ("CR", "concurrent resolution"),
        ("JR", "joint resolution"),
        ("R", "resolution"),
    ):
        if marker in bill_id:
            bill_type = classification
            break

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    # Zero versions means the bill was withdrawn.
    if self.parse_versions(page, bill) < 1:
        self.logger.warning("Bill withdrawn.")
        return

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    self.parse_proposed_amendments(page, bill)

    # LM is "Locally Mandated fiscal impact"
    for note_link in page.xpath('//a[contains(@href, "/LM.pdf")]'):
        note_url = note_link.attrib["href"]
        bill.add_document_link(
            "Fiscal Note", note_url, media_type=get_media_type(note_url))

    # Every legislator-profile link in the sponsor cells is a primary
    # sponsor.
    for profile_link in page.xpath(
            "//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(
            profile_link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    # Follow the vote-history page when the bill has recorded votes.
    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath(
            "//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    # Record the Bill Request (BDR) number when present.
    bdr_field = self.parse_bill_field(page, "Bill Request Number")
    if bdr_field != "" and bdr_field.xpath("text()"):
        bill.extras["BDR"] = bdr_field.xpath("text()")[0].strip()

    yield bill
def scrape_bill(self, chamber, session, session_id, bill_id, url):
    """Scrape one Iowa bill: versions from the sidebar page at *url*,
    plus title, sponsors, actions, amendments, and citations from the
    separate billHistory page. Yields the completed Bill.
    """
    sidebar = lxml.html.fromstring(self.get(url).text)
    sidebar.make_links_absolute("https://www.legis.iowa.gov")

    hist_url = (f"https://www.legis.iowa.gov/legislation/billTracking/"
                f"billHistory?billName={bill_id}&ga={session_id}")
    # NOTE(review): req_session is not used for this request (a bare
    # requests.get follows); it is only handed to scrape_subjects at
    # the end — confirm whether the GET was meant to use it.
    req_session = requests.Session()
    req = requests.get(hist_url)
    if req.status_code == 500:
        self.warning("500 error on {}, skipping".format(hist_url))
        return

    page = lxml.html.fromstring(req.text)
    page.make_links_absolute("https://www.legis.iowa.gov")

    title = page.xpath('string(//div[@id="content"]/div[@class='
                       '"divideVert"]/div/div[4]/div[2])').strip()

    if title == "":
        # Sometimes the title is moved, see
        # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
        title = page.xpath('string(//div[@id="content"]/div[@class='
                           '"divideVert"]/div[4]/div[2])').strip()
        if title == "":
            self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
            return

    # Titles that start with "in" are truncated; pull the full text
    # from the fallback table location instead.
    if title.lower().startswith("in"):
        title = page.xpath("string(//table[2]/tr[3])").strip()

    # Classify by identifier substrings; plain bills are the default.
    if "HR" in bill_id or "SR" in bill_id:
        bill_type = ["resolution"]
    elif "HJR" in bill_id or "SJR" in bill_id:
        bill_type = ["joint resolution"]
    elif "HCR" in bill_id or "SCR" in bill_id:
        bill_type = ["concurrent resolution"]
    else:
        bill_type = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )

    bill.add_source(hist_url)

    # base url for text version (version_abbrev, session_id, bill_id)
    version_html_url_template = ("https://www.legis.iowa.gov/docs/"
                                 "publications/LG{}/{}/attachments/{}.html")
    version_pdf_url_template = ("https://www.legis.iowa.gov/docs/"
                                "publications/LG{}/{}/{}.pdf")

    # get pieces of version_link
    vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
    if vpieces:
        for version in vpieces:
            version_name = version.text
            version_abbrev = version.xpath("string(@value)")

            # Get HTML document of bill version.
            version_html_url = version_html_url_template.format(
                version_abbrev.upper(), session_id,
                bill_id.replace(" ", ""))

            bill.add_version_link(note=version_name,
                                  url=version_html_url,
                                  media_type="text/html")

            # Get PDF document of bill version.
            version_pdf_url = version_pdf_url_template.format(
                version_abbrev.upper(), session_id,
                bill_id.replace(" ", ""))

            # "Marked Up" versions live behind the embedded viewer, so
            # take the iframe's src instead of the templated URL.
            if "Marked Up" in version_name:
                version_pdf_url = sidebar.xpath(
                    "//iframe[@id='bbContextDoc']/@src")[0]

            bill.add_version_link(note=version_name,
                                  url=version_pdf_url,
                                  media_type="application/pdf")

    sponsors_str = page.xpath('string(//div[@id="content"]/div[@class='
                              '"divideVert"]/div/div[4]/div[1])').strip()

    if re.search("^By ", sponsors_str):
        sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
    # for some bills sponsors listed in different format
    else:
        sponsors = re.findall(r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)",
                              sponsors_str)

    for sponsor in sponsors:
        sponsor = sponsor.replace(" and", "").strip(" .,")

        # a few sponsors get mangled by our regex
        sponsor = {
            "Means": "Ways & Means",
            "Iowa": "Economic Growth/Rebuild Iowa",
            "Safety": "Public Safety",
            "Resources": "Human Resources",
            "Affairs": "Veterans Affairs",
            "Protection": "Environmental Protection",
            "Government": "State Government",
            "Boef": "De Boef",
        }.get(sponsor, sponsor)

        if sponsor[0].islower():
            # SSBs catch cruft in it ('charges', 'overpayments')
            # https://sunlight.atlassian.net/browse/DATA-286
            continue

        bill.add_sponsorship(
            name=sponsor,
            classification="primary",
            entity_type="person",
            primary=True,
        )

    for tr in page.xpath(
            "//table[contains(@class, 'billActionTable')][1]/tbody/tr"):
        # The date cell is the one containing a ", 20xx" year.
        date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
        if date.startswith("***"):
            continue
        # NOTE(review): this returns (ending the generator without
        # yielding the bill) rather than continue/break — confirm
        # dropping the bill entirely is intended.
        elif "No history is recorded at this time." in date:
            return
        if date == "":
            # Dateless rows carry document links only (signed
            # enrollments and IA Acts chapter citations).
            for anchor in tr.xpath(".//a"):
                link_text = anchor.text_content()
                link_url = anchor.xpath("@href")[0]
                if "signed" in link_text.lower():
                    bill.add_version_link(note=link_text,
                                          url=link_url,
                                          media_type="application/pdf")
                elif "acts" in link_text.lower():
                    bill.add_document_link(note=link_text,
                                           url=link_url,
                                           media_type="application/pdf")
                    bill.add_citation(
                        f"IA Acts, {session}",
                        link_text.replace("Acts", ""),
                        citation_type="chapter",
                        url=link_url,
                    )
            continue

        date = datetime.datetime.strptime(date, "%B %d, %Y").date()

        action = tr.xpath("string(td[3])").strip()
        action = re.sub(r"\s+", " ", action)

        # Capture any amendment links.
        # Collect URLs already attached so amendments are not added twice.
        links = [
            link for link in [version["links"] for version in bill.versions]
        ]
        version_urls = [
            link["url"] for link in [i for sub in links for i in sub]
        ]
        if "amendment" in action.lower():
            for anchor in tr.xpath(".//a[1]"):
                if "-" in anchor.text:
                    # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                    amd_pattern = "https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf"
                    amd_id = anchor.text.replace("-", "").strip()
                    amd_url = amd_pattern.format(session_id, amd_id)
                    amd_name = "Amendment {}".format(anchor.text.strip())
                    if amd_url not in version_urls:
                        bill.add_version_link(note=amd_name,
                                              url=amd_url,
                                              media_type="application/pdf")
                        version_urls.append(amd_url)
                    else:
                        self.info(
                            "Already Added {}, skipping".format(amd_url))
        else:
            for anchor in tr.xpath(".//a"):
                link_text = anchor.text_content()
                link_url = anchor.xpath("@href")[0]
                action_date = date.strftime("%m/%d/%Y")
                if "fiscal" in link_text.lower(
                ) or "summary" in link_text.lower():
                    # there can be multiple fiscal notes or summaries, so date them
                    doc_title = f"{link_text} {action_date}"
                    bill.add_document_link(note=doc_title,
                                           url=link_url,
                                           media_type="application/pdf")
                elif "signed" in link_text.lower():
                    bill.add_version_link(note=link_text,
                                          url=link_url,
                                          media_type="application/pdf")
                elif "acts" in link_text.lower():
                    bill.add_document_link(note=link_text,
                                           url=link_url,
                                           media_type="application/pdf")
                    bill.add_citation(
                        f"IA Acts, {session}",
                        link_text.replace("Acts", ""),
                        citation_type="chapter",
                        url=link_url,
                    )

        # Journal references identify the acting chamber.
        if "S.J." in action or "SCS" in action:
            actor = "upper"
        elif "H.J." in action or "HCS" in action:
            actor = "lower"
        else:
            actor = "legislature"

        # Strip trailing journal citations like "S.J. 123." from text.
        action = re.sub(r"(H|S)\.J\.\s+\d+\.$", "", action).strip()

        # Map the action text to openstates classifications; elif order
        # matters (e.g. "Reported Signed" before "Signed").
        if action.startswith("Introduced"):
            atype = ["introduction"]
            if ", referred to" in action:
                atype.append("referral-committee")
        elif action.startswith("Read first time"):
            atype = "reading-1"
        elif action.startswith("Referred to"):
            atype = "referral-committee"
        elif action.startswith("Sent to Governor"):
            atype = "executive-receipt"
        elif action.startswith("Reported Signed by Governor"):
            atype = "executive-signature"
        elif action.startswith("Signed by Governor"):
            atype = "executive-signature"
        elif action.startswith("Vetoed by Governor"):
            atype = "executive-veto"
        elif action.startswith("Item veto"):
            atype = "executive-veto-line-item"
        elif re.match(r"Passed (House|Senate)", action):
            atype = "passage"
        elif re.match(r"Amendment (S|H)-\d+ filed", action):
            atype = ["amendment-introduction"]
            if ", adopted" in action:
                atype.append("amendment-passage")
        elif re.match(r"Amendment (S|H)-\d+( as amended,)? adopted",
                      action):
            atype = "amendment-passage"
        # NOTE(review): "(S|N)" below looks like a typo for "(S|H)";
        # confirm against real "Amendment H-#### lost" actions.
        elif re.match(r"Amendment (S|N)-\d+ lost", action):
            atype = "amendment-failure"
        elif action.startswith("Resolution filed"):
            atype = "introduction"
        elif action.startswith("Resolution adopted"):
            atype = "passage"
        elif action.startswith("Committee report") and action.endswith(
                "passage."):
            atype = "committee-passage"
        elif action.startswith("Withdrawn"):
            atype = "withdrawal"
        else:
            atype = None

        if action.strip() == "":
            continue

        if re.search(r"END OF \d+ ACTIONS", action):
            continue

        # "$history" rows are site template artifacts, not actions.
        if "$history" not in action:
            bill.add_action(description=action,
                            date=date,
                            chamber=actor,
                            classification=atype)

    self.scrape_subjects(bill, bill_id, session, req_session)

    yield bill