def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape one Hawaii bill detail page; yield its actions, then the Bill.

    :param session: legislative session string, e.g. "2021 Regular Session"
                    (first four chars must be the year)
    :param chamber: "upper" or "lower"
    :param bill_type: openstates bill classification (e.g. "bill")
    :param url: bill detail page URL; the bill id is read from its query string
    """
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)

    # The bill id lives in the URL's query string, e.g.
    # ?billtype=HB&billnumber=123 -> "HB123".
    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])

    versions = bill_page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = bill_page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    # "Report Title" is a semicolon-delimited subject list.  Filter out every
    # empty entry -- the previous code only removed the first one.
    subs = [s.strip() for s in meta["Report Title"].split(";") if s.strip()]

    b = Bill(
        bill_id,
        session,
        meta["Measure Title"],
        chamber=chamber,
        classification=bill_type,
    )
    if meta["Description"]:
        b.add_abstract(meta["Description"], "description")
    for subject in subs:
        b.add_subject(subject)
    if url:
        b.add_source(url)

    # Companion / carried-over references always point at the previous
    # year's regular session.
    prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
    companion = meta["Companion"].strip()
    if companion:
        b.add_related_bill(
            identifier=companion.replace(u"\xa0", " "),
            legislative_session=prior_session,
            relation_type="companion",
        )

    # Evaluate the status-table XPath once (it used to be run twice).
    status_text = bill_page.xpath(
        "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
    )
    if status_text:
        prior = status_text[-1]
        if "carried over" in prior.lower():
            # Carried over from the prior session: relate the bill to its
            # prior-session incarnation under the same id.
            b.add_related_bill(
                identifier=bill_id.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )

    for sponsor in meta["Introducer(s)"]:
        # Strip the boilerplate suffix some sponsor names carry.
        if "(Introduced by request of another party)" in sponsor:
            sponsor = sponsor.replace(
                " (Introduced by request of another party)", "")
        b.add_sponsorship(sponsor, "primary", "person", True)

    self.parse_bill_versions_table(b, versions)
    self.parse_testimony(b, bill_page)
    self.parse_cmte_reports(b, bill_page)

    yield from self.parse_bill_actions_table(
        b, action_table, bill_id, session, url, chamber)
    yield b
def _scrape_bill(self, session, bill_data):
    """Build a Bill (plus its VoteEvents) from a NY Open Legislation payload.

    :param session: legislative session, e.g. "2021-2022"
    :param bill_data: decoded JSON "bill" resource from the OpenLeg API
    :yields: VoteEvent objects first, then the finished Bill
    """
    details = self._parse_bill_details(bill_data)
    if details is None:
        return

    (
        senate_url,
        assembly_url,
        bill_chamber,
        bill_type,
        bill_id,
        title,
        (prefix, number, active_version),
    ) = details

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=bill_chamber,
        title=title or bill_data["summary"],
        classification=bill_type,
    )
    if bill_data["summary"]:
        bill.add_abstract(bill_data["summary"], note="")

    bill_active_version = None
    if active_version != "":
        bill_active_version = bill_data["amendments"]["items"][active_version]
    else:
        self.warning("No active version for {}".format(bill_id))

    # Parse sponsors.
    if bill_data["sponsor"] is not None:
        if bill_data["sponsor"]["rules"] is True:
            # Introduced by the Rules Committee rather than a member.
            bill.add_sponsorship(
                "Rules Committee",
                entity_type="organization",
                classification="primary",
                primary=True,
            )
        elif not bill_data["sponsor"]["budget"]:
            primary_sponsor = bill_data["sponsor"]["member"]
            bill.add_sponsorship(
                primary_sponsor["shortName"],
                entity_type="person",
                classification="primary",
                primary=True,
            )

            if bill_active_version:
                # There *shouldn't* be cosponsors if there is no sponsor.
                cosponsors = bill_active_version["coSponsors"]["items"]
                for cosponsor in cosponsors:
                    bill.add_sponsorship(
                        cosponsor["shortName"],
                        entity_type="person",
                        classification="cosponsor",
                        primary=False,
                    )

    if bill_active_version:
        # List companion bill.
        same_as = bill_active_version.get("sameAs", {})
        # BUGFIX: must use .get() here as well -- when "sameAs" was absent
        # the default {} made same_as["items"] raise KeyError.
        if same_as.get("items"):
            # Get companion bill ID.
            companion_bill_id = same_as["items"][0]["basePrintNo"]

            # Build companion bill session, e.g. 2021 -> "2021-2022".
            start_year = same_as["items"][0]["session"]
            end_year = start_year + 1
            companion_bill_session = "-".join([str(start_year), str(end_year)])

            # Attach companion bill data.
            bill.add_related_bill(
                companion_bill_id, companion_bill_session,
                relation_type="companion"
            )

    # Parse actions.
    chamber_map = {"senate": "upper", "assembly": "lower"}
    for action in bill_data["actions"]["items"]:
        chamber = chamber_map[action["chamber"].lower()]
        action_datetime = datetime.datetime.strptime(action["date"], "%Y-%m-%d")
        action_date = action_datetime.date()
        types, _ = NYBillScraper.categorizer.categorize(action["text"])
        bill.add_action(
            action["text"],
            action_date.strftime("%Y-%m-%d"),
            chamber=chamber,
            classification=types,
        )

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    api_url = self.api_client.root + self.api_client.resources["bill"].format(
        session_year=session, bill_id=bill_id, summary="", detail=""
    )
    bill.add_source(api_url)
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    # Chamber-specific processing.
    for vote_data in bill_data["votes"]["items"]:
        yield self._parse_senate_votes(vote_data, bill, api_url)
    yield from self.scrape_assembly_votes(session, bill, assembly_url, bill_id)

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data["amendments"]["items"]
    for key, amendment in amendments.items():
        version = amendment["printNo"]

        html_url = (
            "http://assembly.state.ny.us/leg/?sh=printbill&bn="
            "{}&term={}&Text=Y".format(bill_id, self.term_start_year)
        )
        bill.add_version_link(
            version, html_url, on_duplicate="ignore", media_type="text/html"
        )

        pdf_url = "http://legislation.nysenate.gov/pdf/bills/{}/{}".format(
            self.term_start_year, version
        )
        bill.add_version_link(
            version, pdf_url, on_duplicate="ignore", media_type="application/pdf"
        )

    yield bill
def scrape_bill(self, session, bill_url):
    """Scrape one Tennessee bill detail page; yield its votes, then the Bill.

    :param session: legislative session identifier
    :param bill_url: bill detail page URL
    """
    page = self.get(bill_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(bill_url)

    try:
        bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
    except IndexError:
        self.logger.warning("Something is wrong with bill page, skipping.")
        return

    secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

    # checking if there is a matching bill
    if secondary_bill_id:
        secondary_bill_id = secondary_bill_id[0].text
        # swap ids if * is in secondary_bill_id (the * marks the primary bill)
        if "*" in secondary_bill_id:
            bill_id, secondary_bill_id = secondary_bill_id, bill_id
        secondary_bill_id = secondary_bill_id.strip()
        # collapse double spaces (was a no-op single-space replace)
        secondary_bill_id = secondary_bill_id.replace("  ", " ")

    bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

    if "B" in bill_id:
        bill_type = "bill"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        # TN ids should always contain B/JR/R; previously this case left
        # bill_type unbound and crashed with NameError further down.
        bill_type = "bill"

    primary_chamber = "lower" if "H" in bill_id else "upper"

    title = page.xpath("//span[@id='lblAbstract']")[0].text
    if title is None:
        msg = "%s detail page was missing title info."
        self.logger.warning(msg % bill_id)
        return

    # bill subject: comma-separated list preceding the first "-" in the title
    subject_pos = title.find("-")
    if subject_pos > 0:
        subjects = [s.strip() for s in title[:subject_pos - 1].split(",")]
    else:
        # No dash: the old slice title[:-2] invented bogus subjects here.
        subjects = []
    subjects = filter(None, subjects)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=primary_chamber,
        title=title,
        classification=bill_type,
    )
    for subject in subjects:
        bill.add_subject(subject)

    if secondary_bill_id:
        bill.add_identifier(secondary_bill_id)

    if page.xpath('//span[@id="lblCompNumber"]/a'):
        companion_id = (
            page.xpath('//span[@id="lblCompNumber"]/a')[0].text_content().strip()
        )
        bill.add_related_bill(
            identifier=companion_id,
            legislative_session=session,
            relation_type="companion",
        )

    bill.add_source(bill_url)

    # Primary Sponsor
    sponsor = (
        page.xpath("//span[@id='lblBillPrimeSponsor']")[0]
        .text_content()
        .split("by")[-1]
    )
    sponsor = sponsor.replace("*", "").strip()
    if sponsor:
        bill.add_sponsorship(
            sponsor, classification="primary", entity_type="person", primary=True
        )

    # bill text
    btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
    bill.add_version_link(
        "Current Version", btext.get("href"), media_type="application/pdf"
    )

    # documents
    summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
    if summary:
        bill.add_document_link("Summary", summary[0].get("href"))
    fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
    if fiscal:
        bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
    amendments = page.xpath('//a[contains(@href, "/Amend/")]')
    for amendment in amendments:
        bill.add_version_link(
            "Amendment " + amendment.text,
            amendment.get("href"),
            media_type="application/pdf",
        )
    # amendment notes in image with alt text describing doc inside <a>
    amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
    for afn in amend_fns:
        bill.add_document_link(
            afn.get("alt"), afn.getparent().get("href"), on_duplicate="ignore"
        )

    # actions
    atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
    actions_from_table(bill, atable)

    # if there is a matching bill
    if secondary_bill_id:
        # secondary sponsor
        secondary_sponsor = (
            page.xpath("//span[@id='lblCompPrimeSponsor']")[0]
            .text_content()
            .split("by")[-1]
        )
        secondary_sponsor = (
            secondary_sponsor.replace("*", "").replace(")", "").strip()
        )
        # Skip blank-name sponsors.
        if secondary_sponsor:
            bill.add_sponsorship(
                secondary_sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        # secondary actions
        if page.xpath("//table[@id='gvCoActionHistory']"):
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

    # votes
    yield from self.scrape_vote_events(bill, page, bill_url)

    bill.actions.sort(key=lambda a: a["date"])
    yield bill
def scrape_bills(self, session, year_abr):
    """Scrape all New Jersey bills for a session from the legislature's dumps.

    Reads the MAINBILL / BILLSPON / BILLWP / BILLHIST / BILLSUBJ CSV extracts
    plus the per-chamber vote zip files; yields VoteEvent objects and then the
    Bill objects.

    :param session: legislative session identifier
    :param year_abr: first year of the two-year session, e.g. "2020"
    """
    # Main Bill information
    main_bill_csv = self.to_csv("MAINBILL.TXT")

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        # "A..." ids are Assembly (lower); everything else is Senate (upper).
        if bill_type[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            legislative_session=session,
            classification=self._bill_types[bill_type[1:]],
        )
        if rec["IdenticalBillNumber"].strip():
            bill.add_related_bill(
                rec["IdenticalBillNumber"].split()[0],
                legislative_session=session,
                relation_type="companion",
            )

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_csv = self.to_csv("BILLSPON.TXT")
    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in sponsor database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == "P":
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsorship(
            name,
            classification=sponsor_type,
            entity_type="person",
            primary=sponsor_type == "primary",
        )

    # Documents
    bill_document_csv = self.to_csv("BILLWP.TXT")

    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in document database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        # Document is a Windows path; keep the last directory + file name.
        document = document.split("\\")
        document = document[-2] + "/" + document[-1]

        htm_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
            year_abr, document.replace(".DOC", ".HTM"))
        pdf_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
            year_abr, document.replace(".DOC", ".PDF"))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec["DocType"]]
        except KeyError:
            raise Exception("unknown doctype %s on %s" % (rec["DocType"], bill_id))
        if rec["Comment"]:
            doc_name += " " + rec["Comment"]

        # Clean links.
        if htm_url.endswith("HTMX"):
            htm_url = re.sub("X$", "", htm_url)
        if pdf_url.endswith("PDFX"):
            pdf_url = re.sub("X$", "", pdf_url)

        if rec["DocType"] in self._version_types:
            # BUGFIX: mimetype was previously carried over from an earlier
            # iteration (or unbound on the first) when the URL matched
            # neither extension; reset it every pass.
            if htm_url.lower().endswith("htm"):
                mimetype = "text/html"
            elif htm_url.lower().endswith("wpd"):
                mimetype = "application/vnd.wordperfect"
            else:
                mimetype = None
            try:
                bill.add_version_link(doc_name, htm_url, media_type=mimetype)
                bill.add_version_link(doc_name, pdf_url,
                                      media_type="application/pdf")
            except ValueError:
                self.warning(
                    "Couldn't find a document for bill {}".format(bill_id))
        else:
            bill.add_document_link(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        "A%s" % year_abr,
        "A%s" % next_year,
        "S%s" % year_abr,
        "S%s" % next_year,
        "CA%s-%s" % (year_abr, next_year),
        "CS%s-%s" % (year_abr, next_year),
    ]

    # keep votes clean globally, a few votes show up in multiple files
    votes = {}

    for filename in vote_info_list:
        # BUGFIX: interpolate the vote file name into the URL (the string
        # previously contained a literal placeholder instead of {filename}).
        s_vote_url = f"https://www.njleg.state.nj.us/votes/{filename}.zip"
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.HTTPError:
            self.warning("could not find %s" % s_vote_url)
            continue
        zippedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                vote_file = io.TextIOWrapper(
                    zippedfile.open(vfile, "r"), encoding="latin-1")
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                self.warning("No such file: %s" % (vfile))
                continue

            vdict_file = csv.DictReader(vote_file)

            if filename.startswith("A") or filename.startswith("CA"):
                chamber = "lower"
            else:
                chamber = "upper"

            # "C" prefix marks committee vote files.
            if filename.startswith("C"):
                vote_file_type = "committee"
            else:
                vote_file_type = "chamber"

            for rec in vdict_file:
                if vote_file_type == "chamber":
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                    vote_parts = (bill_id, chamber, action)
                else:
                    bill_id = "%s%s" % (rec["Bill_Type"], rec["Bill_Number"])
                    leg = rec["Name"]
                    # drop time portion
                    date = rec["Agenda_Date"].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec["BillAction"]]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec["LegislatorVote"][0:1]
                    committee = rec["Committee_House"]
                    vote_parts = (bill_id, chamber, action, committee)

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = "_".join(vote_parts).replace(" ", "_")

                if vote_id not in votes:
                    votes[vote_id] = VoteEvent(
                        start_date=TIMEZONE.localize(date),
                        chamber=chamber,
                        motion_text=action,
                        classification="passage",
                        result=None,
                        bill=bill_dict[bill_id],
                    )
                votes[vote_id].dedupe_key = vote_id
                if leg_vote == "Y":
                    votes[vote_id].vote("yes", leg)
                elif leg_vote == "N":
                    votes[vote_id].vote("no", leg)
                else:
                    votes[vote_id].vote("other", leg)

        # remove temp file
        os.remove(s_vote_zip)

    # Counts yes/no/other votes and saves overall vote
    for vote in votes.values():
        counts = collections.defaultdict(int)
        for count in vote.votes:
            counts[count["option"]] += 1
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])

        # Veto override.
        if vote.motion_text == "OVERRIDE":
            # Per the NJ leg's glossary, a veto override requires
            # 2/3ds of each chamber. 27 in the senate, 54 in the house.
            # http://www.njleg.state.nj.us/legislativepub/glossary.asp
            # NOTE(review): `"lower" in vote.bill` tests membership on a
            # Bill object, which looks suspect -- confirm the intended
            # chamber check against the Bill API.
            if "lower" in vote.bill:
                vote.result = "pass" if counts["yes"] >= 54 else "fail"
            elif "upper" in vote.bill:
                vote.result = "pass" if counts["yes"] >= 27 else "fail"
        else:
            # Regular vote.
            vote.result = "pass" if counts["yes"] > counts["no"] else "fail"

        vote.add_source("http://www.njleg.state.nj.us/downloads.asp")
        yield vote

    # Actions
    bill_action_csv = self.to_csv("BILLHIST.TXT")
    actor_map = {"A": "lower", "G": "executive", "S": "upper"}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in action database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = dateutil.parser.parse(date)
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += " " + comment
        bill.add_action(
            action,
            date=TIMEZONE.localize(date),
            classification=atype,
            chamber=actor,
        )

    # Subjects
    subject_csv = self.to_csv("BILLSUBJ.TXT")
    for rec in subject_csv:
        bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in subject database" % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.subject.append(rec["SubjectKey"])
        else:
            self.warning("invalid bill id in BillSubj: %s" % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.values():
        # add sources
        if not bill.actions and not bill.versions:
            self.warning("probable phony bill detected %s", bill.identifier)
            phony_bill_count += 1
        else:
            bill.add_source("http://www.njleg.state.nj.us/downloads.asp")
            yield bill

    if phony_bill_count:
        self.warning("%s total phony bills detected", phony_bill_count)
def test_full_bill():
    """End-to-end bill import: scrape a fully-populated bill, import it, and
    verify every related table (titles, identifiers, actions, sponsors,
    related bills, versions, documents, sources) round-trips correctly."""
    create_jurisdiction()

    # People/organizations that the bill's pseudo-ids must resolve against.
    adam = Person.objects.create(name="Adam Smith")
    house = Organization.objects.create(
        jurisdiction_id="jid", name="House", classification="lower"
    )
    Membership.objects.create(person_id=adam.id, organization_id=house.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=house,
    )

    # A prior-session bill for the related-bill link to resolve to.
    prior_bill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )

    new_bill = ScrapeBill(
        "HB 1",
        "1900",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )
    new_bill.subject = ["taxes", "axes"]
    new_bill.add_identifier("SB 9")
    new_bill.add_title("Tack & Axe Tax Act")
    new_bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    committee_action = new_bill.add_action(
        "sent to arbitrary committee", "1900-04-04", chamber="lower"
    )
    committee_action.add_related_entity(
        "arbitrary committee",
        "organization",
        _make_pseudo_id(name="Arbitrary Committee"),
    )
    new_bill.add_related_bill(
        "HB 99", legislative_session="1899", relation_type="prior-session"
    )
    new_bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    new_bill.add_sponsorship(
        "Jane Smith",
        classification="lead sponsor",
        entity_type="person",
        primary=True,
    )
    new_bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    new_bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.pdf", media_type="application/pdf"
    )
    new_bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.html", media_type="text/html"
    )
    new_bill.add_version_link(
        "Fiscal Note", "http://example.com/v/1", media_type="text/html"
    )
    new_bill.add_source("http://example.com/source")

    # import bill
    BillImporter("jid").import_data([prior_bill.as_dict(), new_bill.as_dict()])

    # get bill from db and assert it imported correctly
    imported = Bill.objects.get(identifier="HB 1")
    assert imported.from_organization.classification == "lower"
    assert imported.identifier == new_bill.identifier
    assert imported.title == new_bill.title
    assert imported.classification == new_bill.classification
    assert imported.subject == ["taxes", "axes"]
    abstract = imported.abstracts.get()
    assert abstract.note == "official"
    assert abstract.date == "1969-10-20"

    # other_title, other_identifier added
    assert imported.other_titles.get().title == "Tack & Axe Tax Act"
    assert imported.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(imported.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification="lower"
    )
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get().organization == Organization.objects.get(
        classification="committee"
    )

    # action computed fields
    assert imported.first_action_date == "1900-04-01"
    assert imported.latest_action_date == "1900-04-04"
    assert imported.latest_action_description == "sent to arbitrary committee"

    # related_bills were added
    related = imported.related_bills.get()
    assert related.identifier == "HB 99"
    # and bill got resolved
    assert related.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = imported.sponsorships.all()
    assert len(sponsorships) == 2
    resolved_person = Person.objects.get(name="Adam Smith")
    for sponsorship in sponsorships:
        if sponsorship.primary:
            # "Jane Smith" has no matching Person record, so stays unlinked.
            assert sponsorship.person is None
            assert sponsorship.organization is None
        else:
            assert sponsorship.person == resolved_person

    # versions & documents with their links
    versions = imported.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = imported.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert imported.sources.count() == 1