def scrape_bill(self, session, chamber, bill_type, url):
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)

    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])

    versions = bill_page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = bill_page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    subs = [s.strip() for s in meta["Report Title"].split(";")]
    if "" in subs:
        subs.remove("")

    b = Bill(
        bill_id,
        session,
        meta["Measure Title"],
        chamber=chamber,
        classification=bill_type,
    )
    if meta["Description"]:
        b.add_abstract(meta["Description"], "description")
    for subject in subs:
        b.add_subject(subject)
    if url:
        b.add_source(url)

    prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
    companion = meta["Companion"].strip()
    if companion:
        b.add_related_bill(
            identifier=companion.replace(u"\xa0", " "),
            legislative_session=prior_session,
            relation_type="companion",
        )

    if bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"):
        prior = bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")[-1]
        if "carried over" in prior.lower():
            b.add_related_bill(
                identifier=bill_id.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )

    for sponsor in meta["Introducer(s)"]:
        b.add_sponsorship(sponsor, "primary", "person", True)

    versions = self.parse_bill_versions_table(b, versions)
    yield from self.parse_bill_actions_table(
        b, action_table, bill_id, session, url, chamber)
    yield b
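
# --- Illustrative helper (not from the scraper above) ----------------------
# `parse_bill_metainf_table` is referenced but not shown. This is a minimal
# sketch under the assumption that the metadata table is a two-column
# label/value grid; the key names and the comma-splitting of "Introducer(s)"
# are assumptions, not the scraper's actual implementation.
def parse_bill_metainf_table(self, table):
    meta = {}
    for row in table.xpath('.//tr'):
        cells = row.xpath('./td')
        if len(cells) != 2:
            continue
        key = cells[0].text_content().strip().rstrip(':')
        meta[key] = cells[1].text_content().strip()
    # The caller iterates "Introducer(s)" as a list of names (assumed format).
    if 'Introducer(s)' in meta:
        meta['Introducer(s)'] = [n.strip() for n in
                                 meta['Introducer(s)'].split(',') if n.strip()]
    return meta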
def scrape(self):
    self.session = '2011'
    for i, page in enumerate(self.searchLegislation()):
        for legislation_summary in self.parseSearchResults(page):
            title = legislation_summary['Title'].strip()
            if title == "":
                continue

            bill = Bill(name=legislation_summary['Record #'],
                        session=self.session,
                        title=title,
                        type=[legislation_summary['Type'].lower()],
                        organization=self.jurisdiction.name)
            bill.add_source(legislation_summary['URL'])

            legislation_details = self.expandLegislationSummary(legislation_summary)

            for related_bill in legislation_details.get('Related files', []):
                bill.add_related_bill(name=related_bill,
                                      session=self.session,
                                      relation='other-session',
                                      chamber=None)

            for i, sponsor in enumerate(legislation_details.get('Sponsors', [])):
                if i == 0:
                    primary = True
                    sponsorship_type = "Primary"
                else:
                    primary = False
                    sponsorship_type = "Regular"
                bill.add_sponsor(sponsor, sponsorship_type, 'person', primary)

            for subject in legislation_details.get(u'Topics', []):
                bill.add_subject(subject)

            for attachment in legislation_details.get(u'Attachments', []):
                bill.add_version_link('PDF', attachment['url'],
                                      mimetype="application/pdf")

            yield bill
def parse_bill_status_page(self, status_url, bill_url, session, chamber):
    status_page = lxml.html.fromstring(self.get(status_url).text)
    # see 2007 HB 2... weird.
    bill_re = r'.*?/([A-Z]+)0*(\d+)\.pdf'
    bill_xpath = '//a[contains(@href, ".pdf") and contains(@href, "billpdf")]/@href'
    bill_id = re.search(bill_re, status_page.xpath(bill_xpath)[0],
                        re.IGNORECASE).groups()
    bill_id = "{0} {1}".format(bill_id[0], int(bill_id[1]))

    try:
        xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
        title = status_page.xpath(xp).pop()
    except IndexError:
        title = status_page.xpath('//tr[1]/td[2]')[0].text_content()

    # Add bill type.
    _bill_id = bill_id.lower()
    if 'b' in _bill_id:
        classification = 'bill'
    elif 'j' in _bill_id or 'jr' in _bill_id:
        classification = 'joint resolution'
    elif 'cr' in _bill_id:
        classification = 'concurrent resolution'
    elif 'r' in _bill_id:
        classification = 'resolution'
    else:
        # Fallback so `classification` is never unbound for unexpected ids.
        classification = 'bill'

    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=classification)

    self.add_actions(bill, status_page)
    votes = self.add_votes(bill, status_page, status_url)

    tabledata = self._get_tabledata(status_page)

    # Add sponsor info.
    bill.add_sponsorship(tabledata['primary sponsor:'][0],
                         classification='primary',
                         entity_type='person',
                         primary=True)

    # Various extra fields that MT provides.
    plus_fields = [
        'requester',
        ('chapter number:', 'chapter'),
        'transmittal date:',
        'drafter',
        'fiscal note probable:',
        'bill draft number:',
        'preintroduction required:',
        'by request of',
        'category:']
    for x in plus_fields:
        if isinstance(x, tuple):
            _key, key = x
        else:
            _key = key = x
        key = key.replace(' ', '_')
        try:
            val = tabledata[_key]
        except KeyError:
            continue
        if len(val) == 1:
            val = val[0]
        bill.extras[key] = val

    # Add bill subjects.
    xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
    subjects = []
    for tr in status_page.xpath(xp):
        try:
            subj = tr.xpath('td')[0].text_content()
        except IndexError:
            continue
        subjects.append(subj)
    for s in subjects:
        bill.add_subject(s)

    self.add_fiscal_notes(status_page, bill)

    return bill, list(votes)
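
# --- Illustrative helper (not from the scraper above) ----------------------
# `_get_tabledata` is referenced but not shown. A minimal sketch, assuming
# the status page carries label/value rows with lower-cased labels; the
# caller looks up 'primary sponsor:' and treats each value as a list.
def _get_tabledata(self, status_page):
    tabledata = {}
    for tr in status_page.xpath('//table//tr'):
        cells = tr.xpath('td')
        if len(cells) < 2:
            continue
        key = cells[0].text_content().strip().lower()
        values = [td.text_content().strip() for td in cells[1:]]
        tabledata.setdefault(key, []).extend(v for v in values if v)
    return tabledata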
def scrape(self):
    unreachable_urls = []

    for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()
        if not title or not leg_summary['Intro\xa0Date']:
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
            # doesn't have an intro date
            continue
        bill_type = BILL_TYPES[leg_summary['Type']]
        bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        bill.add_source(leg_summary['url'])

        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue

        for related_bill in leg_details.get('Related files', []):
            lower_title = title.lower()
            if "sundry" in lower_title or "miscellaneous" in lower_title:
                # these are omnibus
                bill.add_related_bill(identifier=related_bill['label'],
                                      legislative_session=bill.legislative_session,
                                      relation_type='replaces')
            # for now we're skipping related bills if they
            # don't contain words that make us think they're
            # in an omnibus relationship with each other

        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            if i == 0:
                primary = True
                sponsorship_type = "Primary"
            else:
                primary = False
                sponsorship_type = "Regular"

            sponsor_name = sponsor['label']

            # Does the Mayor/Clerk introduce legislation as
            # individual role holders, or as the Office of the City
            # Clerk and the Office of the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm',)):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'

            if not sponsor_name.startswith(('Misc. Transmittal',
                                            'No Sponsor',
                                            'Dept./Agency')):
                bill.add_sponsorship(sponsor_name,
                                     sponsorship_type,
                                     entity_type,
                                     primary,
                                     entity_id=_make_pseudo_id(name=sponsor_name))

        if 'Topic' in leg_details:
            for subject in leg_details[u'Topic'].split(','):
                bill.add_subject(subject)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")

        for action in self.history(leg_summary['url']):
            action_description = action['Action']
            try:
                action_date = self.toTime(action['Date']).date().isoformat()
            except AttributeError:
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue

            if action_description:
                try:
                    responsible_org = action['Action\xa0By']['label']
                except TypeError:
                    responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'Chicago City Council'

                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=ACTION_CLASSIFICATION[action_description])

                if action_description == 'Referred':
                    try:
                        leg_details['Current Controlling Legislative Body']['label']
                        controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                    except TypeError:
                        controlling_bodies = leg_details['Current Controlling Legislative Body']
                    if controlling_bodies:
                        for controlling_body in controlling_bodies:
                            body_name = controlling_body['label']
                            if body_name.startswith("Joint Committee"):
                                act.add_related_entity(body_name, 'organization')
                            else:
                                act.add_related_entity(body_name,
                                                       'organization',
                                                       entity_id=_make_pseudo_id(name=body_name))

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    result, votes = self.extractVotes(action_detail_url)

                    if votes and result:
                        # see https://github.com/datamade/municipal-scrapers-us/issues/15
                        action_vote = VoteEvent(legislative_session=bill.legislative_session,
                                                motion_text=action_description,
                                                organization={'name': responsible_org},
                                                classification=None,
                                                start_date=action_date,
                                                result=result,
                                                bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

        bill.extras = {'local_classification': leg_summary['Type']}

        yield bill

    print(unreachable_urls)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """
    Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    chamber = 'lower' if chamber.lower() == 'house' else chamber
    chamber = 'upper' if chamber.lower() == 'senate' else chamber

    # Get html and parse
    doc = self.lxmlize(bill_detail_url)

    # Check if bill hasn't been transmitted to the other chamber yet
    transmit_check = self.get_node(
        doc,
        '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()')
    if (transmit_check is not None and
            'has not been transmitted' in transmit_check.strip()):
        self.logger.debug('Bill has not been transmitted to other chamber '
                          '... skipping {0}'.format(bill_detail_url))
        return

    # Get the basic parts of the bill
    bill_id = self.get_node(doc, '//h1/text()')
    self.logger.debug(bill_id)
    bill_title_text = self.get_node(
        doc,
        '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()')
    if bill_title_text is not None:
        bill_title = bill_title_text.strip()
    else:
        long_desc_url = self.get_node(
            doc, '//a[text()[contains(.,"Long Description")]]/@href')
        long_desc_page = self.lxmlize(long_desc_url)
        long_desc_text = self.get_node(
            long_desc_page, '//h1/following-sibling::p/text()')
        if long_desc_text is not None:
            bill_title = long_desc_text.strip()
        else:
            bill_title = 'No title found.'
            self.logger.warning('No title found for {}.'.format(bill_id))
    self.logger.debug(bill_title)

    bill_type = {'F': 'bill',
                 'R': 'resolution',
                 'C': 'concurrent resolution'}[bill_id[1]]
    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=bill_title,
                classification=bill_type)

    # Add source
    bill.add_source(bill_detail_url)

    for subject in self._subject_mapping[bill_id]:
        bill.add_subject(subject)

    # Get companion bill.
    companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]'
                          '/a[starts-with(@href, "?")]/text()')
    companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None
    companion_chamber = self.chamber_from_bill(companion)
    if companion is not None:
        bill.add_companion(companion, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    yield bill
def scrape_matter(self, matter_link, sess):
    matter_types = {
        "Additions": "other",
        "Administrative Order": "order",
        "Annual Evaluation": "other",
        "Bid Advertisement": "other",
        "Bid Awards": "other",
        "Bid Contract": "contract",
        "Bid Protest": "other",
        "Bid Rejection": "other",
        "Birthday Scroll": "commemoration",
        "Certificate of Appreciation": "commemoration",
        "Change Order": "order",
        "Citizen's Presentation": "other",
        "Commendation": "commemoration",
        "Conflict Waiver": "other",
        "Congratulatory Certificate": "commemoration",
        "Deferrals": "other",
        "Discussion Item": "other",
        "Distinguished Visitor": "other",
        "Joint Meeting/Workshop": "other",
        "Mayoral Veto": "other",
        "Miscellaneous": "other",
        "Nomination": "nomination",
        "Oath of Office": "other",
        "Omnibus Reserve": "bill",
        "Ordinance": "ordinance",
        "Plaque": "commemoration",
        "Presentation": "other",
        "Proclamation": "proclamation",
        "Professional Service Agreement": "contract",
        "Public Hearing": "other",
        "Report": "other",
        "Request for Proposals": "other",
        "Request for Qualifications": "other",
        "Request to Advertise": "other",
        "Resolution": "resolution",
        "Resolution of Sympathy": "resolution",
        "Service Awards": "commemoration",
        "Special Item": "other",
        "Special Presentation": "other",
        "Supplement": "other",
        "Swearing-In": "other",
        "Time Sensitive Items": "other",
        "Withdrawals": "other",
        "Workshop Item": "other",
        "Zoning": "other",
        "Zoning Resolution": "resolution",
    }
    matter_doc = self.lxmlize(matter_link)
    info_dict = self.matter_table_to_dict(matter_doc)

    # we're going to use the year of the intro date as the session
    # until/unless we come up with something better
    intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
    session = sess["identifier"]
    category = matter_types[info_dict["File Type"]]

    if 'File Name' in info_dict:
        title = info_dict["File Name"]
    elif "Title" in info_dict and info_dict["Title"].strip():
        title = info_dict["Title"].strip()
    else:
        self.warning("bill has no title")
        return

    if category == 'other':
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title)
    else:
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title,
                    classification=category)

    for spons in info_dict["Sponsors"]:
        if spons == "NONE":
            continue
        try:
            name, spons_type = spons.rsplit(",", 1)
        except ValueError:
            name = spons
            spons_type = "Sponsor"
        primary = True if "Prime Sponsor" in spons_type else False
        entity = "person"
        if "committee" in name:
            # committee sponsors are organizations, not people
            entity = "organization"
        bill.add_sponsorship(name, spons_type, entity, primary)

    if "Indexes" in info_dict:
        for subj in info_dict["Indexes"]:
            if subj.strip() and subj.strip() != "NONE":
                bill.add_subject(subj.strip())

    if "Title" in info_dict and info_dict["Title"].strip():
        note = "bill's long title"
        if "Note" in info_dict and info_dict["Note"].strip():
            note = info_dict["Note"]
        bill.add_abstract(abstract=info_dict["Title"], note=note)

    self.process_action_table(matter_doc, bill)
    bill.add_source(matter_link, note='web')

    yield bill
def scrape_bill(self, chamber, session, bill_id, bill_type, url):
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    title = doc.xpath('//b[text()="TITLE:"]')
    if title:
        title = title[0].tail.strip().strip('"')
    else:
        self.warning("skipping bill %s, no information" % url)
        return

    bill = Bill(
        bill_id,
        title=title,
        chamber=chamber,
        classification=bill_type,
        legislative_session=session,
    )
    bill.add_source(url)

    # Get sponsors
    spons_str = doc.xpath('//b[contains(text(), "SPONSOR")]')[0].tail.strip()
    sponsors_match = re.match(
        r'(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str)
    if sponsors_match:
        sponsors = sponsors_match.group(2).split(',')
        sponsor = sponsors[0].strip()
        if sponsor:
            bill.add_sponsorship(
                sponsors[0],
                entity_type='person',
                classification='primary',
                primary=True,
            )
        for sponsor in sponsors[1:]:
            sponsor = sponsor.strip()
            if sponsor:
                bill.add_sponsorship(
                    sponsor,
                    entity_type='person',
                    classification='cosponsor',
                    primary=False,
                )
    else:
        # Committee sponsorship
        spons_str = spons_str.strip()

        if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
            spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$', '',
                               spons_str).title()
            spons_str = (spons_str + " Committee (by request of the governor)")

        if spons_str:
            bill.add_sponsorship(
                spons_str,
                entity_type='person',
                classification='primary',
                primary=True,
            )

    # Get actions from second myth table
    self._current_comm = None
    act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
    for row in act_rows:
        date, journal, raw_chamber, action = row.xpath('td')

        act_date = datetime.datetime.strptime(date.text_content().strip(),
                                              '%m/%d/%y')
        raw_chamber = raw_chamber.text_content().strip()
        action = action.text_content().strip()

        if raw_chamber == "(H)":
            act_chamber = "lower"
        elif raw_chamber == "(S)":
            act_chamber = "upper"

        if re.match(r"\w+ Y(\d+)", action):
            vote_href = journal.xpath('.//a/@href')
            if vote_href:
                yield from self.parse_vote(bill, action, act_chamber,
                                           act_date, vote_href[0])

        action, atype = self.clean_action(action)

        match = re.match(r'^Prefile released (\d+/\d+/\d+)$', action)
        if match:
            action = 'Prefile released'
            act_date = datetime.datetime.strptime(match.group(1), '%m/%d/%y')

        bill.add_action(action,
                        chamber=act_chamber,
                        date=act_date.strftime('%Y-%m-%d'),
                        classification=atype)

    # Get subjects
    for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
        bill.add_subject(subj.strip())

    # Get versions
    text_list_url = (
        "http://www.legis.state.ak.us/"
        "basis/get_fulltext.asp?session=%s&bill=%s"
    ) % (session, bill_id)
    bill.add_source(text_list_url)

    text_doc = lxml.html.fromstring(self.get(text_list_url).text)
    text_doc.make_links_absolute(text_list_url)
    for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
        name = link.xpath('../preceding-sibling::td/text()')[0].strip()
        text_url = link.get('href')
        bill.add_version_link(name, text_url, media_type="text/html")

    # Get documents
    doc_list_url = (
        "http://www.legis.state.ak.us/"
        "basis/get_documents.asp?session=%s&bill=%s"
    ) % (session, bill_id)
    doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
    doc_list.make_links_absolute(doc_list_url)
    bill.add_source(doc_list_url)
    for href in doc_list.xpath('//a[contains(@href, "get_documents")][@onclick]'):
        h_name = href.text_content()
        h_href = href.attrib['href']
        if h_name.strip():
            bill.add_document_link(h_name, h_href)

    yield bill
def scrape_bill(self, chamber, session, bill_id):
    # there will be a space in bill_id if we're doing a one-off bill scrape
    # convert HB 102 into H102
    if ' ' in bill_id:
        bill_id = bill_id[0] + bill_id.split(' ')[-1]

    # if chamber comes in as House/Senate convert to lower/upper
    if chamber == 'Senate':
        chamber = 'upper'
    elif chamber == 'House':
        chamber = 'lower'

    bill_detail_url = (
        'http://www.ncleg.net/gascripts/'
        'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all'
    ) % (session, bill_id)

    # parse the bill data page, finding the latest html text
    data = self.get(bill_detail_url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(bill_detail_url)

    title_div_txt = doc.xpath(
        '//td[@style="text-align: center; white-space: nowrap; '
        'width: 60%; font-weight: bold; font-size: x-large;"]/text()')[0]
    if 'Joint Resolution' in title_div_txt:
        bill_type = 'joint resolution'
        bill_id = bill_id[0] + 'JR ' + bill_id[1:]
    elif 'Resolution' in title_div_txt:
        bill_type = 'resolution'
        bill_id = bill_id[0] + 'R ' + bill_id[1:]
    elif 'Bill' in title_div_txt:
        bill_type = 'bill'
        bill_id = bill_id[0] + 'B ' + bill_id[1:]

    bill_title = doc.xpath('//div[@id="title"]')[0].text_content()

    bill = Bill(bill_id,
                legislative_session=session,
                title=bill_title,
                chamber=chamber,
                classification=bill_type)
    bill.add_source(bill_detail_url)

    # skip first PDF link (duplicate link to cur version)
    if chamber == 'lower':
        link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
    else:
        link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
    for vlink in doc.xpath(link_xpath)[1:]:
        # get the name from the PDF link...
        version_name = vlink.text.replace(u'\xa0', ' ')
        # but neighboring span with anchor inside has the HTML version
        version_url = vlink.xpath('./following-sibling::span/a/@href')
        version_url = 'http://www.ncleg.net' + version_url[0]
        bill.add_version_link(version_name, version_url,
                              media_type='text/html',
                              on_duplicate='ignore')

    # sponsors
    spon_td = doc.xpath('//th[text()="Sponsors:"]/following-sibling::td')[0]
    # first sponsors are primary, until we see (Primary)
    spon_type = 'primary'
    for leg in spon_td.text_content().split(';'):
        name = leg.replace(u'\xa0', ' ').strip()
        if name.startswith('(Primary)'):
            name = name.replace('(Primary)', '').strip()
            spon_type = 'cosponsor'
        if not name:
            continue
        bill.add_sponsorship(name,
                             classification=spon_type,
                             entity_type='person',
                             primary=(spon_type == 'primary'))

    # keywords
    kw_td = doc.xpath('//th[text()="Keywords:"]/following-sibling::td')[0]
    for subject in kw_td.text_content().split(', '):
        bill.add_subject(subject)

    # actions
    action_tr_xpath = '//td[starts-with(text(),"History")]/../../tr'
    # skip two header rows
    for row in doc.xpath(action_tr_xpath)[2:]:
        tds = row.xpath('td')
        act_date = tds[0].text
        actor = tds[1].text or ''
        # if text is blank, try diving in
        action = tds[2].text.strip() or tds[2].text_content().strip()

        act_date = dt.datetime.strptime(act_date, '%m/%d/%Y').strftime('%Y-%m-%d')

        if actor == 'Senate':
            actor = 'upper'
        elif actor == 'House':
            actor = 'lower'
        else:
            actor = 'executive'

        for pattern, atype in self._action_classifiers.items():
            if action.startswith(pattern):
                break
        else:
            atype = None

        bill.add_action(action, act_date, chamber=actor, classification=atype)

    yield from self.scrape_votes(bill, doc)

    yield bill
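
# --- Illustrative data (not North Carolina's actual table) ------------------
# The scraper above iterates `self._action_classifiers.items()` and matches
# with `action.startswith(pattern)`, so a plain prefix -> classification dict
# is sufficient. The entries below are assumed examples only.
_action_classifiers = {
    'Filed': 'filing',
    'Passed 1st Reading': 'reading-1',
    'Ref To Com': 'referral-committee',
    'Reptd Fav': 'committee-passage-favorable',
    'Signed by Gov.': 'executive-signature',
    'Veto Received': 'executive-veto',
}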
def _scrape_bills(self):
    """
    Does the following

    1) Scrapes bill data from unitedstates project and saves the data to
       path specified in UnitedStates module

    2) Iterates over bill data and converts each one to an OCD-compliant
       bill model.

    3) Yields the OCD-compliant bill model instance

    @return: generator for federal US bills in OCD-compliant format
    @rtype: generator
    """
    # run scraper first to pull in all the bill data
    self._run_unitedstates_bill_scraper()

    # iterate over all the files and build and yield Bill objects
    for filename in find_files(settings.SCRAPED_DATA_DIR,
                               r'.*/data/[0-9]+/bills/[^\/]+/[^\/]+/data.json'):
        try:
            with open(filename) as json_file:
                json_data = json.load(json_file)

                # Initialize Object
                bill = Bill(constants.TYPE_MAP[json_data['bill_type']]['canonical'] +
                            ' ' + json_data['number'],
                            json_data['congress'],
                            json_data['official_title'],
                            chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])

                # add source of data
                bill.add_source(json_data['url'], note='all')

                # add subjects
                for subject in json_data['subjects']:
                    bill.add_subject(subject)

                # add summary
                if 'summary' in json_data and json_data['summary'] is not None:
                    bill.add_abstract(json_data['summary']['text'],
                                      json_data['summary']['as'],
                                      json_data['summary']['date'])

                # add titles
                for item in json_data['titles']:
                    bill.add_title(item['title'], item['type'])

                # add other/related Bills
                for b in json_data['related_bills']:
                    if 'type' in b and b['type'] == 'bill':
                        split = b['bill_id'].split('-')
                        m = UnitedStatesBillScraper.BILL_SPLIT.match(split[0])
                        bill.add_related_bill(constants.TYPE_MAP[m.group(1)]['canonical'] +
                                              ' ' + m.group(2),
                                              legislative_session=split[1],
                                              relation_type='companion')

                # add sponsor
                bill.add_sponsorship_by_identifier(
                    json_data['sponsor']['name'], 'person', 'person', True,
                    scheme='thomas_id',
                    identifier=json_data['sponsor']['thomas_id'],
                    chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])

                # add cosponsors
                for cs in json_data['cosponsors']:
                    bill.add_sponsorship_by_identifier(
                        cs['name'], 'person', 'person', False,
                        scheme='thomas_id',
                        identifier=cs['thomas_id'],
                        chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])

                # add introduced_at and actions
                bill.add_action('date of introduction',
                                datetime_to_date(json_data['introduced_at']),
                                chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'],
                                related_entities=[])

                # add other actions
                for action in json_data['actions']:
                    bill.actions.append({
                        'date': datetime_to_date(action['acted_at']),
                        'type': [action['type']],
                        'description': action['text'],
                        'actor': constants.TYPE_MAP[json_data['bill_type']]['chamber'],
                        'related_entities': []
                    })

                # add bill versions
                for version_path in find_files(
                        os.path.join(settings.SCRAPED_DATA_DIR, 'data',
                                     bill.legislative_session, 'bills',
                                     json_data['bill_type'],
                                     json_data['bill_type'] + json_data['number'],
                                     'text-versions'),
                        r'/.*/*\.json'):
                    try:
                        with open(version_path) as version_file:
                            version_json_data = json.load(version_file)
                            for k, v in version_json_data['urls'].items():
                                bill.versions.append({
                                    'date': datetime_to_date(version_json_data['issued_on']),
                                    'type': version_json_data['version_code'],
                                    'name': constants.VERSION_MAP[version_json_data['version_code']],
                                    'links': [{'mimetype': k, 'url': v}]
                                })
                    except IOError:
                        print("Unable to open or parse file with path " + version_path)
                        continue

                # finally yield bill object
                yield bill

        except IOError:
            print("Unable to open file with path " + filename)
            print(traceback.format_exc())
            continue
        except KeyError:
            print("Unable to parse file with path " + filename)
            print(traceback.format_exc())
            continue
        except Exception:
            print('Unknown error with ' + filename)
            print(traceback.format_exc())
            continue
def scrape_bill(self, session, history_url):
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if (bill_title is None or
            "Bill does not exist" in history_xml):
        self.warning("Bill does not appear to exist")
        return
    bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    if bill_id[1] == 'B':
        bill_type = ['bill']
    elif bill_id[1] == 'R':
        bill_type = ['resolution']
    elif bill_id[1:3] == 'CR':
        bill_type = ['concurrent resolution']
    elif bill_id[1:3] == 'JR':
        bill_type = ['joint resolution']
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type
    )

    bill.add_source(history_url)

    for subject in root.iterfind('subjects/subject'):
        bill.add_subject(subject.text.strip())

    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version_link(
            note=self.NAME_SLUGS[version[1][-5]],
            url=version[1],
            media_type='text/html'
        )

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document_link(
            note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            media_type='text/html'
        )

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document_link(
            note="Fiscal Note ({})".format(self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            media_type='text/html'
        )

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type='text/html'
        )

    for action in root.findall('actions/action'):
        act_date = datetime.datetime.strptime(action.findtext('date'),
                                              "%m/%d/%Y").date()

        action_number = action.find('actionNumber').text
        actor = {'H': 'lower',
                 'S': 'upper',
                 'E': 'executive'}[action_number[0]]

        desc = action.findtext('description').strip()

        if desc == 'Scheduled for public hearing on . . .':
            self.warning("Skipping public hearing action with no date")
            continue

        introduced = False

        if desc == 'Amended':
            atype = 'amendment-passage'
        elif desc == 'Amendment(s) offered':
            atype = 'amendment-introduction'
        elif desc == 'Amendment amended':
            atype = 'amendment-amendment'
        elif desc == 'Amendment withdrawn':
            atype = 'amendment-withdrawal'
        elif desc == 'Passed' or desc == 'Adopted':
            atype = 'passage'
        elif re.match(r'^Received (by|from) the', desc):
            if 'Secretary of the Senate' not in desc:
                atype = 'introduction'
            else:
                atype = 'filing'
        elif desc.startswith('Sent to the Governor'):
            # But what if it gets lost in the mail?
            atype = 'executive-receipt'
        elif desc.startswith('Signed by the Governor'):
            atype = 'executive-signature'
        elif desc == 'Vetoed by the Governor':
            atype = 'executive-veto'
        elif desc == 'Read first time':
            atype = ['introduction', 'reading-1']
            introduced = True
        elif desc == 'Read & adopted':
            atype = ['passage']
            if not introduced:
                introduced = True
                atype.append('introduction')
        elif desc == "Passed as amended":
            atype = 'passage'
        elif (desc.startswith('Referred to') or
              desc.startswith("Recommended to be sent to ")):
            atype = 'referral-committee'
        elif desc == "Reported favorably w/o amendment(s)":
            atype = 'committee-passage'
        elif desc == "Filed":
            atype = 'filing'
        elif desc == 'Read 3rd time':
            atype = 'reading-3'
        elif desc == 'Read 2nd time':
            atype = 'reading-2'
        elif desc.startswith('Reported favorably'):
            atype = 'committee-passage-favorable'
        else:
            atype = None

        act = bill.add_action(
            action.findtext('description'),
            act_date,
            chamber=actor,
            classification=atype
        )

        if atype and 'referral-committee' in atype:
            repls = ['Referred to', "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type='organization')

    for author in root.findtext('authors').split(' | '):
        if author != "":
            bill.add_sponsorship(author, classification='primary',
                                 entity_type='person', primary=True)
    for coauthor in root.findtext('coauthors').split(' | '):
        if coauthor != "":
            bill.add_sponsorship(coauthor, classification='cosponsor',
                                 entity_type='person', primary=False)
    for sponsor in root.findtext('sponsors').split(' | '):
        if sponsor != "":
            bill.add_sponsorship(sponsor, classification='primary',
                                 entity_type='person', primary=True)
    for cosponsor in root.findtext('cosponsors').split(' | '):
        if cosponsor != "":
            bill.add_sponsorship(cosponsor, classification='cosponsor',
                                 entity_type='person', primary=False)

    if root.findtext('companions'):
        self._get_companion(bill)

    yield bill
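
# --- Illustrative data (assumed, not confirmed from the scraper) ------------
# The scraper above indexes `self.CHAMBERS[bill_id[0]]` and
# `self.NAME_SLUGS[version[1][-5]]` (the character just before ".htm" in a
# version URL). A plausible shape for those class attributes:
CHAMBERS = {'H': 'lower', 'S': 'upper'}
NAME_SLUGS = {
    'I': 'Introduced',                # assumed label
    'E': 'Engrossed',                 # assumed label
    'S': 'Senate Committee Report',   # assumed label
    'H': 'House Committee Report',    # assumed label
    'F': 'Enrolled',                  # assumed label
}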
def scrape_bills(self, chamber_to_scrape, session):
    url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

    bill_dir_page = self.get(url)
    root = lxml.etree.fromstring(bill_dir_page.content)
    for mr in root.xpath('//LASTACTION/MSRGROUP'):
        bill_id = mr.xpath('string(MEASURE)').replace(" ", "")
        if bill_id[0] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        bill_type = {'B': 'bill', 'C': 'concurrent resolution',
                     'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

        # just skip past bills that are of the wrong chamber
        if chamber != chamber_to_scrape:
            continue

        link = mr.xpath('string(ACTIONLINK)').replace("..", "")
        main_doc = mr.xpath('string(MEASURELINK)').replace("../../../", "")
        main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
        bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf%s' % (session, link)
        try:
            details_page = self.get(bill_details_url)
        except scrapelib.HTTPError:
            self.warning('Bill page not loading for {}; skipping'.format(bill_id))
            continue

        page = details_page.content
        # Some pages have the (invalid) byte 11 sitting around. Just drop
        # them out. Might as well.

        details_root = lxml.etree.fromstring(page)
        title = details_root.xpath('string(//SHORTTITLE)')
        longtitle = details_root.xpath('string(//LONGTITLE)')

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.extras['summary'] = longtitle
        bill.add_source(main_doc_url)

        # sponsors
        main_sponsor = details_root.xpath('string(//P_NAME)').split()
        if main_sponsor:
            main_sponsor = main_sponsor[0]
            main_sponsor_link = details_root.xpath('string(//P_LINK)').replace(" ", "_")
            main_sponsor_url = ('http://billstatus.ls.state.ms.us/%s/'
                                'pdf/%s') % (session, main_sponsor_link.strip('../'))
            type = "primary"
            bill.add_source(main_sponsor_url)
            bill.add_sponsorship(main_sponsor,
                                 classification=type,
                                 entity_type='person',
                                 primary=True)

        for author in details_root.xpath('//AUTHORS/ADDITIONAL'):
            leg = author.xpath('string(CO_NAME)').replace(" ", "_")
            if leg:
                leg_url = ('http://billstatus.ls.state.ms.us/%s/'
                           'pdf/House_authors/%s.xml') % (session, leg)
                type = "cosponsor"
                bill.add_source(leg_url)
                bill.add_sponsorship(leg,
                                     classification=type,
                                     entity_type='person',
                                     primary=False)

        # Versions
        curr_version = details_root.xpath('string(//CURRENT_OTHER)').replace("../../../../", "")
        if curr_version != "":
            curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
            bill.add_version_link("Current version",
                                  curr_version_url,
                                  on_duplicate="ignore",
                                  media_type="text/html")

        intro_version = details_root.xpath('string(//INTRO_OTHER)').replace("../../../../", "")
        if intro_version != "":
            intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
            bill.add_version_link("As Introduced",
                                  intro_version_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')

        comm_version = details_root.xpath('string(//CMTESUB_OTHER)').replace("../../../../", "")
        if comm_version.find("documents") != -1:
            comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
            bill.add_version_link("Committee Substitute",
                                  comm_version_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')

        passed_version = details_root.xpath('string(//PASSED_OTHER)').replace("../../../../", "")
        if passed_version.find("documents") != -1:
            passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
            title = "As Passed the " + chamber
            bill.add_version_link(title,
                                  passed_version_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')

        asg_version = details_root.xpath('string(//ASG_OTHER)').replace("../../../../", "")
        if asg_version.find("documents") != -1:
            asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
            bill.add_version_link("Approved by the Governor",
                                  asg_version_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')

        # amendments
        # ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
        for amd in details_root.xpath('//AMENDMENTS/*'):
            if amd.tag == 'HAM':
                name = amd.xpath('HAM_DESC[1]/text()')[0]
                name = append_parens(amd, 'HAM_DISP', name)
                name = append_parens(amd, 'HAM_VDESC', name)

                pdf_url = amd.xpath('string(HAM_PDF)').replace("../", "")
                html_url = amd.xpath('string(HAM_OTHER)').replace("../", "")
            elif amd.tag == 'SAM':
                name = amd.xpath('SAM_DESC[1]/text()')[0]
                name = append_parens(amd, 'SAM_DISP', name)
                name = append_parens(amd, 'SAM_VDESC', name)

                pdf_url = amd.xpath('string(SAM_PDF)').replace("../", "")
                html_url = amd.xpath('string(SAM_OTHER)').replace("../", "")
            elif amd.tag == 'AMRPT':
                name = amd.xpath('AMRPT_DESC[1]/text()')[0]

                pdf_url = amd.xpath('string(AMRPT_PDF)').replace("../", "")
                html_url = amd.xpath('string(AMRPT_OTHER)').replace("../", "")
            else:
                # unknown amendment tag; skip so the URLs below are never unbound
                continue

            pdf_url = 'http://billstatus.ls.state.ms.us/' + pdf_url
            html_url = 'http://billstatus.ls.state.ms.us/' + html_url

            if 'adopted' in name.lower() or 'amendment report' in name.lower():
                bill.add_version_link(name,
                                      pdf_url,
                                      on_duplicate='ignore',
                                      media_type='application/pdf')
                bill.add_version_link(name,
                                      html_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

        # avoid duplicate votes
        seen_votes = set()

        # Actions
        for action in details_root.xpath('//HISTORY/ACTION'):
            # action_num = action.xpath('string(ACT_NUMBER)').strip()
            # action_num = int(action_num)
            act_vote = action.xpath('string(ACT_VOTE)').replace("../../../..", "")
            action_desc = action.xpath('string(ACT_DESC)')
            date, action_desc = action_desc.split(" ", 1)
            date = date + "/" + session[0:4]
            date = datetime.strptime(date, "%m/%d/%Y")

            if action_desc.startswith("(H)"):
                actor = "lower"
                action = action_desc[4:]
            elif action_desc.startswith("(S)"):
                actor = "upper"
                action = action_desc[4:]
            else:
                actor = "executive"
                action = action_desc

            if "Veto" in action and actor == 'executive':
                version_path = details_root.xpath("string(//VETO_OTHER)")
                version_path = version_path.replace("../../../../", "")
                version_url = "http://billstatus.ls.state.ms.us/" + version_path
                bill.add_document_link("Veto", version_url)

            atype = 'other'
            for prefix, prefix_type in self._action_types:
                if action.startswith(prefix):
                    atype = prefix_type
                    break

            bill.add_action(action, self._tz.localize(date),
                            chamber=actor,
                            classification=atype if atype != 'other' else None)

            # use committee names as scraped subjects
            subjects = details_root.xpath('//H_NAME/text()')
            subjects += details_root.xpath('//S_NAME/text()')

            for subject in subjects:
                if subject not in bill.subject:
                    bill.add_subject(subject)

            if act_vote:
                vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                if vote_url not in seen_votes:
                    seen_votes.add(vote_url)
                    yield from self.scrape_votes(vote_url, action, date, actor, bill)

        bill.add_source(bill_details_url)
        yield bill
def get_bill(self, matter):
    '''Make Bill object from given matter.

    Currently, NYC Legistar does not have conventional "Types" for three
    newly added committees:
    https://legistar.council.nyc.gov/Departments.aspx

    We communicated the issue to NYC, and until we learn more, we will
    skip the bills attached to those committees.
    '''
    orgs_without_type = [
        'Charter Revision Commission 2019',
        'New York City Advisory Commission on Property Tax Reform',
        'Democratic Conference of the Council of the City of New York']
    if matter['MatterBodyName'].strip() in orgs_without_type:
        return None

    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']

    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]

    bill_session = self.sessions(self.toTime(date))

    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])

    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None
        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint('/matters/{0}',
                                         relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            return None
        else:
            date = related_bill['MatterIntroDate']
            related_bill_session = self.session(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(u'\u0000', '')
        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

    return bill
def scrape(self, window=28):
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    for matter in self.matters(n_days_ago):
        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(body_name,
                                       'organization',
                                       entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                       motion_text=action['description'],
                                       organization=action['organization'],
                                       classification=None,
                                       start_date=action['date'],
                                       result=result,
                                       bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    raw_option = vote['VoteValueName'].lower()
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option, vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill. Then, we can
                # find the 'MatterFile' (i.e., identifier) and the
                # 'MatterIntroDate' (i.e., to determine its legislative
                # session). Sometimes, the related bill does not yet exist:
                # in this case, throw an error, and continue.
                related_bill = self.endpoint('/matters/{0}',
                                             relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')
                # Currently, the relation type for bills can be one of a few
                # possibilities: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting
                # that they receive a relation of 'companion'.

        bill.add_version_link('Board Report',
                              'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
                              media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(attachment['MatterAttachmentName'],
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        yield bill
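
# --- Illustrative data (assumed, not confirmed from the scraper) ------------
# `self.VOTE_OPTIONS` maps raw Legistar vote strings (lower-cased above) to
# OCD vote options, with unknown strings passed through via `.get()`. A
# plausible mapping:
VOTE_OPTIONS = {
    'aye': 'yes',
    'nay': 'no',
    'abstain': 'abstain',
    'absent': 'absent',
    'recused': 'excused',  # assumed value
}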
def scrape_bill(self, session, bill_url):
    page = self.get(bill_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(bill_url)

    try:
        bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
    except IndexError:
        self.logger.warning("Something is wrong with bill page, skipping.")
        return
    secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

    # checking if there is a matching bill
    if secondary_bill_id:
        secondary_bill_id = secondary_bill_id[0].text
        # swap ids if * is in secondary_bill_id
        if '*' in secondary_bill_id:
            bill_id, secondary_bill_id = secondary_bill_id, bill_id
        secondary_bill_id = secondary_bill_id.strip()
        # collapse double spaces in the id
        secondary_bill_id = secondary_bill_id.replace('  ', ' ')

    bill_id = bill_id.replace('*', '').replace('  ', ' ').strip()

    if 'B' in bill_id:
        bill_type = 'bill'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'

    primary_chamber = 'lower' if 'H' in bill_id else 'upper'
    # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

    title = page.xpath("//span[@id='lblAbstract']")[0].text
    if title is None:
        msg = '%s detail page was missing title info.'
        self.logger.warning(msg % bill_id)
        return

    # bill subject
    subject_pos = title.find('-')
    subjects = [s.strip() for s in title[:subject_pos - 1].split(',')]
    subjects = filter(None, subjects)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=primary_chamber,
        title=title,
        classification=bill_type,
    )
    for subject in subjects:
        bill.add_subject(subject)

    if secondary_bill_id:
        bill.add_identifier(secondary_bill_id)

    bill.add_source(bill_url)

    # Primary Sponsor
    sponsor = page.xpath("//span[@id='lblBillPrimeSponsor']")[0].text_content().split("by")[-1]
    sponsor = sponsor.replace('*', '').strip()
    if sponsor:
        bill.add_sponsorship(
            sponsor,
            classification='primary',
            entity_type='person',
            primary=True,
        )

    # bill text
    btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
    bill.add_version_link('Current Version', btext.get('href'),
                          media_type='application/pdf')

    # documents
    summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
    if summary:
        bill.add_document_link('Summary', summary[0].get('href'))
    fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
    if fiscal:
        bill.add_document_link('Fiscal Note', fiscal[0].get('href'))
    amendments = page.xpath('//a[contains(@href, "/Amend/")]')
    for amendment in amendments:
        bill.add_document_link('Amendment ' + amendment.text,
                               amendment.get('href'))
    # amendment notes in image with alt text describing doc inside <a>
    amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
    for afn in amend_fns:
        bill.add_document_link(
            afn.get('alt'),
            afn.getparent().get('href'),
            on_duplicate='ignore'
        )

    # actions
    atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
    actions_from_table(bill, atable)

    # if there is a matching bill
    if secondary_bill_id:
        # secondary sponsor
        secondary_sponsor = page.xpath(
            "//span[@id='lblCompPrimeSponsor']")[0].text_content().split("by")[-1]
        secondary_sponsor = secondary_sponsor.replace('*', '').replace(')', '').strip()
        # Skip blank-name sponsors.
        if secondary_sponsor:
            bill.add_sponsorship(
                secondary_sponsor,
                classification='primary',
                entity_type='person',
                primary=True,
            )

        # secondary actions
        cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
        actions_from_table(bill, cotable)

    # votes
    yield from self.scrape_vote_events(bill, page, bill_url)

    bill.actions.sort(key=lambda a: a['date'])
    yield bill
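
# --- Illustrative helper (not from the scraper above) ----------------------
# `actions_from_table` is a module-level helper that is not shown. A minimal
# sketch, assuming the history grid has action text in the first column and a
# mm/dd/yyyy date in the second; column order and date format are assumptions.
def actions_from_table(bill, table):
    for row in table.xpath('.//tr'):
        cells = row.xpath('./td')
        if len(cells) < 2:
            continue
        action_text = cells[0].text_content().strip()
        date_text = cells[1].text_content().strip()
        try:
            date = datetime.datetime.strptime(date_text, '%m/%d/%Y')
        except ValueError:
            # header or malformed row; skip it
            continue
        bill.add_action(action_text, date.strftime('%Y-%m-%d'))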
def scrape_details(self, bill_detail_url, session, chamber, bill_id):
    """
    Create the Bill and add the information obtained from the provided
    bill_detail_url, then yield the bill object.

    :param bill_detail_url:
    :param session:
    :param chamber:
    :param bill_id:
    :return:
    """
    page = self.get(bill_detail_url).text

    if 'INVALID BILL NUMBER' in page:
        self.warning('INVALID BILL %s' % bill_detail_url)
        return

    doc = lxml.html.fromstring(page)
    doc.make_links_absolute(bill_detail_url)

    bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

    bill_type = bill_div.xpath('span/text()')[0]

    if 'General Bill' in bill_type:
        bill_type = 'bill'
    elif 'Concurrent Resolution' in bill_type:
        bill_type = 'concurrent resolution'
    elif 'Joint Resolution' in bill_type:
        bill_type = 'joint resolution'
    elif 'Resolution' in bill_type:
        bill_type = 'resolution'
    else:
        raise ValueError('unknown bill type: %s' % bill_type)

    # this is fragile, but less fragile than it was
    b = bill_div.xpath('./b[text()="Summary:"]')[0]
    bill_summary = b.getnext().tail.strip()

    bill = Bill(
        bill_id,
        legislative_session=session,  # session name metadata's `legislative_sessions`
        chamber=chamber,  # 'upper' or 'lower'
        title=bill_summary,
        classification=bill_type
    )

    subjects = list(self._subjects[bill_id])
    for subject in subjects:
        bill.add_subject(subject)

    # sponsors
    for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
        bill.add_sponsorship(
            name=sponsor,
            classification='primary',
            primary=True,
            entity_type='person'
        )
    for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
        sponsor = sponsor.replace(u'\xa0', ' ').strip()
        bill.add_sponsorship(
            name=sponsor,
            classification='primary',
            primary=True,
            entity_type='organization'
        )

    # find versions
    version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
    version_html = self.get(version_url).text
    version_doc = lxml.html.fromstring(version_html)
    version_doc.make_links_absolute(version_url)
    for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
        # duplicate versions with same date, use first appearance
        bill.add_version_link(
            note=version.text,  # Description of the version from the state;
                                # eg, 'As introduced', 'Amended', etc.
            url=version.get('href'),
            on_duplicate='ignore',
            media_type='text/html'  # Still a MIME type
        )

    # actions
    for row in bill_div.xpath('table/tr'):
        date_td, chamber_td, action_td = row.xpath('td')

        date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
        action_chamber = {'Senate': 'upper',
                          'House': 'lower',
                          None: 'legislature'}[chamber_td.text]

        action = action_td.text_content()
        action = action.split('(House Journal')[0]
        action = action.split('(Senate Journal')[0].strip()

        atype = action_type(action)

        bill.add_action(
            description=action,  # Action description, from the state
            date=date.strftime('%Y-%m-%d'),  # `YYYY-MM-DD` format
            chamber=action_chamber,  # 'upper' or 'lower'
            classification=atype  # Options explained in the next section
        )

    # votes
    vurl = doc.xpath('//a[text()="View Vote History"]/@href')
    if vurl:
        vurl = vurl[0]
        yield from self.scrape_vote_history(bill, vurl)

    bill.add_source(bill_detail_url)
    yield bill
def scrape_bill(self, chamber, session, bill_id, bill_type, url): doc = lxml.html.fromstring(self.get(url).text) doc.make_links_absolute(url) title = doc.xpath('//b[text()="TITLE:"]') if title: title = title[0].tail.strip().strip('"') else: self.warning("skipping bill %s, no information" % url) return bill = Bill( bill_id, title=title, chamber=chamber, classification=bill_type, legislative_session=session, ) bill.add_source(url) # Get sponsors spons_str = doc.xpath('//b[contains(text(), "SPONSOR")]')[0].tail.strip() sponsors_match = re.match( '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str) if sponsors_match: sponsors = sponsors_match.group(2).split(',') sponsor = sponsors[0].strip() if sponsor: bill.add_sponsorship( sponsors[0], entity_type='person', classification='primary', primary=True, ) for sponsor in sponsors[1:]: sponsor = sponsor.strip() if sponsor: bill.add_sponsorship( sponsor, entity_type='person', classification='cosponsor', primary=False, ) else: # Committee sponsorship spons_str = spons_str.strip() if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str): spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$', '', spons_str).title() spons_str = (spons_str + " Committee (by request of the governor)") if spons_str: bill.add_sponsorship( spons_str, entity_type='person', classification='primary', primary=True, ) # Get actions from second myth table self._current_comm = None act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:] for row in act_rows: date, journal, raw_chamber, action = row.xpath('td') act_date = datetime.datetime.strptime(date.text_content().strip(), '%m/%d/%y') raw_chamber = raw_chamber.text_content().strip() action = action.text_content().strip() if raw_chamber == "(H)": act_chamber = "lower" elif raw_chamber == "(S)": act_chamber = "upper" if re.match("\w+ Y(\d+)", action): vote_href = journal.xpath('.//a/@href') if vote_href: yield from self.parse_vote(bill, action, act_chamber, act_date, vote_href[0]) action, atype = self.clean_action(action) match = re.match('^Prefile released (\d+/\d+/\d+)$', action) if match: action = 'Prefile released' act_date = datetime.datetime.strptime(match.group(1), '%m/%d/%y') bill.add_action( action, chamber=act_chamber, date=act_date.strftime('%Y-%m-%d'), classification=atype) # Get subjects for subj in doc.xpath('//a[contains(@href, "subject")]/text()'): bill.add_subject(subj.strip()) # Get versions text_list_url = ( "http://www.legis.state.ak.us/" "basis/get_fulltext.asp?session=%s&bill=%s" ) % (session, bill_id) bill.add_source(text_list_url) text_doc = lxml.html.fromstring(self.get(text_list_url).text) text_doc.make_links_absolute(text_list_url) for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'): name = link.xpath('../preceding-sibling::td/text()')[0].strip() text_url = link.get('href') bill.add_version_link(name, text_url, media_type="text/html") # Get documents doc_list_url = ( "http://www.legis.state.ak.us/" "basis/get_documents.asp?session=%s&bill=%s" ) % (session, bill_id) doc_list = lxml.html.fromstring(self.get(doc_list_url).text) doc_list.make_links_absolute(doc_list_url) bill.add_source(doc_list_url) for href in doc_list.xpath('//a[contains(@href, "get_documents")][@onclick]'): h_name = href.text_content() h_href = href.attrib['href'] if h_name.strip(): bill.add_document_link(h_name, h_href) yield bill
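The clean_action() call above returns a cleaned action string plus a classification, but the helper's body is not shown. A hedged sketch of its likely shape (the matched substrings are assumptions, not Alaska's actual rules):

def clean_action(self, action):
    # Normalize whitespace and guess a classification from the text.
    action = action.strip()
    atype = None
    if action.startswith('READ THE FIRST TIME'):
        atype = 'reading-1'
    elif 'TRANSMITTED TO GOVERNOR' in action:
        atype = 'executive-receipt'
    elif 'SIGNED INTO LAW' in action:
        atype = 'executive-signature'
    return action, atype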
def scrape_bill(self, chamber, session, bill_id, title, url): page = self.lxmlize(url) if re.match(r'^(S|H)B ', bill_id): btype = ['bill'] elif re.match(r'(S|H)C ', bill_id): btype = ['commemoration'] elif re.match(r'(S|H)JR ', bill_id): btype = ['joint resolution'] elif re.match(r'(S|H)CR ', bill_id): btype = ['concurrent resolution'] else: btype = ['bill'] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=btype) bill.add_source(url) regex_ns = "http://exslt.org/regular-expressions" version_links = page.xpath( r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]", namespaces={'re': regex_ns}) for link in version_links: bill.add_version_link(link.xpath('string()').strip(), link.attrib['href'], media_type='text/html', on_duplicate='ignore') sponsor_links = page.xpath( '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' + '/label[contains(text(), "Sponsors:")]' + '/following-sibling::div[1]/p/a') for link in sponsor_links: if link.attrib['href'].startswith( 'https://sdlegislature.gov/Legislators/'): sponsor_type = 'person' elif link.attrib['href'].startswith( 'https://sdlegislature.gov/Legislative_Session/Committees' ): sponsor_type = 'organization' else: raise ScrapeError('Found unexpected sponsor, URL: ' + link.attrib['href']) bill.add_sponsorship(link.text, classification='primary', primary=True, entity_type=sponsor_type) actor = chamber use_row = False for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"): # Some tables have null rows, that are just `<tr></tr>` # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018 if row.text_content() == '': self.debug( 'Skipping action table row that is completely empty') continue if 'Date' in row.text_content() and 'Action' in row.text_content(): use_row = True continue elif not use_row: continue action = row.xpath("string(td[2])").strip() atypes = [] if action.startswith('First read'): atypes.append('introduction') atypes.append('reading-1') if re.match(r'Signed by (?:the\s)*Governor', action, re.IGNORECASE): atypes.append('executive-signature') actor = 'executive' match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)', action) if match: if match.group(1) in ['Senate', 'House of Representatives']: first = '' else: first = 'committee-' if match.group(3).lower() == 'passed': second = 'passage' elif match.group(3).lower() == 'failed': second = 'failure' atypes.append("%s%s" % (first, second)) if 'referred to' in action.lower(): atypes.append('referral-committee') if 'Motion to amend, Passed Amendment' in action: atypes.append('amendment-introduction') atypes.append('amendment-passage') if 'Veto override, Passed' in action: atypes.append('veto-override-passage') elif 'Veto override, Failed' in action: atypes.append('veto-override-failure') if 'Delivered to the Governor' in action: atypes.append('executive-receipt') match = re.match("First read in (Senate|House)", action) if match: if match.group(1) == 'Senate': actor = 'upper' else: actor = 'lower' date = row.xpath("string(td[1])").strip() match = re.match(r'\d{2}/\d{2}/\d{4}', date) if not match: self.warning("Bad date: %s" % date) continue date = datetime.datetime.strptime(date, "%m/%d/%Y").date() for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"): yield from self.scrape_vote(bill, date, link.attrib['href']) bill.add_action(action, date, chamber=actor, classification=atypes) for link in page.xpath("//a[contains(@href, 'Keyword')]"): bill.add_subject(link.text.strip()) yield bill
def scrape_bill(self, chamber, session, session_id, bill_id, url): sidebar = lxml.html.fromstring(self.get(url).text) sidebar.make_links_absolute("https://www.legis.iowa.gov") try: hist_url = sidebar.xpath( '//a[contains(., "Bill History")]')[0].attrib['href'] except IndexError: # where is it? return page = lxml.html.fromstring(self.get(hist_url).text) page.make_links_absolute("https://www.legis.iowa.gov") title = page.xpath('string(//div[@id="content"]/div[@class=' '"divideVert"]/div[not(@class)])').strip() if title == '': self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url) return if title.lower().startswith("in"): title = page.xpath("string(//table[2]/tr[3])").strip() if 'HR' in bill_id or 'SR' in bill_id: bill_type = ['resolution'] elif 'HJR' in bill_id or 'SJR' in bill_id: bill_type = ['joint resolution'] elif 'HCR' in bill_id or 'SCR' in bill_id: bill_type = ['concurrent resolution'] else: bill_type = ['bill'] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.add_source(hist_url) # base url for text version (version_abbrev, session_id, bill_id) version_html_url_template = 'https://www.legis.iowa.gov/docs/'\ 'publications/LG{}/{}/attachments/{}.html' version_pdf_url_template = 'https://www.legis.iowa.gov/docs/'\ 'publications/LG{}/{}/{}.pdf' # get pieces of version_link vpieces = sidebar.xpath('//select[@id="billVersions"]/option') if vpieces: for version in vpieces: version_name = version.text version_abbrev = version.xpath('string(@value)') # Get HTML document of bill version. version_html_url = version_html_url_template.format( version_abbrev.upper(), session_id, bill_id.replace(' ', '')) bill.add_version_link(note=version_name, url=version_html_url, media_type='text/html') # Get PDF document of bill version. version_pdf_url = version_pdf_url_template.format( version_abbrev.upper(), session_id, bill_id.replace(' ', '')) bill.add_version_link(note=version_name, url=version_pdf_url, media_type='application/pdf') sponsors_str = page.xpath( "string(//div[@id='content']/div[@class='divideVert']/div[@class='divideVert'])" ).strip() if re.search('^By ', sponsors_str): sponsors = re.split(',| and ', sponsors_str.split('By ')[1]) # for some bills sponsors listed in different format else: sponsors = re.findall('[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)', sponsors_str) for sponsor in sponsors: sponsor = sponsor.replace(' and', '').strip(' .,') # a few sponsors get mangled by our regex sponsor = { 'Means': 'Ways & Means', 'Iowa': 'Economic Growth/Rebuild Iowa', 'Safety': 'Public Safety', 'Resources': 'Human Resources', 'Affairs': 'Veterans Affairs', 'Protection': 'Environmental Protection', 'Government': 'State Government', 'Boef': 'De Boef' }.get(sponsor, sponsor) if sponsor[0].islower(): # SSBs catch cruft in it ('charges', 'overpayments') # https://sunlight.atlassian.net/browse/DATA-286 continue bill.add_sponsorship(name=sponsor, classification='primary', entity_type='person', primary=True) for tr in page.xpath( "//table[contains(@class, 'billActionTable')]/tbody/tr"): date = tr.xpath("string(td[contains(text(), ', 20')])").strip() if date.startswith("***"): continue elif "No history is recorded at this time." in date: return if date == "": continue date = datetime.datetime.strptime(date, "%B %d, %Y").date() action = tr.xpath("string(td[2])").strip() action = re.sub(r'\s+', ' ', action) # Capture any amendment links. 
        links = [version['links'] for version in bill.versions]
        version_urls = [link['url']
                        for link in [i for sub in links for i in sub]]
        if 'amendment' in action.lower():
            for anchor in tr.xpath('td[2]/a'):
                if '-' in anchor.text:
                    # These links aren't given hrefs for some reason
                    # (needs to be fixed upstream)
                    try:
                        url = anchor.attrib['href']
                    except KeyError:
                        continue
                    if url not in version_urls:
                        bill.add_version_link(note=anchor.text,
                                              url=url,
                                              media_type='text/html')
                        version_urls.append(url)
        if 'S.J.' in action or 'SCS' in action:
            actor = 'upper'
        elif 'H.J.' in action or 'HCS' in action:
            actor = 'lower'
        else:
            actor = "legislature"
        action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip()
        if action.startswith('Introduced'):
            atype = ['introduction']
            if ', referred to' in action:
                atype.append('referral-committee')
        elif action.startswith('Read first time'):
            atype = 'reading-1'
        elif action.startswith('Referred to'):
            atype = 'referral-committee'
        elif action.startswith('Sent to Governor'):
            atype = 'executive-receipt'
        elif action.startswith('Reported Signed by Governor'):
            atype = 'executive-signature'
        elif action.startswith('Signed by Governor'):
            atype = 'executive-signature'
        elif action.startswith('Vetoed by Governor'):
            atype = 'executive-veto'
        elif action.startswith('Item veto'):
            atype = 'executive-veto-line-item'
        elif re.match(r'Passed (House|Senate)', action):
            atype = 'passage'
        elif re.match(r'Amendment (S|H)-\d+ filed', action):
            atype = ['amendment-introduction']
            if ', adopted' in action:
                atype.append('amendment-passage')
        elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted', action):
            atype = 'amendment-passage'
        elif re.match(r'Amendment (S|H)-\d+ lost', action):
            atype = 'amendment-failure'
        elif action.startswith('Resolution filed'):
            atype = 'introduction'
        elif action.startswith('Resolution adopted'):
            atype = 'passage'
        elif (action.startswith('Committee report') and
              action.endswith('passage.')):
            atype = 'committee-passage'
        elif action.startswith('Withdrawn'):
            atype = 'withdrawal'
        else:
            atype = None
        if action.strip() == "":
            continue
        if re.search(r'END OF \d+ ACTIONS', action):
            continue
        if '$history' not in action:
            bill.add_action(description=action,
                            date=date,
                            chamber=actor,
                            classification=atype)
    for subject in self._subjects[bill_id]:
        bill.add_subject(subject['Name'])
    yield bill
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.
    """
    url = BILL_URL % (session, bill_id.replace(' ', ''))
    bill_page = self.get(url, verify=False).text
    html = lxml.html.fromstring(bill_page)
    html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
    bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
    title = bill_tables[1].text_content().strip()
    bill_type = get_bill_type(bill_id)
    bill = Bill(legislative_session=session,
                chamber=chamber,
                identifier=bill_id,
                title=title,
                classification=bill_type)
    bill.add_source(url)
    for subject in self._subjects[bill_id.replace(' ', '')]:
        bill.add_subject(subject)
    if short_title and title.lower() != short_title.lower():
        bill.add_title(short_title, 'short title')
    # documents
    doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
    for link in doc_links:
        name = link.text_content().strip()
        href = link.get('href')
        if 'Engrossment' in name or 'Bill Text' in name:
            bill.add_version_link(note=name, url=href,
                                  media_type="application/pdf")
        else:
            bill.add_document_link(note=name, url=href,
                                   media_type="application/pdf")

    def _split(string):
        return re.split(r"\w+[,|AND]\s+", string)

    # sponsors range from a committee to one legislator to a group of legs
    sponsor_lists = bill_tables[0].text_content().split('by')
    if len(sponsor_lists) > 1:
        for sponsors in sponsor_lists[1:]:
            if 'COMMITTEE' in sponsors.upper():
                bill.add_sponsorship(name=sponsors.strip(),
                                     entity_type="organization",
                                     primary=True,
                                     classification='primary')
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsorship(classification='primary',
                                             name=person,
                                             entity_type="person",
                                             primary=True)
    actor = chamber
    last_date = None
    for row in bill_tables[2]:
        # lots of empty rows
        if len(row) == 1:
            continue
        _, date, action, _ = [x.text_content().strip() for x in row]
        if date:
            last_date = date
        else:
            date = last_date
        date = datetime.datetime.strptime(date + '/' + session[0:4],
                                          "%m/%d/%Y").strftime('%Y-%m-%d')
        if action.startswith('House'):
            actor = 'lower'
        elif action.startswith('Senate'):
            actor = 'upper'
        # votes
        if 'AYES' in action or 'NAYS' in action:
            yield from self.parse_vote(actor, date, row[2],
                                       session, bill_id, chamber, url)
            # bill.add_vote_event(vote)
        # some td's text is separated by br elements
        if len(row[2]):
            action = "".join(row[2].itertext())
        action = action.replace(u'\xa0', ' ').strip()
        atype = get_action(actor, action)
        bill.add_action(action, date, chamber=actor, classification=atype)
        # after voice vote/roll call and some actions the bill is sent
        # 'to House' or 'to Senate'
        if 'to House' in action:
            actor = 'lower'
        elif 'to Senate' in action:
            actor = 'upper'
    yield bill
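get_bill_type() is referenced above but defined elsewhere. A minimal sketch, assuming Idaho identifiers like 'H 123' or 'SJR 101' where the letters after the chamber letter determine the classification:

_SUFFIX_TO_TYPE = {
    'CR': 'concurrent resolution',
    'JR': 'joint resolution',
    'R': 'resolution',
}

def get_bill_type(bill_id):
    # 'H 123' -> 'bill'; 'SJR 101' -> 'joint resolution'
    prefix = bill_id.split()[0]
    return _SUFFIX_TO_TYPE.get(prefix[1:], 'bill')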
def scrape_bill(self, chamber, session, bill_id, title, url): page = self.lxmlize(url) if re.match(r'^(S|H)B ', bill_id): btype = ['bill'] elif re.match(r'(S|H)C ', bill_id): btype = ['commemoration'] elif re.match(r'(S|H)JR ', bill_id): btype = ['joint resolution'] elif re.match(r'(S|H)CR ', bill_id): btype = ['concurrent resolution'] else: btype = ['bill'] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=btype ) bill.add_source(url) regex_ns = "http://exslt.org/regular-expressions" version_links = page.xpath( r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]", namespaces={'re': regex_ns}) for link in version_links: bill.add_version_link( link.xpath('string()').strip(), link.attrib['href'], media_type='text/html', on_duplicate='ignore' ) sponsor_links = page.xpath( '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' + '/label[contains(text(), "Sponsors:")]' + '/following-sibling::div[1]/p/a' ) for link in sponsor_links: if link.attrib['href'].startswith('https://sdlegislature.gov/Legislators/'): sponsor_type = 'person' elif link.attrib['href'].startswith( 'https://sdlegislature.gov/Legislative_Session/Committees' ): sponsor_type = 'organization' else: raise ScrapeError( 'Found unexpected sponsor, URL: ' + link.attrib['href'] ) bill.add_sponsorship( link.text, classification='primary', primary=True, entity_type=sponsor_type ) actor = chamber use_row = False for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"): # Some tables have null rows, that are just `<tr></tr>` # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018 if row.text_content() == '': self.debug('Skipping action table row that is completely empty') continue if 'Date' in row.text_content() and 'Action' in row.text_content(): use_row = True continue elif not use_row: continue action = row.xpath("string(td[2])").strip() atypes = [] if action.startswith('First read'): atypes.append('introduction') atypes.append('reading-1') if re.match(r'Signed by (?:the\s)*Governor', action, re.IGNORECASE): atypes.append('executive-signature') actor = 'executive' match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)', action) if match: if match.group(1) in ['Senate', 'House of Representatives']: first = '' else: first = 'committee-' if match.group(3).lower() == 'passed': second = 'passage' elif match.group(3).lower() == 'failed': second = 'failure' atypes.append("%s%s" % (first, second)) if 'referred to' in action.lower(): atypes.append('referral-committee') if 'Motion to amend, Passed Amendment' in action: atypes.append('amendment-introduction') atypes.append('amendment-passage') amd = row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]')[0] version_name = amd.xpath('string(.)') version_url = amd.xpath('@href')[0] if 'htm' in version_url: mimetype = 'text/html' elif 'pdf' in version_url: mimetype = 'application/pdf' bill.add_version_link( version_name, version_url, media_type=mimetype, on_duplicate='ignore' ) if 'Veto override, Passed' in action: atypes.append('veto-override-passage') elif 'Veto override, Failed' in action: atypes.append('veto-override-failure') if 'Delivered to the Governor' in action: atypes.append('executive-receipt') match = re.match("First read in (Senate|House)", action) if match: if match.group(1) == 'Senate': actor = 'upper' else: actor = 'lower' date = row.xpath("string(td[1])").strip() match = re.match(r'\d{2}/\d{2}/\d{4}', date) if not match: self.warning("Bad date: %s" % date) continue date = 
datetime.datetime.strptime(date, "%m/%d/%Y").date() for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"): yield from self.scrape_vote(bill, date, link.attrib['href']) bill.add_action(action, date, chamber=actor, classification=atypes) for link in page.xpath("//a[contains(@href, 'Keyword')]"): bill.add_subject(link.text.strip()) yield bill
def scrape(self) : three_days_ago = datetime.datetime.now() - datetime.timedelta(3) for matter in self.matters(three_days_ago) : matter_id = matter['MatterId'] date = matter['MatterIntroDate'] title = matter['MatterTitle'] identifier = matter['MatterFile'] if not all((date, title, identifier)) : continue bill_session = self.session(self.toTime(date)) bill_type = BILL_TYPES[matter['MatterTypeName']] if identifier.startswith('S'): alternate_identifiers = [identifier] identifier = identifier[1:] else: alternate_identifiers = [] bill = Bill(identifier=identifier, legislative_session=bill_session, title=title, classification=bill_type, from_organization={"name":"Chicago City Council"}) legistar_web = self.legislation_detail_url(matter_id) legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(matter_id) bill.add_source(legistar_web, note='web') bill.add_source(legistar_api, note='api') for identifier in alternate_identifiers: bill.add_identifier(identifier) for action, vote in self.actions(matter_id) : act = bill.add_action(**action) if action['description'] == 'Referred' : body_name = matter['MatterBodyName'] if body_name != 'City Council' : act.add_related_entity(body_name, 'organization', entity_id = _make_pseudo_id(name=body_name)) result, votes = vote if result : vote_event = VoteEvent(legislative_session=bill.legislative_session, motion_text=action['description'], organization=action['organization'], classification=None, start_date=action['date'], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + '/histories') for vote in votes : raw_option = vote['VoteValueName'].lower() clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option) vote_event.vote(clean_option, vote['VotePersonName'].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id) : bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id) : bill.add_subject(topic['MatterIndexName'].strip()) for attachment in self.attachments(matter_id) : if attachment['MatterAttachmentName'] : bill.add_version_link(attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'], media_type="application/pdf") bill.extras = {'local_classification' : matter['MatterTypeName']} text = self.text(matter_id) if text : if text['MatterTextPlain'] : bill.extras['plain_text'] = text['MatterTextPlain'] if text['MatterTextRtf'] : bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '') yield bill
def scrape_bills(self, chamber_to_scrape, session): url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session bill_dir_page = self.get(url) root = lxml.etree.fromstring(bill_dir_page.content) for mr in root.xpath('//LASTACTION/MSRGROUP'): bill_id = mr.xpath('string(MEASURE)').replace(" ", "") if bill_id[0] == "S": chamber = "upper" else: chamber = "lower" bill_type = { 'B': 'bill', 'C': 'concurrent resolution', 'R': 'resolution', 'N': 'nomination' }[bill_id[1]] # just skip past bills that are of the wrong chamber if chamber != chamber_to_scrape: continue link = mr.xpath('string(ACTIONLINK)').replace("..", "") main_doc = mr.xpath('string(MEASURELINK)').replace("../../../", "") main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % ( session, link) try: details_page = self.get(bill_details_url) except scrapelib.HTTPError: self.warning( 'Bill page not loading for {}; skipping'.format(bill_id)) continue page = details_page.content # Some pages have the (invalid) byte 11 sitting around. Just drop # them out. Might as well. details_root = lxml.etree.fromstring(page) title = details_root.xpath('string(//SHORTTITLE)') longtitle = details_root.xpath('string(//LONGTITLE)') bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.extras['summary'] = longtitle bill.add_source(main_doc_url) # sponsors main_sponsor = details_root.xpath('string(//P_NAME)').split() if main_sponsor: main_sponsor = main_sponsor[0] main_sponsor_link = details_root.xpath( 'string(//P_LINK)').replace(" ", "_") main_sponsor_url = ( 'http://billstatus.ls.state.ms.us/%s/' 'pdf/%s') % (session, main_sponsor_link.strip('../')) type = "primary" bill.add_source(main_sponsor_url) bill.add_sponsorship(main_sponsor, classification=type, entity_type='person', primary=True) for author in details_root.xpath('//AUTHORS/ADDITIONAL'): leg = author.xpath('string(CO_NAME)').replace(" ", "_") if leg: leg_url = ('http://billstatus.ls.state.ms.us/%s/' 'pdf/House_authors/%s.xml') % (session, leg) type = "cosponsor" bill.add_source(leg_url) bill.add_sponsorship(leg, classification=type, entity_type='person', primary=False) # Versions curr_version = details_root.xpath('string(//CURRENT_OTHER' ')').replace("../../../../", "") if curr_version != "": curr_version_url = "http://billstatus.ls.state.ms.us/" \ + curr_version bill.add_version_link("Current version", curr_version_url, on_duplicate="ignore", media_type="text/html") intro_version = details_root.xpath( 'string(//INTRO_OTHER)').replace("../../../../", "") if intro_version != "": intro_version_url = "http://billstatus.ls.state.ms.us/"\ + intro_version bill.add_version_link("As Introduced", intro_version_url, on_duplicate='ignore', media_type='text/html') comm_version = details_root.xpath('string(//CMTESUB_OTHER' ')').replace("../../../../", "") if comm_version.find("documents") != -1: comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version bill.add_version_link("Committee Substitute", comm_version_url, on_duplicate='ignore', media_type='text/html') passed_version = details_root.xpath('string(//PASSED_OTHER' ')').replace( "../../../../", "") if passed_version.find("documents") != -1: passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version title = "As Passed the " + chamber bill.add_version_link(title, passed_version_url, on_duplicate='ignore', media_type='text/html') asg_version = 
details_root.xpath('string(//ASG_OTHER)').replace("../../../../", "")
        if asg_version.find("documents") != -1:
            asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
            bill.add_version_link("Approved by the Governor",
                                  asg_version_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')
        # avoid duplicate votes
        seen_votes = set()
        # Actions
        for action in details_root.xpath('//HISTORY/ACTION'):
            # action_num = action.xpath('string(ACT_NUMBER)').strip()
            # action_num = int(action_num)
            act_vote = action.xpath('string(ACT_VOTE)').replace("../../../..", "")
            action_desc = action.xpath('string(ACT_DESC)')
            date, action_desc = action_desc.split(" ", 1)
            date = date + "/" + session[0:4]
            date = datetime.strptime(date, "%m/%d/%Y")
            if action_desc.startswith("(H)"):
                actor = "lower"
                action = action_desc[4:]
            elif action_desc.startswith("(S)"):
                actor = "upper"
                action = action_desc[4:]
            else:
                actor = "executive"
                action = action_desc
            if "Veto" in action and actor == 'executive':
                version_path = details_root.xpath("string(//VETO_OTHER)")
                version_path = version_path.replace("../../../../", "")
                version_url = "http://billstatus.ls.state.ms.us/" + version_path
                bill.add_document_link("Veto", version_url)
            atype = 'other'
            for prefix, prefix_type in self._action_types:
                if action.startswith(prefix):
                    atype = prefix_type
                    break
            bill.add_action(
                action, self._tz.localize(date),
                chamber=actor,
                classification=atype if atype != 'other' else None)
            # use committee names as scraped subjects
            subjects = details_root.xpath('//H_NAME/text()')
            subjects += details_root.xpath('//S_NAME/text()')
            for subject in subjects:
                if subject not in bill.subject:
                    bill.add_subject(subject)
            if act_vote:
                vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                if vote_url not in seen_votes:
                    seen_votes.add(vote_url)
                    yield from self.scrape_votes(vote_url, action,
                                                 date, actor, bill)
        bill.add_source(bill_details_url)
        yield bill
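The prefix loop above iterates self._action_types, which is defined on the class but not shown here. A sketch of the shape that loop expects, with illustrative prefixes rather than Mississippi's real table:

# (prefix, OCD classification) pairs, checked in order with str.startswith()
_action_types = (
    ('Referred To', 'referral-committee'),
    ('Died In Committee', 'committee-failure'),
    ('Transmitted To', 'executive-receipt'),
    ('Approved by Governor', 'executive-signature'),
    ('Vetoed', 'executive-veto'),
)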
def scrape_bill(self, chamber, session, bill_id): # there will be a space in bill_id if we're doing a one-off bill scrape # convert HB 102 into H102 if ' ' in bill_id: bill_id = bill_id[0] + bill_id.split(' ')[-1] # if chamber comes in as House/Senate convert to lower/upper if chamber == 'Senate': chamber = 'upper' elif chamber == 'House': chamber = 'lower' bill_detail_url = ('http://www.ncleg.net/gascripts/' 'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all') % ( session, bill_id) # parse the bill data page, finding the latest html text data = self.get(bill_detail_url).text doc = lxml.html.fromstring(data) doc.make_links_absolute(bill_detail_url) title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0] if 'Joint Resolution' in title_div_txt: bill_type = 'joint resolution' bill_id = bill_id[0] + 'JR ' + bill_id[1:] elif 'Resolution' in title_div_txt: bill_type = 'resolution' bill_id = bill_id[0] + 'R ' + bill_id[1:] elif 'Bill' in title_div_txt: bill_type = 'bill' bill_id = bill_id[0] + 'B ' + bill_id[1:] bill_title = doc.xpath( '/html/body/div/div/main/div[2]/div[contains(@class,"col-12")]/a')[0] bill_title = bill_title.text_content().strip() bill = Bill(bill_id, legislative_session=session, title=bill_title, chamber=chamber, classification=bill_type) bill.add_source(bill_detail_url) # skip first PDF link (duplicate link to cur version) if chamber == 'lower': link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]' else: link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]' for vlink in doc.xpath(link_xpath)[1:]: # get the name from the PDF link... version_name = vlink.text.replace(u'\xa0', ' ') version_url = vlink.attrib['href'] media_type = 'text/html' if version_url.lower().endswith(".pdf"): media_type = 'application/pdf' bill.add_version_link(version_name, version_url, media_type=media_type, on_duplicate='ignore') # rows with a 'adopted' in the text and an amendment link, skip failed amds for row in doc.xpath('//div[@class="card-body"]/div[contains(., "Adopted")' ' and contains(@class,"row")]//a[@title="Amendment"]'): version_url = row.xpath('@href')[0] version_name = row.xpath('string(.)').strip() bill.add_version_link(version_name, version_url, media_type='application/pdf', on_duplicate='ignore') # sponsors spon_row = doc.xpath('//div[contains(text(), "Sponsors")]/following-sibling::div')[0] # first sponsors are primary, until we see (Primary) spon_type = 'primary' spon_lines = spon_row.text_content().replace('\r\n', ';') for leg in spon_lines.split(';'): name = leg.replace(u'\xa0', ' ').strip() if name.startswith('(Primary)') or name.endswith('(Primary)'): name = name.replace('(Primary)', '').strip() spon_type = 'cosponsor' if not name: continue bill.add_sponsorship(name, classification=spon_type, entity_type='person', primary=(spon_type == 'primary')) # keywords kw_row = doc.xpath('//div[contains(text(), "Keywords:")]/following-sibling::div')[0] for subject in kw_row.text_content().split(', '): bill.add_subject(subject) # actions action_tr_xpath = ( '//h6[contains(text(), "History")]' '/ancestor::div[contains(@class, "gray-card")]' '//div[contains(@class, "card-body")]' '/div[@class="row"]' ) # skip two header rows for row in doc.xpath(action_tr_xpath): cols = row.xpath('div') act_date = cols[1].text actor = cols[3].text or '' # if text is blank, try diving in action = (cols[5].text or '').strip() or cols[5].text_content().strip() act_date = dt.datetime.strptime(act_date, '%m/%d/%Y').strftime('%Y-%m-%d') if actor == 'Senate': actor = 'upper' elif actor 
== 'House': actor = 'lower' else: actor = 'executive' for pattern, atype in self._action_classifiers.items(): if action.startswith(pattern): break else: atype = None bill.add_action(action, act_date, chamber=actor, classification=atype) # TODO: Fix vote scraper # yield from self.scrape_votes(bill, doc) yield bill
def scrape(self): three_days_ago = datetime.datetime.now() - datetime.timedelta(3) for matter in self.matters(three_days_ago): matter_id = matter['MatterId'] date = matter['MatterIntroDate'] title = matter['MatterTitle'] if not all((date, title)): continue bill_session = self.session(self.toTime(date)) bill_type = BILL_TYPES[matter['MatterTypeName']] bill = Bill(identifier=matter['MatterFile'], legislative_session=bill_session, title=title, classification=bill_type, from_organization={"name": "Chicago City Council"}) legistar_web = self.legislation_detail_url(matter_id) legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format( matter_id) bill.add_source(legistar_web, note='web') bill.add_source(legistar_api, note='api') for action, vote in self.actions(matter_id): act = bill.add_action(**action) if action['description'] == 'Referred': body_name = matter['MatterBodyName'] if body_name != 'City Council': act.add_related_entity( body_name, 'organization', entity_id=_make_pseudo_id(name=body_name)) result, votes = vote if result: vote_event = VoteEvent( legislative_session=bill.legislative_session, motion_text=action['description'], organization=action['organization'], classification=None, start_date=action['date'], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + '/histories') for vote in votes: raw_option = vote['VoteValueName'].lower() clean_option = self.VOTE_OPTIONS.get( raw_option, raw_option) vote_event.vote(clean_option, vote['VotePersonName'].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id): bill.add_subject(topic['MatterIndexName'].strip()) for attachment in self.attachments(matter_id): if attachment['MatterAttachmentName']: bill.add_version_link( attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'], media_type="application/pdf") bill.extras = {'local_classification': matter['MatterTypeName']} text = self.text(matter_id) if text: if text['MatterTextPlain']: bill.extras['plain_text'] = text['MatterTextPlain'] if text['MatterTextRtf']: bill.extras['rtf_text'] = text['MatterTextRtf'].replace( u'\u0000', '') yield bill
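self.VOTE_OPTIONS above normalizes Legistar's vote labels to the option strings pupa's VoteEvent.vote() accepts. Its contents are not shown; a plausible sketch (the exact Legistar labels are assumptions):

VOTE_OPTIONS = {
    'affirmative': 'yes',
    'negative': 'no',
    'recused': 'excused',
    'non-voting': 'not voting',
    'absent': 'absent',
}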
def scrape_bill(self, bill_id):
    old = self.api('bills/' + bill_id + '?')
    # not needed
    old.pop('id')
    old.pop('state')
    old.pop('level', None)
    old.pop('country', None)
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('action_dates')
    old.pop('+bill_type', None)
    old.pop('+subject', None)
    old.pop('+scraped_subjects', None)
    old.pop('subjects', [])
    classification = old.pop('type')
    # ca weirdness
    if 'fiscal committee' in classification:
        classification.remove('fiscal committee')
    if 'urgency' in classification:
        classification.remove('urgency')
    if 'local program' in classification:
        classification.remove('local program')
    if 'tax levy' in classification:
        classification.remove('tax levy')
    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return
    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if classification == ['concurrent memorial resolution'] and self.state == 'ar':
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']
    if classification == ['address'] and self.state == 'nh':
        classification = ['resolution']
    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'
    chamber = old.pop('chamber')
    if self.state in ('ne', 'dc'):
        chamber = 'legislature'
    elif chamber in ('joint', 'conference'):
        chamber = 'legislature'
    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)
    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')
    for title in old.pop('alternate_titles'):
        new.add_title(title)
    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')
    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'],
                             media_type=doc.pop('mimetype', ''))
    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)
    for spon in old.pop('sponsors'):
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')
    for act in old.pop('actions'):
        actor = act['actor']
        if actor.lower() in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif actor.lower() == 'house' or (
                actor.lower().startswith('lower (') and self.state == 'ca'):
            actor = 'lower'
        elif actor.lower() in ('senate', 'upper') or (
                actor.lower().startswith('upper (') and self.state == 'ca'):
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst',
                       'Became Law w', 'conference') or (
                actor.lower().startswith('legislature (') and self.state == 'ca'):
            actor = 'legislature'
        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'
        # nebraska & DC
        if actor in ('upper', 'council') and self.state in ('ne', 'dc'):
            actor = 'legislature'
        if act['action']:
            newact = new.add_action(
                act['action'], act['date'][:10], chamber=actor,
                classification=[action_types[c] for c in act['type']
                                if c != 'other'])
            # normalize v1 related-entity types to OCD types
            for entity in act.get('related_entities', []):
                if entity['type'] == 'committee':
                    entity['type'] = 'organization'
                elif entity['type'] == 'legislator':
                    entity['type'] = 'person'
                newact.add_related_entity(entity['name'], entity['type'])
    for comp in old.pop('companions', []):
        if self.state in ('nj',
'ny', 'mn'): rtype = 'companion' new.add_related_bill(comp['bill_id'], comp['session'], rtype) for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids', []): new.add_identifier(abid) # generic OpenStates stuff for id in old.pop('all_ids'): new.add_identifier(id, scheme='openstates') for source in old.pop('sources'): source.pop('retrieved', None) new.add_source(**source) ext_title = old.pop('+extended_title', None) if ext_title: new.add_title(ext_title, note='Extended Title') official_title = old.pop('+official_title', None) if official_title: new.add_title(official_title, note='Official Title') to_extras = ['+status', '+final_disposition', '+volume_chapter', '+ld_number', '+referral', '+companion', '+description', '+fiscal_note_probable:', '+preintroduction_required:', '+drafter', '+category:', '+chapter', '+requester', '+transmittal_date:', '+by_request_of', '+bill_draft_number:', '+bill_lr', '+bill_url', '+rcs_num', '+fiscal_note', '+impact_clause', '+fiscal_notes', '+short_title', '+type_', '+conference_committee', 'conference_committee', '+companion_bill_ids', '+additional_information'] for k in to_extras: v = old.pop(k, None) if v: new.extras[k.replace('+', '')] = v # votes vote_no = 1 for vote in old.pop('votes'): vote.pop('id') vote.pop('state') vote.pop('bill_id') vote.pop('bill_chamber', None) vote.pop('+state', None) vote.pop('+country', None) vote.pop('+level', None) vote.pop('+vacant', None) vote.pop('+not_voting', None) vote.pop('+amended', None) vote.pop('+excused', None) vote.pop('+NV', None) vote.pop('+AB', None) vote.pop('+P', None) vote.pop('+V', None) vote.pop('+E', None) vote.pop('+EXC', None) vote.pop('+EMER', None) vote.pop('+present', None) vote.pop('+absent', None) vote.pop('+seconded', None) vote.pop('+moved', None) vote.pop('+vote_type', None) vote.pop('+actual_vote', None) vote.pop('+skip_votes', None) vote.pop('vote_id') vote.pop('+bill_chamber', None) vote.pop('+session', None) vote.pop('+bill_id', None) vote.pop('+bill_session', None) vote.pop('committee', None) vote.pop('committee_id', None) vtype = vote.pop('type', 'passage') if vtype == 'veto_override': vtype = ['veto-override'] elif vtype == 'amendment': vtype = ['amendment-passage'] elif vtype == 'other': vtype = '' else: vtype = ['bill-passage'] # most states need identifiers for uniqueness, just do it everywhere identifier = vote['date'] + '-' + str(vote_no) vote_no += 1 chamber = vote.pop('chamber') if chamber == 'upper' and self.state in ('ne', 'dc'): chamber = 'legislature' elif chamber == 'joint': chamber = 'legislature' newvote = VoteEvent(legislative_session=vote.pop('session'), motion_text=vote.pop('motion'), result='pass' if vote.pop('passed') else 'fail', chamber=chamber, start_date=vote.pop('date'), classification=vtype, bill=new, identifier=identifier) for vt in ('yes', 'no', 'other'): newvote.set_count(vt, vote.pop(vt + '_count')) for name in vote.pop(vt + '_votes'): newvote.vote(vt, name['name']) for source in vote.pop('sources'): source.pop('retrieved', None) newvote.add_source(**source) if not newvote.sources: newvote.sources = new.sources to_extras = ['+record', '+method', 'method', '+filename', 'record', '+action', '+location', '+rcs_num', '+type_', '+threshold', '+other_vote_detail', '+voice_vote'] for k in to_extras: v = vote.pop(k, None) if v: newvote.extras[k.replace('+', '')] = v assert not vote, vote.keys() yield newvote assert not old, old.keys() yield new
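The action_types lookup used above when classifying actions maps OpenStates v1 action types to OCD classifications; its definition is outside this excerpt. A hedged sketch of what such a table looks like (the entries shown are an assumed subset):

action_types = {
    'bill:introduced': 'introduction',
    'bill:reading:1': 'reading-1',
    'bill:reading:2': 'reading-2',
    'bill:reading:3': 'reading-3',
    'bill:passed': 'passage',
    'bill:failed': 'failure',
    'bill:withdrawn': 'withdrawal',
    'committee:referred': 'referral-committee',
    'committee:passed': 'committee-passage',
    'governor:received': 'executive-receipt',
    'governor:signed': 'executive-signature',
    'governor:vetoed': 'executive-veto',
}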
def parse_bill_status_page(self, url, page, session, chamber): # see 2007 HB 2... weird. parsed_url = urllib.parse.urlparse(url) parsed_query = dict(urllib.parse.parse_qsl(parsed_url.query)) bill_id = "{0} {1}".format( parsed_query['P_BLTP_BILL_TYP_CD'], parsed_query['P_BILL_NO1']) try: xp = '//b[text()="Short Title:"]/../following-sibling::td/text()' title = page.xpath(xp).pop() except IndexError: title = page.xpath('//tr[1]/td[2]')[0].text_content() # Add bill type. _bill_id = bill_id.lower() if 'b' in _bill_id: classification = 'bill' elif 'j' in _bill_id or 'jr' in _bill_id: classification = 'joint resolution' elif 'cr' in _bill_id: classification = 'concurrent resolution' elif 'r' in _bill_id: classification = 'resolution' bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=classification) self.add_actions(bill, page) votes = self.add_votes(bill, page, url) tabledata = self._get_tabledata(page) # Add sponsor info. bill.add_sponsorship(tabledata['primary sponsor:'][0], classification='primary', entity_type='person', primary=True) # A various plus fields MT provides. plus_fields = [ 'requester', ('chapter number:', 'chapter'), 'transmittal date:', 'drafter', 'fiscal note probable:', 'bill draft number:', 'preintroduction required:', 'by request of', 'category:'] for x in plus_fields: if isinstance(x, tuple): _key, key = x else: _key = key = x key = key.replace(' ', '_') try: val = tabledata[_key] except KeyError: continue if len(val) == 1: val = val[0] bill.extras[key] = val # Add bill subjects. xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr' subjects = [] for tr in page.xpath(xp): try: subj = tr.xpath('td')[0].text_content() except IndexError: continue subjects.append(subj) for s in subjects: bill.add_subject(s) self.add_fiscal_notes(page, bill) return bill, list(votes)
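Unlike most functions here, parse_bill_status_page() returns its results instead of yielding them, so a caller has to re-yield. A minimal sketch of such a call site (the scrape_bill method name and surrounding plumbing are assumed):

import lxml.html

def scrape_bill(self, url, session, chamber):
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    bill, votes = self.parse_bill_status_page(url, page, session, chamber)
    yield bill
    yield from votes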
def scrape(self):
    state = 'MN'
    session = self.jurisdiction.legislative_sessions[0]
    apiKey = 'd2c0db7e-6a6e-4606-a9b0-83c18e647ff6'
    pyopenstates.set_api_key(apiKey)
    bills_upper = pyopenstates.search_bills(state=state, chamber="upper",
                                            updated_since="2017-01-01")
    bills_lower = pyopenstates.search_bills(state=state, chamber="lower",
                                            updated_since="2017-01-01")
    for b in bills_lower:
        number = b['bill_id']
        title = b['title']
        bill_id = b['id']
        dbill = pyopenstates.get_bill(bill_id)
        url = dbill['sources'][0]['url']
        # these are House results, so the chamber is 'lower'
        bill = Bill(identifier=number,
                    legislative_session=session['identifier'],
                    title=title,
                    classification=b['type'][0],
                    chamber='lower')
        bill.add_source(url)
        bill.add_identifier(bill_id, scheme='openstatesv1')
        subjects = b['subjects']
        for s in subjects:
            bill.add_subject(s)
        sponsors = dbill['sponsors']
        for sponsor in sponsors:
            if sponsor['leg_id'] is not None:
                l = pyopenstates.get_legislator(sponsor['leg_id'])
                full_name = l['full_name'].split(' ')
                if len(full_name) == 3:
                    full_name.pop(1)
                full_name = ' '.join(full_name)
                primary = sponsor['type'] == 'primary'
                try:
                    bill.add_sponsorship(name=full_name,
                                         classification=sponsor['type'],
                                         entity_type='person',
                                         primary=primary)
                except Exception:
                    pass
        actions = dbill['actions']
        for act in actions:
            action = act['action']
            actor = act['actor']
            date = tz.localize(datetime.strptime(act['date'], DATE_FORMAT))
            Action_Type = act['type']  # classification data, unused here
            bill.add_action(action, date, chamber=actor)
        action_dates = dbill['action_dates']
        for k, v in action_dates.items():
            if '_' in k:
                chamber = k.split('_')[1]
            elif k == 'signed':
                chamber = 'executive'
            else:
                chamber = None
            k = k.replace('_', ' ')
            if v is not None and k not in ('first', 'last'):
                bill.add_action(k, tz.localize(v), chamber=chamber)
        yield bill
    for b in bills_upper:
        number = b['bill_id']
        title = b['title']
        bill_id = b['id']
        dbill = pyopenstates.get_bill(bill_id)
        url = dbill['sources'][0]['url']
        bill = Bill(identifier=number,
                    legislative_session=session['identifier'],
                    title=title,
                    classification=b['type'][0],
                    chamber='upper')
        bill.add_source(url)
        bill.add_identifier(bill_id, scheme='openstatesv1')
        subjects = b['subjects']
        for s in subjects:
            bill.add_subject(s)
        sponsors = dbill['sponsors']
        for sponsor in sponsors:
            if sponsor['leg_id'] is not None:
                l = pyopenstates.get_legislator(sponsor['leg_id'])
                full_name = l['full_name'].split(' ')
                if len(full_name) == 3:
                    full_name.pop(1)
                full_name = ' '.join(full_name)
                primary = sponsor['type'] == 'primary'
                try:
                    bill.add_sponsorship(name=full_name,
                                         classification=sponsor['type'],
                                         entity_type='person',
                                         primary=primary)
                except Exception:
                    pass
        actions = dbill['actions']
        for act in actions:
            action = act['action']
            actor = act['actor']
            date = tz.localize(datetime.strptime(act['date'], DATE_FORMAT))
            Action_Type = act['type']  # classification data, unused here
            bill.add_action(action, date, chamber=actor)
        action_dates = dbill['action_dates']
        for k, v in action_dates.items():
            if '_' in k:
                chamber = k.split('_')[1]
            elif k == 'signed':
                chamber = 'executive'
            else:
                chamber = None
            k = k.replace('_', ' ')
            if v is not None and k not in ('first', 'last'):
                bill.add_action(k, tz.localize(v), chamber=chamber)
        yield bill
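The API key above is committed in source. A safer sketch reads it from the environment instead; OPENSTATES_API_KEY is an assumed variable name, and pyopenstates.set_api_key() is the same call the scraper already uses:

import os
import pyopenstates

# Fail fast with a KeyError if the key is not configured.
pyopenstates.set_api_key(os.environ['OPENSTATES_API_KEY'])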
def scrape(self, window=28, matter_ids=None):
    '''By default, scrape board reports updated in the last 28 days.
    Optionally specify a larger or smaller window of time from which to
    scrape updates, or specific matters to scrape.
    Note that passing a value for :matter_ids supersedes the value of
    :window, such that the given matters will be scraped regardless of
    when they were updated.

    Optional parameters
    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape
    '''
    if matter_ids:
        matters = [self.matter(matter_id)
                   for matter_id in matter_ids.split(',')]
        matters = filter(None, matters)  # Skip matters that are not yet in Legistar
    elif float(window):  # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    for matter in matters:
        # Skip this bill, until Metro cleans up duplicate in Legistar API
        if matter['MatterFile'] == '2017-0447':
            continue
        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']
        if not all((date, title, identifier)):
            continue
        # Do not scrape private bills introduced before this timestamp.
        if self._is_restricted(matter) and (date < self.START_DATE_PRIVATE_SCRAPE):
            continue
        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []
        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})
        # The Metro scraper scrapes private bills.
        # However, we do not want to capture significant data about private
        # bills, other than the value of the helper function `_is_restricted`
        # and a last modified timestamp.
        # We yield private bills early, wipe data from previously imported
        # once-public bills, and include only data *required* by the pupa
        # schema.
        # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
        bill.extras = {'restrict_view': self._is_restricted(matter)}
        # Add API source early.
        # Private bills should have this url for debugging.
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
        bill.add_source(legistar_api, note='api')
        if self._is_restricted(matter):
            # required fields
            bill.title = 'Restricted View'
            # wipe old data
            bill.extras['plain_text'] = ''
            bill.extras['rtf_text'] = ''
            bill.sponsorships = []
            bill.related_bills = []
            bill.versions = []
            bill.documents = []
            bill.actions = []
            yield bill
            continue
        legistar_web = matter['legistar_url']
        bill.add_source(legistar_web, note='web')
        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)
        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)
            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name, 'organization',
                    entity_id=_make_pseudo_id(name=body_name))
            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)
                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')
                for vote in votes:
                    try:
                        raw_option = vote['VoteValueName'].lower()
                    except AttributeError:
                        raw_option = None
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option, vote['VotePersonName'].strip())
                yield vote_event
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())
        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill.
                # Then, we can find the 'MatterFile' (i.e., identifier) and
                # the 'MatterIntroDate' (i.e., to determine its legislative
                # session).
                # Sometimes, the related bill does not yet exist: in this
                # case, throw an error, and continue.
                related_bill = self.endpoint('/matters/{0}',
                                             relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')
                # Currently, the relation type for bills can be one of a few
                # possibilities: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting
                # that they receive a relation of 'companion'.
        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
            media_type="application/pdf")
        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'].strip(),
                    media_type="application/pdf")
        bill.extras['local_classification'] = matter['MatterTypeName']
        matter_version_value = matter['MatterVersion']
        text = self.text(matter_id, matter_version_value)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')
        yield bill
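_is_restricted() is used throughout the Metro scraper above but defined elsewhere. A minimal sketch, assuming it keys off the Legistar API's MatterRestrictViewViaWeb flag (an assumption; the real helper may consult more fields):

def _is_restricted(self, matter):
    # Treat a matter as private when Legistar restricts web viewing.
    return bool(matter.get('MatterRestrictViewViaWeb'))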
def scrape_bill(self, chamber, session, bill_id): # there will be a space in bill_id if we're doing a one-off bill scrape # convert HB 102 into H102 if ' ' in bill_id: bill_id = bill_id[0] + bill_id.split(' ')[-1] # if chamber comes in as House/Senate convert to lower/upper if chamber == 'Senate': chamber = 'upper' elif chamber == 'House': chamber = 'lower' bill_detail_url = ( 'http://www.ncleg.net/gascripts/' 'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all' ) % (session, bill_id) # parse the bill data page, finding the latest html text data = self.get(bill_detail_url).text doc = lxml.html.fromstring(data) doc.make_links_absolute(bill_detail_url) title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0] if 'Joint Resolution' in title_div_txt: bill_type = 'joint resolution' bill_id = bill_id[0] + 'JR ' + bill_id[1:] elif 'Resolution' in title_div_txt: bill_type = 'resolution' bill_id = bill_id[0] + 'R ' + bill_id[1:] elif 'Bill' in title_div_txt: bill_type = 'bill' bill_id = bill_id[0] + 'B ' + bill_id[1:] bill_title = doc.xpath( '//div[contains(@class, "h5")]')[0].text_content().strip() bill = Bill(bill_id, legislative_session=session, title=bill_title, chamber=chamber, classification=bill_type) bill.add_source(bill_detail_url) # skip first PDF link (duplicate link to cur version) if chamber == 'lower': link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]' else: link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]' for vlink in doc.xpath(link_xpath)[1:]: # get the name from the PDF link... version_name = vlink.text.replace(u'\xa0', ' ') version_url = vlink.attrib['href'] media_type = 'text/html' if version_url.lower().endswith(".pdf"): media_type = 'application/pdf' bill.add_version_link(version_name, version_url, media_type=media_type, on_duplicate='ignore') # sponsors spon_row = doc.xpath( '//div[contains(text(), "Sponsors")]/following-sibling::div')[0] # first sponsors are primary, until we see (Primary) spon_type = 'primary' for leg in spon_row.text_content().split(';'): name = leg.replace(u'\xa0', ' ').strip() if name.startswith('(Primary)'): name = name.replace('(Primary)', '').strip() spon_type = 'cosponsor' if not name: continue bill.add_sponsorship(name, classification=spon_type, entity_type='person', primary=(spon_type == 'primary')) # keywords kw_row = doc.xpath( '//div[contains(text(), "Keywords:")]/following-sibling::div')[0] for subject in kw_row.text_content().split(', '): bill.add_subject(subject) # actions action_tr_xpath = ('//h6[contains(text(), "History")]' '/ancestor::div[contains(@class, "gray-card")]' '//div[contains(@class, "card-body")]' '/div[@class="row"]') # skip two header rows for row in doc.xpath(action_tr_xpath): cols = row.xpath('div') act_date = cols[1].text actor = cols[3].text or '' # if text is blank, try diving in action = (cols[5].text or '').strip() or cols[5].text_content().strip() act_date = dt.datetime.strptime(act_date, '%m/%d/%Y').strftime('%Y-%m-%d') if actor == 'Senate': actor = 'upper' elif actor == 'House': actor = 'lower' else: actor = 'executive' for pattern, atype in self._action_classifiers.items(): if action.startswith(pattern): break else: atype = None bill.add_action(action, act_date, chamber=actor, classification=atype) yield from self.scrape_votes(bill, doc) yield bill
def scrape(self, session=None): if not session: session = self.latest_session() self.info("no session specified, using %s", session) self._bill_prefix_map = { "HB": {"type": "bill", "url_segment": "bills/house"}, "HR": {"type": "resolution", "url_segment": "resolutions/house/simple"}, "HCR": { "type": "concurrent resolution", "url_segment": "resolutions/house/concurrent", }, "HJR": { "type": "joint resolution", "url_segment": "resolutions/house/joint", }, "HC": { "type": "concurrent resolution", "url_segment": "resolutions/house/concurrent", }, "HJ": { "type": "joint resolution", "url_segment": "resolutions/house/joint", }, "SB": {"type": "bill", "url_segment": "bills/senate"}, "SR": {"type": "resolution", "url_segment": "resolutions/senate/simple"}, "SCR": { "type": "concurrent resolution", "url_segment": "resolutions/senate/concurrent", }, "SJR": { "type": "joint resolution", "url_segment": "resolutions/senate/joint", }, "SC": { "type": "concurrent resolution", "url_segment": "resolutions/senate/concurrent", }, "SJ": { "type": "joint resolution", "url_segment": "resolutions/senate/joint", }, } api_base_url = "https://api.iga.in.gov" proxy = {"url": "http://in-proxy.openstates.org"} # ah, indiana. it's really, really hard to find # pdfs in their web interface. Super easy with # the api, but a key needs to be passed # in the headers. To make these documents # viewable to the public and our scrapers, # sunlight's put up a proxy service at this link # using our api key for pdf document access. client = ApiClient(self) r = client.get("bills", session=session) all_pages = client.unpaginate(r) for b in all_pages: bill_id = b["billName"] for idx, char in enumerate(bill_id): try: int(char) except ValueError: continue disp_bill_id = bill_id[:idx] + " " + str(int(bill_id[idx:])) break bill_link = b["link"] api_source = api_base_url + bill_link try: bill_json = client.get("bill", session=session, bill_id=bill_id.lower()) except scrapelib.HTTPError: self.logger.warning("Bill could not be accessed. 
Skipping.") continue title = bill_json["title"] if title == "NoneNone": title = None # sometimes title is blank # if that's the case, we can check to see if # the latest version has a short description if not title: title = bill_json["latestVersion"]["shortDescription"] # and if that doesn't work, use the bill_id but throw a warning if not title: title = bill_id self.logger.warning("Bill is missing a title, using bill id instead.") bill_prefix = self._get_bill_id_components(bill_id)[0] original_chamber = ( "lower" if bill_json["originChamber"].lower() == "house" else "upper" ) bill_type = self._bill_prefix_map[bill_prefix]["type"] bill = Bill( disp_bill_id, legislative_session=session, chamber=original_chamber, title=title, classification=bill_type, ) bill.add_source(self._get_bill_url(session, bill_id)) bill.add_source(api_source) # sponsors for s in bill_json["authors"]: bill.add_sponsorship( classification="author", name=self._get_name(s), entity_type="person", primary=True, ) for s in bill_json["coauthors"]: bill.add_sponsorship( classification="coauthor", name=self._get_name(s), entity_type="person", primary=False, ) for s in bill_json["sponsors"]: bill.add_sponsorship( classification="sponsor", name=self._get_name(s), entity_type="person", primary=True, ) for s in bill_json["cosponsors"]: bill.add_sponsorship( classification="cosponsor", name=self._get_name(s), entity_type="person", primary=False, ) # actions action_link = bill_json["actions"]["link"] api_source = api_base_url + action_link try: actions = client.get( "bill_actions", session=session, bill_id=bill_id.lower() ) except scrapelib.HTTPError: self.logger.warning("Could not find bill actions page") actions = {"items": []} for a in actions["items"]: action_desc = a["description"] if "governor" in action_desc.lower(): action_chamber = "executive" elif a["chamber"]["name"].lower() == "house": action_chamber = "lower" else: action_chamber = "upper" date = a["date"] if not date: self.logger.warning("Action has no date, skipping") continue # convert time to pupa fuzzy time date = date.replace("T", " ") # TODO: if we update pupa to accept datetimes we can drop this line date = date.split()[0] action_type = [] d = action_desc.lower() committee = None reading = False if "first reading" in d: action_type.append("reading-1") reading = True if "second reading" in d or "reread second time" in d: action_type.append("reading-2") reading = True if "third reading" in d or "reread third time" in d: action_type.append("reading-3") if "passed" in d: action_type.append("passage") if "failed" in d: action_type.append("failure") reading = True if "adopted" in d and reading: action_type.append("passage") if ( "referred" in d and "committee on" in d or "reassigned" in d and "committee on" in d ): committee = d.split("committee on")[-1].strip() action_type.append("referral-committee") if "committee report" in d: if "pass" in d: action_type.append("committee-passage") if "fail" in d: action_type.append("committee-failure") if "amendment" in d and "without amendment" not in d: if "pass" in d or "prevail" in d or "adopted" in d: action_type.append("amendment-passage") if "fail" or "out of order" in d: action_type.append("amendment-failure") if "withdraw" in d: action_type.append("amendment-withdrawal") if "signed by the governor" in d: action_type.append("executive-signature") if len(action_type) == 0: # calling it other and moving on with a warning self.logger.warning( "Could not recognize an action in '{}'".format(action_desc) ) action_type = None a = 
bill.add_action( chamber=action_chamber, description=action_desc, date=date, classification=action_type, ) if committee: a.add_related_entity(committee, entity_type="organization") # subjects subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]] for subject in subjects: bill.add_subject(subject) # versions and votes for version in bill_json["versions"][::-1]: try: version_json = client.get( "bill_version", session=session, bill_id=version["billName"], version_id=version["printVersionName"], ) except scrapelib.HTTPError: self.logger.warning("Bill version does not seem to exist.") continue yield from self.deal_with_version( version_json, bill, bill_id, original_chamber, session, proxy ) yield bill
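# A minimal, self-contained sketch of the amendment branch of the action
# classifier above, pulled out as a pure function so the operator-precedence
# fix ("fail" in d or "out of order" in d) is easy to unit-test. The name and
# inputs are illustrative, not part of the scraper's API.
def classify_amendment_action(description):
    d = description.lower()
    types = []
    if "amendment" in d and "without amendment" not in d:
        if "pass" in d or "prevail" in d or "adopted" in d:
            types.append("amendment-passage")
        if "fail" in d or "out of order" in d:
            types.append("amendment-failure")
        if "withdraw" in d:
            types.append("amendment-withdrawal")
    return types

assert classify_amendment_action("Amendment 1 prevailed") == ["amendment-passage"]
assert classify_amendment_action("Amendment 2 ruled out of order") == ["amendment-failure"]
assert classify_amendment_action("Passed without amendment") == []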
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url): """ Extracts all the requested info for a given bill and yields the resulting Bill object. """ chamber = 'lower' if chamber.lower() == 'house' else chamber chamber = 'upper' if chamber.lower() == 'senate' else chamber # Get html and parse doc = self.lxmlize(bill_detail_url) # Check if bill hasn't been transmitted to the other chamber yet transmit_check = self.get_node( doc, '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()' ) if (transmit_check is not None and 'has not been transmitted' in transmit_check.strip()): self.logger.debug('Bill has not been transmitted to other chamber ' '... skipping {0}'.format(bill_detail_url)) return # Get the basic parts of the bill bill_id = self.get_node(doc, '//h1[contains(@class,"card-title float-left mr-4")]/text()') self.logger.debug(bill_id) bill_title_text = self.get_node( doc, '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()' ) if bill_title_text is not None: bill_title = bill_title_text.strip() else: long_desc_url = self.get_node( doc, '//a[text()[contains(.,"Long Description")]]/@href' ) long_desc_page = self.lxmlize(long_desc_url) long_desc_text = self.get_node(long_desc_page, '//h1/' 'following-sibling::p/text()') if long_desc_text is not None: bill_title = long_desc_text.strip() else: bill_title = 'No title found.' self.logger.warning('No title found for {}.'.format(bill_id)) self.logger.debug(bill_title) bill_type = {'F': 'bill', 'R': 'resolution', 'C': 'concurrent resolution'}[bill_id[1].upper()] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=bill_title, classification=bill_type) # Add source bill.add_source(bill_detail_url) for subject in self._subject_mapping[bill_id]: bill.add_subject(subject) # Get companion bill. companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]' '/a[starts-with(@href, "?")]/text()') companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None if companion is not None: companion_chamber = self.chamber_from_bill(companion) bill.add_companion(companion, chamber=companion_chamber) # Grab sponsors bill = self.extract_sponsors(bill, doc, chamber) # Add Actions performed on the bill. bill = self.extract_actions(bill, doc, chamber) # Get all versions of the bill. bill = self.extract_versions(bill, doc, chamber, version_list_url) yield bill
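# Illustrative companion to the {'F': 'bill', ...} lookup above: MN bill ids
# like "HF 1" or "SC 12" encode the type in their second character. The
# helper name and the 'bill' fallback are assumptions for this sketch only.
MN_BILL_TYPES = {'F': 'bill', 'R': 'resolution', 'C': 'concurrent resolution'}

def mn_bill_type(bill_id):
    return MN_BILL_TYPES.get(bill_id[1].upper(), 'bill')

assert mn_bill_type('HF 1') == 'bill'
assert mn_bill_type('SC 12') == 'concurrent resolution'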
def scrape_bill(self, session, bill_url): page = self.get(bill_url).text page = lxml.html.fromstring(page) page.make_links_absolute(bill_url) try: bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text except IndexError: self.logger.warning("Something is wrong with bill page, skipping.") return secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]') # checking if there is a matching bill if secondary_bill_id: secondary_bill_id = secondary_bill_id[0].text # swap ids if * is in secondary_bill_id if "*" in secondary_bill_id: bill_id, secondary_bill_id = secondary_bill_id, bill_id secondary_bill_id = secondary_bill_id.strip() secondary_bill_id = secondary_bill_id.replace("\xa0", " ") # normalize non-breaking spaces bill_id = bill_id.replace("*", "").replace("\xa0", " ").strip() if "B" in bill_id: bill_type = "bill" elif "JR" in bill_id: bill_type = "joint resolution" elif "R" in bill_id: bill_type = "resolution" primary_chamber = "lower" if "H" in bill_id else "upper" # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower' title = page.xpath("//span[@id='lblAbstract']")[0].text if title is None: msg = "%s detail page was missing title info." self.logger.warning(msg % bill_id) return # bill subject subject_pos = title.find("-") subjects = [s.strip() for s in title[: subject_pos - 1].split(",")] subjects = filter(None, subjects) bill = Bill( bill_id, legislative_session=session, chamber=primary_chamber, title=title, classification=bill_type, ) for subject in subjects: bill.add_subject(subject) if secondary_bill_id: bill.add_identifier(secondary_bill_id) if page.xpath('//span[@id="lblCompNumber"]/a'): companion_id = page.xpath('//span[@id="lblCompNumber"]/a')[0].text_content().strip() bill.add_related_bill( identifier=companion_id, legislative_session=session, relation_type="companion", ) bill.add_source(bill_url) # Primary Sponsor sponsor = ( page.xpath("//span[@id='lblBillPrimeSponsor']")[0] .text_content() .split("by")[-1] ) sponsor = sponsor.replace("*", "").strip() if sponsor: bill.add_sponsorship( sponsor, classification="primary", entity_type="person", primary=True ) # bill text btext = page.xpath("//span[@id='lblBillNumber']/a")[0] bill.add_version_link( "Current Version", btext.get("href"), media_type="application/pdf" ) # documents summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]') if summary: bill.add_document_link("Summary", summary[0].get("href")) fiscal = page.xpath('//span[@id="lblFiscalNote"]//a') if fiscal: bill.add_document_link("Fiscal Note", fiscal[0].get("href")) amendments = page.xpath('//a[contains(@href, "/Amend/")]') for amendment in amendments: bill.add_document_link("Amendment " + amendment.text, amendment.get("href")) # amendment notes in image with alt text describing doc inside <a> amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]') for afn in amend_fns: bill.add_document_link( afn.get("alt"), afn.getparent().get("href"), on_duplicate="ignore" ) # actions atable = page.xpath("//table[@id='gvBillActionHistory']")[0] actions_from_table(bill, atable) # if there is a matching bill if secondary_bill_id: # secondary sponsor secondary_sponsor = ( page.xpath("//span[@id='lblCompPrimeSponsor']")[0] .text_content() .split("by")[-1] ) secondary_sponsor = ( secondary_sponsor.replace("*", "").replace(")", "").strip() ) # Skip blank-name sponsors. 
if secondary_sponsor: bill.add_sponsorship( secondary_sponsor, classification="primary", entity_type="person", primary=True, ) # secondary actions cotable = page.xpath("//table[@id='gvCoActionHistory']")[0] actions_from_table(bill, cotable) # votes yield from self.scrape_vote_events(bill, page, bill_url) bill.actions.sort(key=lambda a: a["date"]) yield bill
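# Sketch of the id-swap rule used above: on TN bill pages the id marked with
# "*" is the primary bill, so when the companion carries the star the two ids
# trade places before cleanup. Pure function for clarity; illustrative only.
def order_tn_ids(bill_id, secondary_bill_id):
    if secondary_bill_id and "*" in secondary_bill_id:
        bill_id, secondary_bill_id = secondary_bill_id, bill_id
    bill_id = bill_id.replace("*", "").strip()
    if secondary_bill_id:
        secondary_bill_id = secondary_bill_id.replace("*", "").strip()
    return bill_id, secondary_bill_id

assert order_tn_ids("HB 1", "*SB 2") == ("SB 2", "HB 1")
assert order_tn_ids("*HB 1", "SB 2") == ("HB 1", "SB 2")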
def scrape_matter(self, matter_link, sess): matter_types = { "Additions": "other", "Administrative Order": "order", "Annual Evaluation": "other", "Bid Advertisement": "other", "Bid Awards": "other", "Bid Contract": "contract", "Bid Protest": "other", "Bid Rejection": "other", "Birthday Scroll": "commemoration", "Certificate of Appreciation": "commemoration", "Change Order": "order", "Citizen's Presentation": "other", "Commendation": "commemoration", "Conflict Waiver": "other", "Congratulatory Certificate": "commemoration", "Deferrals": "other", "Discussion Item": "other", "Distinguished Visitor": "other", "Joint Meeting/Workshop": "other", "Mayoral Veto": "other", "Miscellaneous": "other", "Nomination": "nomination", "Oath of Office": "other", "Omnibus Reserve": "bill", "Ordinance": "ordinance", "Plaque": "commemoration", "Presentation": "other", "Proclamation": "proclamation", "Professional Service Agreement": "contract", "Public Hearing": "other", "Report": "other", "Request for Proposals": "other", "Request for Qualifications": "other", "Request to Advertise": "other", "Resolution": "resolution", "Resolution of Sympathy": "resolution", "Service Awards": "commemoration", "Special Item": "other", "Special Presentation": "other", "Supplement": "other", "Swearing-In": "other", "Time Sensitive Items": "other", "Withdrawals": "other", "Workshop Item": "other", "Zoning": "other", "Zoning Resolution": "resolution" } matter_doc = self.lxmlize(matter_link) info_dict = self.matter_table_to_dict(matter_doc) # the session identifier comes from the sess argument; the intro date # is parsed here in case we later derive sessions from it intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y") session = sess["identifier"] category = matter_types[info_dict["File Type"]] if 'File Name' in info_dict: title = info_dict["File Name"] elif "Title" in info_dict and info_dict["Title"].strip(): title = info_dict["Title"].strip() else: self.warning("bill has no title") return if category == 'other': bill = Bill(identifier=info_dict["File Number"], legislative_session=session, title=title) else: bill = Bill(identifier=info_dict["File Number"], legislative_session=session, title=title, classification=category) for spons in info_dict["Sponsors"]: if spons == "NONE": continue try: name, spons_type = spons.rsplit(",", 1) except ValueError: name = spons spons_type = "Sponsor" primary = True if "Prime Sponsor" in spons_type else False entity = "person" if "committee" in name: entity = "organization" bill.add_sponsorship(name, spons_type, entity, primary) if "Indexes" in info_dict: for subj in info_dict["Indexes"]: if subj.strip() and subj.strip() != "NONE": bill.add_subject(subj.strip()) if "Title" in info_dict and info_dict["Title"].strip(): note = "bill's long title" if ("Note" in info_dict and info_dict["Note"].strip()): note = info_dict["Note"] bill.add_abstract(abstract=info_dict["Title"], note=note) self.process_action_table(matter_doc, bill) bill.add_source(matter_link, note='web') yield bill
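# Sketch of the sponsor-string handling above: entries look like
# "Jane Doe, Prime Sponsor" or just a bare name, so split on the last comma
# and default the type when it is missing. The case-insensitive committee
# check is a small liberty taken for the sketch; the names are made up.
def parse_sponsor(spons):
    try:
        name, spons_type = spons.rsplit(",", 1)
    except ValueError:
        name, spons_type = spons, "Sponsor"
    primary = "Prime Sponsor" in spons_type
    entity = "organization" if "committee" in name.lower() else "person"
    return name.strip(), spons_type.strip(), entity, primary

assert parse_sponsor("Jane Doe, Prime Sponsor") == ("Jane Doe", "Prime Sponsor", "person", True)
assert parse_sponsor("Finance Committee") == ("Finance Committee", "Sponsor", "organization", False)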
def parse_bill_status_page(self, status_url, bill_url, session, chamber): status_page = lxml.html.fromstring(self.get(status_url).text) status_page.make_links_absolute(status_url) # see 2007 HB 2... weird. bill_re = r'.*?/([A-Z]+)0*(\d+)\.pdf' bill_xpath = '//a[contains(@href, ".pdf") and contains(@href, "billpdf")]/@href' bill_id = re.search(bill_re, status_page.xpath(bill_xpath)[0], re.IGNORECASE).groups() bill_id = "{0} {1}".format(bill_id[0], int(bill_id[1])) try: xp = '//b[text()="Short Title:"]/../following-sibling::td/text()' title = status_page.xpath(xp).pop() except IndexError: title = status_page.xpath('//tr[1]/td[2]')[0].text_content() # Add bill type. _bill_id = bill_id.lower() if 'b' in _bill_id: classification = 'bill' elif 'j' in _bill_id or 'jr' in _bill_id: classification = 'joint resolution' elif 'cr' in _bill_id: classification = 'concurrent resolution' elif 'r' in _bill_id: classification = 'resolution' else: classification = 'bill' # defensive fallback for ids matching none of the patterns above bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=classification) self.add_actions(bill, status_page) votes = self.add_votes(bill, status_page, status_url) tabledata = self._get_tabledata(status_page) # Add sponsor info. bill.add_sponsorship(tabledata['primary sponsor:'][0], classification='primary', entity_type='person', primary=True) # Add the various extra fields MT provides. plus_fields = [ 'requester', ('chapter number:', 'chapter'), 'transmittal date:', 'drafter', 'fiscal note probable:', 'bill draft number:', 'preintroduction required:', 'by request of', 'category:'] for x in plus_fields: if isinstance(x, tuple): _key, key = x else: _key = key = x key = key.replace(' ', '_') try: val = tabledata[_key] except KeyError: continue if len(val) == 1: val = val[0] bill.extras[key] = val # Add bill subjects. xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr' subjects = [] for tr in status_page.xpath(xp): try: subj = tr.xpath('td')[0].text_content() except IndexError: continue subjects.append(subj) for s in subjects: bill.add_subject(s) self.add_fiscal_notes(status_page, bill) return bill, list(votes)
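# Sketch of the bill-type inference above as a pure function. Branch order
# matters: 'b' catches plain bills before the resolution checks, and the
# trailing default mirrors the defensive fallback added above. Illustrative.
def mt_classification(bill_id):
    _id = bill_id.lower()
    if 'b' in _id:
        return 'bill'
    elif 'j' in _id or 'jr' in _id:
        return 'joint resolution'
    elif 'cr' in _id:
        return 'concurrent resolution'
    elif 'r' in _id:
        return 'resolution'
    return 'bill'

assert mt_classification('HB 2') == 'bill'
assert mt_classification('SJ 5') == 'joint resolution'
assert mt_classification('SR 9') == 'resolution'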
def scrape(self, window=30): n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window)) self.retry_wait_seconds = 20 for matter in self.matters(n_days_ago): matter_id = matter["MatterId"] date = matter["MatterIntroDate"] title = matter["MatterTitle"] identifier = matter["MatterFile"] # If a bill has a duplicate action item that's causing the entire scrape # to fail, add it to the `problem_bills` array to skip it. # For the time being...nothing to skip! problem_bills = [] if identifier in problem_bills: continue if not all((date, title, identifier)): continue bill_session = self.session(self.toTime(date)) if matter["MatterTypeName"] in BILL_TYPES: ocd_bill_type = BILL_TYPES[matter["MatterTypeName"]] else: ocd_bill_type = None if identifier.startswith("S"): alternate_identifiers = [identifier] identifier = identifier[1:] else: alternate_identifiers = [] bill = Bill(identifier=identifier, legislative_session=bill_session, title=title, classification=ocd_bill_type, from_organization={"name": "Pittsburgh City Council"}) legistar_web = matter["legistar_url"] legistar_api = "http://webapi.legistar.com/v1/pittsburgh/matters/{0}".format(matter_id) bill.add_source(legistar_web, note="web") bill.add_source(legistar_api, note="api") for identifier in alternate_identifiers: bill.add_identifier(identifier) for action, vote in self.actions(matter_id): responsible_person = action.pop("responsible person") act = bill.add_action(**action) if responsible_person: act.add_related_entity(responsible_person, "person", entity_id=_make_pseudo_id(name=responsible_person)) if action["description"] == "Referred": body_name = matter["MatterBodyName"] if body_name != "City Council": act.add_related_entity(body_name, "organization", entity_id=_make_pseudo_id(name=body_name)) result, votes = vote if result: vote_event = VoteEvent(legislative_session=bill.legislative_session, motion_text=action["description"], organization=action["organization"], classification=None, start_date=action["date"], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + "/histories") for vote in votes: raw_option = vote["VoteValueName"].lower() clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option) vote_event.vote(clean_option, vote["VotePersonName"].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id): bill.add_subject(topic["MatterIndexName"].strip()) for attachment in self.attachments(matter_id): if attachment["MatterAttachmentName"]: bill.add_version_link(attachment["MatterAttachmentName"], attachment["MatterAttachmentHyperlink"], media_type="application/pdf") bill.extras = {"local_classification": matter["MatterTypeName"]} text = self.text(matter_id) if text: if text["MatterTextPlain"]: bill.extras["plain_text"] = text["MatterTextPlain"] if text["MatterTextRtf"]: bill.extras["rtf_text"] = text["MatterTextRtf"].replace(u"\u0000", "") yield bill
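# Sketch of the identifier normalization above: a Legistar file id prefixed
# with "S" is kept as an alternate identifier while the canonical id drops
# the prefix. The prefix is treated as opaque here; the scraper does not say
# what it stands for. Helper name is illustrative.
def split_identifier(identifier):
    if identifier.startswith("S"):
        return identifier[1:], [identifier]
    return identifier, []

assert split_identifier("S2019-1234") == ("2019-1234", ["S2019-1234"])
assert split_identifier("2019-1234") == ("2019-1234", [])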
def scrape_details(self, bill_detail_url, session, chamber, bill_id): """ Create the Bill, add the information obtained from the provided bill_detail_url, and then yield the bill object. :param bill_detail_url: :param session: :param chamber: :param bill_id: :return: """ page = self.get(bill_detail_url).text if 'INVALID BILL NUMBER' in page: self.warning('INVALID BILL %s' % bill_detail_url) return doc = lxml.html.fromstring(page) doc.make_links_absolute(bill_detail_url) bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0] bill_type = bill_div.xpath('span/text()')[0] if 'General Bill' in bill_type: bill_type = 'bill' elif 'Concurrent Resolution' in bill_type: bill_type = 'concurrent resolution' elif 'Joint Resolution' in bill_type: bill_type = 'joint resolution' elif 'Resolution' in bill_type: bill_type = 'resolution' else: raise ValueError('unknown bill type: %s' % bill_type) # this is fragile, but less fragile than it was b = bill_div.xpath('./b[text()="Summary:"]')[0] bill_summary = b.getnext().tail.strip() bill = Bill( bill_id, legislative_session=session, # session name from metadata's `legislative_sessions` chamber=chamber, # 'upper' or 'lower' title=bill_summary, classification=bill_type) subjects = list(self._subjects[bill_id]) for subject in subjects: bill.add_subject(subject) # sponsors for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'): bill.add_sponsorship(name=sponsor, classification='primary', primary=True, entity_type='person') for sponsor in doc.xpath( '//a[contains(@href, "committee.php")]/text()'): sponsor = sponsor.replace(u'\xa0', ' ').strip() bill.add_sponsorship(name=sponsor, classification='primary', primary=True, entity_type='organization') # find versions version_url = doc.xpath('//a[text()="View full text"]/@href')[0] version_html = self.get(version_url).text version_doc = lxml.html.fromstring(version_html) version_doc.make_links_absolute(version_url) for version in version_doc.xpath('//a[contains(@href, "/prever/")]'): # duplicate versions with same date, use first appearance bill.add_version_link( note=version.text, # Description of the version from the state; # e.g., 'As introduced', 'Amended', etc. url=version.get('href'), on_duplicate='ignore', media_type='text/html' # Still a MIME type ) # actions for row in bill_div.xpath('table/tr'): date_td, chamber_td, action_td = row.xpath('td') date = datetime.datetime.strptime(date_td.text, "%m/%d/%y") action_chamber = { 'Senate': 'upper', 'House': 'lower', None: 'legislature' }[chamber_td.text] action = action_td.text_content() action = action.split('(House Journal')[0] action = action.split('(Senate Journal')[0].strip() atype = action_type(action) bill.add_action( description=action, # Action description, from the state date=date.strftime('%Y-%m-%d'), # `YYYY-MM-DD` format chamber=action_chamber, # 'upper' or 'lower' classification=atype # Options explained in the next section ) # votes vurl = doc.xpath('//a[text()="View Vote History"]/@href') if vurl: vurl = vurl[0] yield from self.scrape_vote_history(bill, vurl) bill.add_source(bill_detail_url) yield bill
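# Sketch of the action cleanup above: SC action cells append
# "(House Journal ...)" / "(Senate Journal ...)" references that are
# stripped before classification, and the chamber column maps onto OCD
# chambers. The mapping is copied from the function above; test data is made up.
SC_CHAMBERS = {'Senate': 'upper', 'House': 'lower', None: 'legislature'}

def clean_sc_action(text):
    text = text.split('(House Journal')[0]
    return text.split('(Senate Journal')[0].strip()

assert clean_sc_action('Read second time (House Journal-page 12)') == 'Read second time'
assert SC_CHAMBERS[None] == 'legislature'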
def scrape(self, session=None, chambers=None): # Bills endpoint can sometimes take a very long time to load self.timeout = 300 if not session: session = self.latest_session() self.info('no session, using %s', session) if int(session) < 128: raise AssertionError("No data for period {}".format(session)) elif int(session) < 131: # they changed their data format starting in 131st and added # an undocumented API yield from self.old_scrape(session) else: chamber_dict = {"Senate": "upper", "House": "lower", "House of Representatives": "lower", "house": "lower", "senate": "upper"} # so presumably not everything passes, but we haven't # seen anything not pass yet, so we'll need to wait # till it fails and get the right language in here vote_results = {"approved": True, "passed": True, "adopted": True, "true": True, "false": False, "failed": False, True: True, False: False} action_dict = {"ref_ctte_100": "referral-committee", "intro_100": "introduction", "intro_101": "introduction", "pass_300": "passage", "intro_110": "reading-1", "refer_210": "referral-committee", "crpt_301": None, "crpt_317": None, "concur_606": "passage", "pass_301": "passage", "refer_220": "referral-committee", "intro_102": ["introduction", "passage"], "intro_105": ["introduction", "passage"], "intro_ref_ctte_100": "referral-committee", "refer_209": None, "intro_108": ["introduction", "passage"], "intro_103": ["introduction", "passage"], "msg_reso_503": "passage", "intro_107": ["introduction", "passage"], "imm_consid_360": "passage", "refer_213": None, "adopt_reso_100": "passage", "adopt_reso_110": "passage", "msg_507": "amendment-passage", "confer_713": None, "concur_603": None, "confer_712": None, "msg_506": "amendment-failure", "receive_message_100": "passage", "motion_920": None, "concur_611": None, "confer_735": None, "third_429": None, "final_501": None, "concur_608": None, } base_url = "http://search-prod.lis.state.oh.us" first_page = base_url first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session) legislators = self.get_legislator_ids(first_page) all_amendments = self.get_other_data_source(first_page, base_url, "amendments") all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals") all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss") all_analysis = self.get_other_data_source(first_page, base_url, "analysiss") for row in self.get_bill_rows(session): spacer, number_link, _ga, title, primary_sponsor, status, spacer = row.xpath('td') # S.R.No.1 -> SR1 bill_id = number_link.text_content().replace('No.', '') bill_id = bill_id.replace('.', '').replace(' ', '') # put one space back in between type and number bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id) title = title.text_content().strip() title = re.sub(r'^Title', '', title) chamber = 'lower' if 'H' in bill_id else 'upper' classification = 'bill' if 'B' in bill_id else 'resolution' bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=classification) bill.add_source(number_link.xpath('a/@href')[0]) # get bill from API bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/' 'general_assembly_{}/{}/{}/'.format( session, 'bills' if 'B' in bill_id else 'resolutions', bill_id.lower().replace(' ', '') )) data = self.get(bill_api_url).json() # add title if no short title if not bill.title: bill.title = data['items'][0]['longtitle'] bill.add_title(data['items'][0]['longtitle'], 'long title') # this stuff is version-specific for version in data['items']: 
version_name = version["version"] version_link = base_url+version["pdfDownloadLink"] bill.add_version_link(version_name, version_link, media_type='application/pdf') # we'll use latest bill_version for everything else bill_version = data['items'][0] bill.add_source(bill_api_url) # subjects for subj in bill_version["subjectindexes"]: try: bill.add_subject(subj["primary"]) except KeyError: pass try: secondary_subj = subj["secondary"] except KeyError: secondary_subj = "" if secondary_subj: bill.add_subject(secondary_subj) # sponsors sponsors = bill_version["sponsors"] for sponsor in sponsors: sponsor_name = self.get_sponsor_name(sponsor) bill.add_sponsorship( sponsor_name, classification='primary', entity_type='person', primary=True ) cosponsors = bill_version["cosponsors"] for sponsor in cosponsors: sponsor_name = self.get_sponsor_name(sponsor) bill.add_sponsorship( sponsor_name, classification='cosponsor', entity_type='person', primary=False, ) try: action_doc = self.get(base_url+bill_version["action"][0]["link"]) except scrapelib.HTTPError: pass else: actions = action_doc.json() for action in reversed(actions["items"]): actor = chamber_dict[action["chamber"]] action_desc = action["description"] try: action_type = action_dict[action["actioncode"]] except KeyError: self.warning("Unknown action {desc} with code {code}." " Add it to the action_dict" ".".format(desc=action_desc, code=action["actioncode"])) action_type = None date = self._tz.localize(datetime.datetime.strptime( action["datetime"], "%Y-%m-%dT%H:%M:%S")) date = "{:%Y-%m-%d}".format(date) bill.add_action(action_desc, date, chamber=actor, classification=action_type) # attach documents gathered earlier self.add_document(all_amendments, bill_id, "amendment", bill, base_url) self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url) self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url) self.add_document(all_analysis, bill_id, "analysis", bill, base_url) # votes vote_url = base_url+bill_version["votes"][0]["link"] vote_doc = self.get(vote_url) votes = vote_doc.json() yield from self.process_vote(votes, vote_url, base_url, bill, legislators, chamber_dict, vote_results) vote_url = base_url vote_url += bill_version["cmtevotes"][0]["link"] try: vote_doc = self.get(vote_url) except scrapelib.HTTPError: self.warning("Vote page not " "loading; skipping: {}".format(vote_url)) continue votes = vote_doc.json() yield from self.process_vote(votes, vote_url, base_url, bill, legislators, chamber_dict, vote_results) if data["items"][0]["effective_date"]: effective_date = datetime.datetime.strptime(data["items"][0]["effective_date"], "%Y-%m-%d") effective_date = self._tz.localize(effective_date) # the OH website adds an action that isn't in the action list JSON. # It looks like: # Effective 7/6/18 effective_date_oh = "{:%-m/%-d/%y}".format(effective_date) effective_action = "Effective {}".format(effective_date_oh) bill.add_action(effective_action, effective_date, chamber="executive", classification=["became-law"]) # we have never seen a veto or a disapprove, but they seem important. # so we'll check and throw an error if we find one # life is fragile. so are our scrapers. if "veto" in bill_version: veto_url = base_url+bill_version["veto"][0]["link"] veto_json = self.get(veto_url).json() if len(veto_json["items"]) > 0: raise AssertionError("Whoa, a veto! We've never" " gotten one before." 
" Go write some code to deal" " with it: {}".format(veto_url)) if "disapprove" in bill_version: disapprove_url = base_url+bill_version["disapprove"][0]["link"] disapprove_json = self.get(disapprove_url).json() if len(disapprove_json["items"]) > 0: raise AssertionError("Whoa, a disapprove! We've never" " gotten one before." " Go write some code to deal " "with it: {}".format(disapprove_url)) yield bill
def scrape_bill(self, chamber, session, bill_id, title, url): page = self.lxmlize(url) if re.match(r'^(S|H)B ', bill_id): btype = ['bill'] elif re.match(r'(S|H)C ', bill_id): btype = ['commemoration'] elif re.match(r'(S|H)JR ', bill_id): btype = ['joint resolution'] elif re.match(r'(S|H)CR ', bill_id): btype = ['concurrent resolution'] else: btype = ['bill'] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=btype ) bill.add_source(url) regex_ns = "http://exslt.org/regular-expressions" version_links = page.xpath( r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]", namespaces={'re': regex_ns}) for link in version_links: bill.add_version_link( link.xpath('string()').strip(), link.attrib['href'], media_type='text/html', on_duplicate='ignore' ) sponsor_links = page.xpath( "//td[contains(@id, 'tdSponsors')]/a") for link in sponsor_links: bill.add_sponsorship( link.text, classification='primary', primary=True, entity_type='person' ) actor = chamber use_row = False self.debug(bill_id) for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"): if 'Date' in row.text_content() and 'Action' in row.text_content(): use_row = True continue elif not use_row: continue action = row.xpath("string(td[2])").strip() atypes = [] if action.startswith('First read'): atypes.append('introduction') atypes.append('reading-1') elif action.startswith('Signed by Governor'): atypes.append('executive-signature') actor = 'executive' match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)', action) if match: if match.group(1) in ['Senate', 'House of Representatives']: first = '' else: first = 'committee-' if match.group(3).lower() == 'passed': second = 'passage' elif match.group(3).lower() == 'failed': second = 'failure' atypes.append("%s%s" % (first, second)) if 'referred to' in action.lower(): atypes.append('referral-committee') if 'Motion to amend, Passed Amendment' in action: atypes.append('amendment-introduction') atypes.append('amendment-passage') if 'Veto override, Passed' in action: atypes.append('veto-override-passage') elif 'Veto override, Failed' in action: atypes.append('veto-override-failure') if 'Delivered to the Governor' in action: atypes.append('executive-receipt') match = re.match("First read in (Senate|House)", action) if match: if match.group(1) == 'Senate': actor = 'upper' else: actor = 'lower' date = row.xpath("string(td[1])").strip() match = re.match(r'\d{2}/\d{2}/\d{4}', date) if not match: self.warning("Bad date: %s" % date) continue date = datetime.datetime.strptime(date, "%m/%d/%Y").date() for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"): yield from self.scrape_vote(bill, date, link.attrib['href']) bill.add_action(action, date, chamber=actor, classification=atypes) for link in page.xpath("//a[contains(@href, 'Keyword')]"): bill.add_subject(link.text.strip()) yield bill
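# Sketch of the "Do Pass" regex above: when group 1 names a full chamber the
# action is floor passage/failure, otherwise it is committee-level. Same
# pattern as the scraper; the test strings are made up.
import re

def do_pass_type(action):
    match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)', action)
    if not match:
        return None
    first = '' if match.group(1) in ('Senate', 'House of Representatives') else 'committee-'
    second = 'passage' if match.group(3).lower() == 'passed' else 'failure'
    return first + second

assert do_pass_type('Senate Do Pass, Passed') == 'passage'
assert do_pass_type('Judiciary Do Pass Amended, Failed') == 'committee-failure'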
def scrape_bill(self, chamber, session, bill_id, short_title=None): """ Scrapes documents, actions, vote counts and votes for bills from the 2009 session and above. """ url = BILL_URL % (session, bill_id.replace(" ", "")) bill_page = self.get(url).text html = lxml.html.fromstring(bill_page) html.make_links_absolute( "http://legislature.idaho.gov/legislation/%s/" % session) bill_tables = html.xpath('//table[contains(@class, "bill-table")]') title = bill_tables[1].text_content().strip() bill_type = get_bill_type(bill_id) bill = Bill( legislative_session=session, chamber=chamber, identifier=bill_id, title=title, classification=bill_type, ) bill.add_source(url) for subject in self._subjects[bill_id.replace(" ", "")]: bill.add_subject(subject) if short_title and title.lower() != short_title.lower(): bill.add_title(short_title, "short title") # documents doc_links = html.xpath('//div[contains(@class,"insert-page")]//a') for link in doc_links: name = link.text_content().strip() href = link.get("href") if "Engrossment" in name or "Bill Text" in name or "Amendment" in name: bill.add_version_link(note=name, url=href, media_type="application/pdf") else: bill.add_document_link(note=name, url=href, media_type="application/pdf") def _split(string): return re.split(r"\w+[,|AND]\s+", string) # sponsors range from a committee to one legislator to a group of legs sponsor_lists = bill_tables[0].text_content().split("by") if len(sponsor_lists) > 1: for sponsors in sponsor_lists[1:]: if "COMMITTEE" in sponsors.upper(): bill.add_sponsorship( name=sponsors.strip(), entity_type="organization", primary=True, classification="primary", ) else: for person in _split(sponsors): person = person.strip() if person != "": bill.add_sponsorship( classification="primary", name=person, entity_type="person", primary=True, ) actor = chamber last_date = None # if a bill has passed a chamber or been 'received from' # then the next committee passage is in the opposite chamber has_moved_chambers = False for row in bill_tables[2]: # lots of empty rows if len(row) == 1: continue _, date, action, _ = [x.text_content().strip() for x in row] if date: last_date = date else: date = last_date date = datetime.datetime.strptime(date + "/" + session[0:4], "%m/%d/%Y").strftime("%Y-%m-%d") if action.startswith("House"): actor = "lower" elif action.startswith("Senate"): actor = "upper" # votes if "AYES" in action or "NAYS" in action: yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url) # bill.add_vote_event(vote) # some td's text is separated by br elements if len(row[2]): action = "".join(row[2].itertext()) action = action.replace(u"\xa0", " ").strip() atype = get_action(actor, action) if atype and "passage" in atype: has_moved_chambers = True if atype and "committee-passage" in atype and has_moved_chambers: actor = _OTHER_CHAMBERS[actor] bill.add_action(action, date, chamber=actor, classification=atype) # after voice vote/roll call and some actions the bill is sent # 'to House' or 'to Senate' if "to House" in action: actor = "lower" elif "to Senate" in action: actor = "upper" yield bill
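# Note on the _split regex above: because [,|AND] is a character class, a
# match like "SMITH, " consumes the whole name before the comma, so
# comma-separated sponsors can be dropped. A hedged alternative split is
# sketched here; it is an assumption about the intended behavior, not
# something tested against the Idaho site's data.
import re

def split_sponsors(s):
    return [p.strip() for p in re.split(r',\s*|\s+AND\s+', s) if p.strip()]

assert split_sponsors('SMITH, JONES AND DOE') == ['SMITH', 'JONES', 'DOE']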
def scrape(self): three_days_ago = datetime.datetime.now() - datetime.timedelta(3) for matter in self.matters(three_days_ago): matter_id = matter['MatterId'] date = matter['MatterIntroDate'] title = matter['MatterTitle'] identifier = matter['MatterFile'] if not all((date, title, identifier)): continue bill_session = self.session(self.toTime(date)) bill_type = BILL_TYPES[matter['MatterTypeName']] if identifier.startswith('S'): alternate_identifiers = [identifier] identifier = identifier[1:] else: alternate_identifiers = [] bill = Bill(identifier=identifier, legislative_session=bill_session, title=title, classification=bill_type, from_organization={"name": "Board of Directors"}) legistar_web = self.legislation_detail_url(matter_id) legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id) bill.add_source(legistar_web, note='web') bill.add_source(legistar_api, note='api') for identifier in alternate_identifiers: bill.add_identifier(identifier) for action, vote in self.actions(matter_id): act = bill.add_action(**action) if action['description'] == 'Referred': body_name = matter['MatterBodyName'] act.add_related_entity( body_name, 'organization', entity_id=_make_pseudo_id(name=body_name)) result, votes = vote if result: vote_event = VoteEvent( legislative_session=bill.legislative_session, motion_text=action['description'], organization=action['organization'], classification=None, start_date=action['date'], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + '/histories') for vote in votes: raw_option = vote['VoteValueName'].lower() clean_option = self.VOTE_OPTIONS.get( raw_option, raw_option) vote_event.vote(clean_option, vote['VotePersonName'].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id): bill.add_subject(topic['MatterIndexName'].strip()) bill.add_version_link( 'Board Report', 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report' .format(matter_id), media_type="application/pdf") for attachment in self.attachments(matter_id): if attachment['MatterAttachmentName']: bill.add_document_link( attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'], media_type="application/pdf") bill.extras = {'local_classification': matter['MatterTypeName']} text = self.text(matter_id) if text: if text['MatterTextPlain']: bill.extras['plain_text'] = text['MatterTextPlain'] if text['MatterTextRtf']: bill.extras['rtf_text'] = text['MatterTextRtf'].replace( u'\u0000', '') yield bill
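# Sketch of the vote normalization above: raw Legistar vote values are
# lowercased and mapped through VOTE_OPTIONS, falling back to the raw value
# for anything unmapped. The mapping below is an assumed excerpt, not the
# scraper's actual table.
VOTE_OPTIONS = {'aye': 'yes', 'nay': 'no', 'recused': 'abstain'}

def clean_vote_option(raw):
    raw = raw.lower()
    return VOTE_OPTIONS.get(raw, raw)

assert clean_vote_option('AYE') == 'yes'
assert clean_vote_option('present') == 'present'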
def scrape(self, window=28, matter_ids=None): '''By default, scrape board reports updated in the last 28 days. Optionally specify a larger or smaller window of time from which to scrape updates, or specific matters to scrape. Note that passing a value for :matter_ids supersedes the value of :window, such that the given matters will be scraped regardless of when they were updated. Optional parameters :window (numeric) - Amount of time for which to scrape updates, e.g. a window of 7 will scrape legislation updated in the last week. Pass a window of 0 to scrape all legislation. :matter_ids (str) - Comma-separated list of matter IDs to scrape ''' if matter_ids: matters = [ self.matter(matter_id) for matter_id in matter_ids.split(',') ] matters = filter( None, matters) # Skip matters that are not yet in Legistar elif float(window): # Support for partial days, i.e., window=0.15 n_days_ago = datetime.datetime.utcnow() - datetime.timedelta( float(window)) matters = self.matters(n_days_ago) else: # Scrape all matters, including those without a last-modified date matters = self.matters() n_days_ago = datetime.datetime.utcnow() - datetime.timedelta( float(window)) for matter in matters: # If this Boolean field is True, then do not scrape the Bill. # This issue explains why a restricted Bill might appear (unwelcome) in the Legistar API: # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826 if matter['MatterRestrictViewViaWeb']: continue matter_id = matter['MatterId'] date = matter['MatterIntroDate'] title = matter['MatterTitle'] identifier = matter['MatterFile'] if not all((date, title, identifier)): continue bill_session = self.session(self.toTime(date)) bill_type = BILL_TYPES[matter['MatterTypeName']] if identifier.startswith('S'): alternate_identifiers = [identifier] identifier = identifier[1:] else: alternate_identifiers = [] bill = Bill(identifier=identifier, legislative_session=bill_session, title=title, classification=bill_type, from_organization={"name": "Board of Directors"}) legistar_web = matter['legistar_url'] legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id) bill.add_source(legistar_web, note='web') bill.add_source(legistar_api, note='api') for identifier in alternate_identifiers: bill.add_identifier(identifier) for action, vote in self.actions(matter_id): act = bill.add_action(**action) if action['description'] == 'Referred': body_name = matter['MatterBodyName'] act.add_related_entity( body_name, 'organization', entity_id=_make_pseudo_id(name=body_name)) result, votes = vote if result: vote_event = VoteEvent( legislative_session=bill.legislative_session, motion_text=action['description'], organization=action['organization'], classification=None, start_date=action['date'], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + '/histories') for vote in votes: raw_option = vote['VoteValueName'].lower() clean_option = self.VOTE_OPTIONS.get( raw_option, raw_option) vote_event.vote(clean_option, vote['VotePersonName'].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id): bill.add_subject(topic['MatterIndexName'].strip()) for relation in self.relations(matter_id): try: # Get data (i.e., json) for the related bill. # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session). 
# Sometimes, the related bill does not yet exist: in that case, the API raises an error, which we catch before continuing. related_bill = self.endpoint( '/matters/{0}', relation['MatterRelationMatterId']) except scrapelib.HTTPError: continue else: date = related_bill['MatterIntroDate'] related_bill_session = self.session(self.toTime(date)) identifier = related_bill['MatterFile'] bill.add_related_bill( identifier=identifier, legislative_session=related_bill_session, relation_type='companion') # Currently, the relation type for bills can be one of a few possibilities: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104 # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'. bill.add_version_link( 'Board Report', 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report' .format(matter_id), media_type="application/pdf") for attachment in self.attachments(matter_id): if attachment['MatterAttachmentName']: bill.add_document_link( attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'], media_type="application/pdf") bill.extras = {'local_classification': matter['MatterTypeName']} text = self.text(matter_id) if text: if text['MatterTextPlain']: bill.extras['plain_text'] = text['MatterTextPlain'] if text['MatterTextRtf']: bill.extras['rtf_text'] = text['MatterTextRtf'].replace( u'\u0000', '') yield bill
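# Sketch of the window/matter_ids precedence described in the docstring
# above: explicit matter ids win over the window, and a window of 0 means
# "scrape everything". scraper.matter/matters stand in for the Legistar API
# helpers used above; this is a control-flow illustration, not the scraper.
import datetime

def select_matters(scraper, window=28, matter_ids=None):
    if matter_ids:
        matters = (scraper.matter(m) for m in matter_ids.split(','))
        return [m for m in matters if m]  # skip matters not yet in Legistar
    if float(window):
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        return list(scraper.matters(cutoff))
    return list(scraper.matters())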
def scrape(self, session=None, chambers=None): # Bills endpoint can sometimes take a very long time to load self.timeout = 300 if not session: session = self.latest_session() self.info('no session, using %s', session) if int(session) < 128: raise AssertionError("No data for period {}".format(session)) elif int(session) < 131: # they changed their data format starting in 131st and added # an undocumented API yield from self.old_scrape(session) else: chamber_dict = {"Senate": "upper", "House": "lower", "House of Representatives": "lower", "house": "lower", "senate": "upper"} # so presumably not everything passes, but we haven't # seen anything not pass yet, so we'll need to wait # till it fails and get the right language in here vote_results = {"approved": True, "passed": True, "adopted": True, "true": True, "false": False, "failed": False, True: True, False: False} action_dict = {"ref_ctte_100": "referral-committee", "intro_100": "introduction", "pass_300": "passage", "intro_110": "reading-1", "refer_210": "referral-committee", "crpt_301": None, "crpt_317": None, "concur_606": "passage", "pass_301": "passage", "refer_220": "referral-committee", "intro_102": ["introduction", "passage"], "intro_105": ["introduction", "passage"], "intro_ref_ctte_100": "referral-committee", "refer_209": None, "intro_108": ["introduction", "passage"], "intro_103": ["introduction", "passage"], "msg_reso_503": "passage", "intro_107": ["introduction", "passage"], "imm_consid_360": "passage", "refer_213": None, "adopt_reso_100": "passage", "msg_507": "amendment-passage", "confer_713": None, "concur_603": None, "confer_712": None, "msg_506": "amendment-failure", "receive_message_100": "passage", "motion_920": None, "concur_611": None, "confer_735": None } base_url = "http://search-prod.lis.state.oh.us" first_page = base_url first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session) legislators = self.get_legislator_ids(first_page) all_amendments = self.get_other_data_source(first_page, base_url, "amendments") all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals") all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss") all_analysis = self.get_other_data_source(first_page, base_url, "analysiss") for row in self.get_bill_rows(session): number_link, ga, title, primary_sponsor, status = row.xpath('td') bill_id = number_link.text_content() title = title.text_content().strip() chamber = 'lower' if 'H' in bill_id else 'upper' classification = 'bill' if 'B' in bill_id else 'resolution' bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=classification) bill.add_source(number_link.xpath('a/@href')[0]) # get bill from API bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/' 'general_assembly_{}/{}/{}/'.format( session, 'bills' if 'B' in bill_id else 'resolutions', bill_id.lower().replace(' ', '') )) data = self.get(bill_api_url).json() # add title if no short title if not bill.title: bill.title = data['items'][0]['longtitle'] bill.add_title(data['items'][0]['longtitle'], 'long title') # this stuff is version-specific for version in data['items']: 
bill.add_subject(subj["primary"]) except KeyError: pass try: secondary_subj = subj["secondary"] except KeyError: secondary_subj = "" if secondary_subj: bill.add_subject(secondary_subj) # sponsors sponsors = bill_version["sponsors"] for sponsor in sponsors: sponsor_name = self.get_sponsor_name(sponsor) bill.add_sponsorship( sponsor_name, classification='primary', entity_type='person', primary=True ) cosponsors = bill_version["cosponsors"] for sponsor in cosponsors: sponsor_name = self.get_sponsor_name(sponsor) bill.add_sponsorship( sponsor_name, classification='cosponsor', entity_type='person', primary=False, ) try: action_doc = self.get(base_url+bill_version["action"][0]["link"]) except scrapelib.HTTPError: pass else: actions = action_doc.json() for action in reversed(actions["items"]): actor = chamber_dict[action["chamber"]] action_desc = action["description"] try: action_type = action_dict[action["actioncode"]] except KeyError: self.warning("Unknown action {desc} with code {code}." " Add it to the action_dict" ".".format(desc=action_desc, code=action["actioncode"])) action_type = None date = self._tz.localize(datetime.datetime.strptime( action["datetime"], "%Y-%m-%dT%H:%M:%S")) date = "{:%Y-%m-%d}".format(date) bill.add_action(action_desc, date, chamber=actor, classification=action_type) # attach documents gathered earlier self.add_document(all_amendments, bill_id, "amendment", bill, base_url) self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url) self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url) self.add_document(all_analysis, bill_id, "analysis", bill, base_url) # votes vote_url = base_url+bill_version["votes"][0]["link"] vote_doc = self.get(vote_url) votes = vote_doc.json() yield from self.process_vote(votes, vote_url, base_url, bill, legislators, chamber_dict, vote_results) vote_url = base_url vote_url += bill_version["cmtevotes"][0]["link"] try: vote_doc = self.get(vote_url) except scrapelib.HTTPError: self.warning("Vote page not " "loading; skipping: {}".format(vote_url)) continue votes = vote_doc.json() yield from self.process_vote(votes, vote_url, base_url, bill, legislators, chamber_dict, vote_results) # we have never seen a veto or a disapprove, but they seem important. # so we'll check and throw an error if we find one # life is fragile. so are our scrapers. if "veto" in bill_version: veto_url = base_url+bill_version["veto"][0]["link"] veto_json = self.get(veto_url).json() if len(veto_json["items"]) > 0: raise AssertionError("Whoa, a veto! We've never" " gotten one before." " Go write some code to deal" " with it: {}".format(veto_url)) if "disapprove" in bill_version: disapprove_url = base_url+bill_version["disapprove"][0]["link"] disapprove_json = self.get(disapprove_url).json() if len(disapprove_json["items"]) > 0: raise AssertionError("Whoa, a disapprove! We've never" " gotten one before." " Go write some code to deal " "with it: {}".format(disapprove_url)) yield bill
def get_bill(self, matter): '''Make Bill object from given matter.''' ''' Currently, NYC Legistar does not have conventional "Types" for three newly added committees: https://legistar.council.nyc.gov/Departments.aspx We communicated the issue to NYC, and until we learn more, we will skip the bills attached to those committees. ''' orgs_without_type = [ 'Charter Revision Commission 2019', 'New York City Advisory Commission on Property Tax Reform', 'Democratic Conference of the Council of the City of New York' ] if matter['MatterBodyName'].strip() in orgs_without_type: return None matter_id = matter['MatterId'] if matter_id in DUPLICATED_ACTIONS: return None date = matter['MatterIntroDate'] title = matter['MatterName'] identifier = matter['MatterFile'] if not all((date, title, identifier)): return None leg_type = BILL_TYPES[matter['MatterTypeName']] bill_session = self.sessions(self.toTime(date)) bill = Bill(identifier=identifier, title=title, classification=leg_type, legislative_session=bill_session, from_organization={"name": "New York City Council"}) legistar_web = matter['legistar_url'] legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id) bill.add_source(legistar_web, note='web') bill.add_source(legistar_api, note='api') if matter['MatterTitle']: bill.add_title(matter['MatterTitle']) if matter['MatterEXText5']: bill.add_abstract(matter['MatterEXText5'], note='') try: for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) except KeyError: self.version_errors.append(legistar_web) return None for attachment in self.attachments(matter_id): if attachment['MatterAttachmentId'] == 103315: # Duplicate return None if attachment['MatterAttachmentName']: bill.add_document_link(attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'], media_type='application/pdf') for topic in self.topics(matter_id): bill.add_subject(topic['MatterIndexName'].strip()) for relation in self.relations(matter_id): try: related_bill = self.endpoint( '/matters/{0}', relation['MatterRelationMatterId']) except scrapelib.HTTPError: return None else: date = related_bill['MatterIntroDate'] related_bill_session = self.session(self.toTime(date)) identifier = related_bill['MatterFile'] bill.add_related_bill(identifier=identifier, legislative_session=related_bill_session, relation_type='companion') try: text = self.text(matter_id) except KeyError: self.version_errors.append(legistar_web) return None bill.extras['local_classification'] = matter['MatterTypeName'] if text: if text['MatterTextPlain']: bill.extras['plain_text'] = text['MatterTextPlain'].replace( u'\u0000', '') if text['MatterTextRtf']: bill.extras['rtf_text'] = text['MatterTextRtf'].replace( u'\u0000', '') return bill
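# Sketch of the text cleanup above: Legistar plain/RTF payloads can carry
# NUL characters that break downstream storage, so they are stripped before
# being stashed in extras. Illustrative helper only.
def scrub_nuls(text):
    return text.replace(u'\u0000', '') if text else text

assert scrub_nuls(u'line one\u0000line two') == 'line oneline two'
assert scrub_nuls(None) is None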
def scrape(self): session_name = self.latest_session() session = session_name[0:5] self._bill_prefix_map = { 'HB': { 'type': 'bill', 'url_segment': 'bills/house', }, 'HR': { 'type': 'resolution', 'url_segment': 'resolutions/house/simple', }, 'HCR': { 'type': 'concurrent resolution', 'url_segment': 'resolutions/house/concurrent', }, 'HJR': { 'type': 'joint resolution', 'url_segment': 'resolutions/house/joint' }, 'HC': { 'type': 'concurrent resolution', 'url_segment': 'resolutions/house/concurrent', }, 'HJ': { 'type': 'joint resolution', 'url_segment': 'resolutions/house/joint', }, 'SB': { 'type': 'bill', 'url_segment': 'bills/senate', }, 'SR': { 'type': 'resolution', 'url_segment': 'resolutions/senate/simple', }, 'SCR': { 'type': 'concurrent resolution', 'url_segment': 'resolutions/senate/concurrent', }, 'SJR': { 'type': 'joint resolution', 'url_segment': 'resolutions/senate/joint', }, 'SC': { 'type': 'concurrent resolution', 'url_segment': 'resolutions/senate/concurrent', }, 'SJ': { 'type': 'joint resolution', 'url_segment': 'resolutions/senate/joint', }, } api_base_url = "https://api.iga.in.gov" proxy = {"url": "http://in-proxy.openstates.org"} # ah, indiana. it's really, really hard to find # pdfs in their web interface. Super easy with # the api, but a key needs to be passed # in the headers. To make these documents # viewable to the public and our scrapers, # sunlight's put up a proxy service at this link # using our api key for pdf document access. client = ApiClient(self) r = client.get("bills", session=session) all_pages = client.unpaginate(r) for b in all_pages: bill_id = b["billName"] for idx, char in enumerate(bill_id): try: int(char) except ValueError: continue disp_bill_id = bill_id[:idx]+" "+str(int(bill_id[idx:])) break bill_link = b["link"] api_source = api_base_url + bill_link try: bill_json = client.get("bill", session=session, bill_id=bill_id.lower()) except scrapelib.HTTPError: self.logger.warning('Bill could not be accessed. 
Skipping.') continue title = bill_json["title"] if title == "NoneNone": title = None # sometimes title is blank # if that's the case, we can check to see if # the latest version has a short description if not title: title = bill_json["latestVersion"]["shortDescription"] # and if that doesn't work, use the bill_id but throw a warning if not title: title = bill_id self.logger.warning("Bill is missing a title, using bill id instead.") bill_prefix = self._get_bill_id_components(bill_id)[0] original_chamber = ("lower" if bill_json["originChamber"].lower() == "house" else "upper") bill_type = self._bill_prefix_map[bill_prefix]['type'] bill = Bill(disp_bill_id, legislative_session=session, chamber=original_chamber, title=title, classification=bill_type) bill.add_source(self._get_bill_url(session, bill_id)) bill.add_source(api_source) # sponsors for s in bill_json["authors"]: bill.add_sponsorship(classification="author", name=self._get_name(s), entity_type='person', primary=True) for s in bill_json["coauthors"]: bill.add_sponsorship(classification="coauthor", name=self._get_name(s), entity_type='person', primary=False) for s in bill_json["sponsors"]: bill.add_sponsorship(classification="sponsor", name=self._get_name(s), entity_type='person', primary=True) for s in bill_json["cosponsors"]: bill.add_sponsorship(classification="cosponsor", name=self._get_name(s), entity_type='person', primary=False) # actions action_link = bill_json["actions"]["link"] api_source = api_base_url + action_link try: actions = client.get("bill_actions", session=session, bill_id=bill_id.lower()) except scrapelib.HTTPError: self.logger.warning("Could not find bill actions page") actions = {"items": []} for a in actions["items"]: action_desc = a["description"] if "governor" in action_desc.lower(): action_chamber = "executive" elif a["chamber"]["name"].lower() == "house": action_chamber = "lower" else: action_chamber = "upper" date = a["date"] if not date: self.logger.warning("Action has no date, skipping") continue # convert time to pupa fuzzy time date = date.replace('T', ' ') # TODO: if we update pupa to accept datetimes we can drop this line date = date.split()[0] action_type = [] d = action_desc.lower() committee = None reading = False if "first reading" in d: action_type.append("reading-1") reading = True if ("second reading" in d or "reread second time" in d): action_type.append("reading-2") reading = True if ("third reading" in d or "reread third time" in d): action_type.append("reading-3") if "passed" in d: action_type.append("passage") if "failed" in d: action_type.append("failure") reading = True if "adopted" in d and reading: action_type.append("passage") if ("referred" in d and "committee on" in d or "reassigned" in d and "committee on" in d): committee = d.split("committee on")[-1].strip() action_type.append("referral-committee") if "committee report" in d: if "pass" in d: action_type.append("committee-passage") if "fail" in d: action_type.append("committee-failure") if "amendment" in d and "without amendment" not in d: if "pass" in d or "prevail" in d or "adopted" in d: action_type.append("amendment-passage") if "fail" in d or "out of order" in d: action_type.append("amendment-failure") if "withdraw" in d: action_type.append("amendment-withdrawal") if "signed by the governor" in d: action_type.append("executive-signature") if len(action_type) == 0: # calling it other and moving on with a warning self.logger.warning("Could not recognize an action in '{}'".format( action_desc)) action_type = None a = 
bill.add_action(chamber=action_chamber, description=action_desc, date=date, classification=action_type) if committee: a.add_related_entity(committee, entity_type='organization') # subjects subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]] for subject in subjects: bill.add_subject(subject) # versions and votes for version in bill_json["versions"][::-1]: try: version_json = client.get("bill_version", session=session, bill_id=version["billName"], version_id=version["printVersionName"]) except scrapelib.HTTPError: self.logger.warning("Bill version does not seem to exist.") continue yield from self.deal_with_version(version_json, bill, bill_id, original_chamber, session, proxy) yield bill
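# Sketch of the display-id rule above: the IGA API uses ids like "HB1001",
# while the displayed form inserts a space and drops leading zeros from the
# numeric part. Helper name is illustrative.
def display_bill_id(bill_id):
    for idx, char in enumerate(bill_id):
        if char.isdigit():
            return bill_id[:idx] + " " + str(int(bill_id[idx:]))
    return bill_id

assert display_bill_id("HB1001") == "HB 1001"
assert display_bill_id("SB0005") == "SB 5"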
def scrape(self, session=None, chambers=None): # Bills endpoint can sometimes take a very long time to load self.timeout = 300 if not session: session = self.latest_session() self.info('no session, using %s', session) if int(session) < 128: raise AssertionError("No data for period {}".format(session)) elif int(session) < 131: # they changed their data format starting in 131st and added # an undocumented API yield from self.old_scrape(session) else: chamber_dict = {"Senate": "upper", "House": "lower", "House of Representatives": "lower", "house": "lower", "senate": "upper"} # so presumably not everything passes, but we haven't # seen anything not pass yet, so we'll need to wait # till it fails and get the right language in here vote_results = {"approved": True, "passed": True, "adopted": True, "true": True, "false": False, "failed": False, True: True, False: False} action_dict = {"ref_ctte_100": "referral-committee", "intro_100": "introduction", "pass_300": "passage", "intro_110": "reading-1", "refer_210": "referral-committee", "crpt_301": None, "crpt_317": None, "concur_606": "passage", "pass_301": "passage", "refer_220": "referral-committee", "intro_102": ["introduction", "passage"], "intro_105": ["introduction", "passage"], "intro_ref_ctte_100": "referral-committee", "refer_209": None, "intro_108": ["introduction", "passage"], "intro_103": ["introduction", "passage"], "msg_reso_503": "passage", "intro_107": ["introduction", "passage"], "imm_consid_360": "passage", "refer_213": None, "adopt_reso_100": "passage", "msg_507": "amendment-passage", "confer_713": None, "concur_603": None, "confer_712": None, "msg_506": "amendment-failure", "receive_message_100": "passage", "motion_920": None, "concur_611": None, "confer_735": None } base_url = "http://search-prod.lis.state.oh.us" first_page = base_url first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session) legislators = self.get_legislator_ids(first_page) all_amendments = self.get_other_data_source(first_page, base_url, "amendments") all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals") all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss") all_analysis = self.get_other_data_source(first_page, base_url, "analysiss") doc_types = ["bills", "resolutions"] for doc_type in doc_types: bill_versions = {} for doc in self.pages(base_url, first_page+doc_type): for v in doc["items"]: # bills is actually a list of versions # going to create a dictionary as follows: # key=bill_id # value = dict of all versions, where keys are versionids # and values are the bill data from the api. # then loop through that to avoid duplicate bills try: bill_id = v["number"] except KeyError: self.warning("Apparent bill has no information:\n{}".format(v)) continue version_id = v["versionid"] if bill_id in bill_versions: if version_id in bill_versions[bill_id]: self.logger.warning("There are two versions of {bill_id}" " called the same thing." 
" Bad news bears!".format(bill_id=bill_id)) else: bill_versions[bill_id][version_id] = v else: bill_versions[bill_id] = {} bill_versions[bill_id][version_id] = v for b in bill_versions: bill = None for bill_version in bill_versions[b].values(): if not bill: bill_id = bill_version["number"] title = bill_version["shorttitle"] or bill_version["longtitle"] title = title.strip() if not len(title): self.warning("Missing title for {bill_id}".format(bill_id=bill_id)) next chamber = "lower" if "h" in bill_id else "upper" subjects = [] for subj in bill_version["subjectindexes"]: try: subjects.append(subj["primary"]) except KeyError: pass try: secondary_subj = subj["secondary"] except KeyError: secondary_subj = "" if secondary_subj: subjects.append(secondary_subj) # they use bill id of format HB 1 on the site # but hb1 in the API. for idx, char in enumerate(bill_id): try: int(char) except ValueError: continue display_id = bill_id[:idx]+" "+bill_id[idx:] break classification = {'bills': 'bill', 'resolutions': 'resolution'}[doc_type] bill = Bill(display_id.upper(), legislative_session=session, chamber=chamber, title=title, classification=classification ) for subject in subjects: bill.add_subject(subject) # this stuff is the same for all versions bill.add_source(first_page+doc_type+"/"+bill_id) sponsors = bill_version["sponsors"] for sponsor in sponsors: sponsor_name = self.get_sponsor_name(sponsor) bill.add_sponsorship( sponsor_name, classification='primary', entity_type='person', primary=True ) cosponsors = bill_version["cosponsors"] for sponsor in cosponsors: sponsor_name = self.get_sponsor_name(sponsor) bill.add_sponsorship( sponsor_name, classification='cosponsor', entity_type='person', primary=False, ) try: action_doc = self.get(base_url+bill_version["action"][0]["link"]) except scrapelib.HTTPError: pass else: actions = action_doc.json() for action in reversed(actions["items"]): actor = chamber_dict[action["chamber"]] action_desc = action["description"] try: action_type = action_dict[action["actioncode"]] except KeyError: self.warning("Unknown action {desc} with code {code}." " Add it to the action_dict" ".".format(desc=action_desc, code=action["actioncode"])) action_type = None date = self._tz.localize(datetime.datetime.strptime( action["datetime"], "%Y-%m-%dT%H:%M:%S")) date = "{:%Y-%m-%d}".format(date) bill.add_action(action_desc, date, chamber=actor, classification=action_type) self.add_document(all_amendments, bill_id, "amendment", bill, base_url) self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url) self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url) self.add_document(all_analysis, bill_id, "analysis", bill, base_url) vote_url = base_url+bill_version["votes"][0]["link"] vote_doc = self.get(vote_url) votes = vote_doc.json() yield from self.process_vote(votes, vote_url, base_url, bill, legislators, chamber_dict, vote_results) vote_url = base_url vote_url += bill_version["cmtevotes"][0]["link"] try: vote_doc = self.get(vote_url) except scrapelib.HTTPError: self.warning("Vote page not " "loading; skipping: {}".format(vote_url)) continue votes = vote_doc.json() yield from self.process_vote(votes, vote_url, base_url, bill, legislators, chamber_dict, vote_results) # we have never seen a veto or a disapprove, but they seem important. # so we'll check and throw an error if we find one # life is fragile. so are our scrapers. 
if "veto" in bill_version: veto_url = base_url+bill_version["veto"][0]["link"] veto_json = self.get(veto_url).json() if len(veto_json["items"]) > 0: raise AssertionError("Whoa, a veto! We've never" " gotten one before." " Go write some code to deal" " with it: {}".format(veto_url)) if "disapprove" in bill_version: disapprove_url = base_url+bill_version["disapprove"][0]["link"] disapprove_json = self.get(disapprove_url).json() if len(disapprove_json["items"]) > 0: raise AssertionError("Whoa, a disapprove! We've never" " gotten one before." " Go write some code to deal " "with it: {}".format(disapprove_url)) # this stuff is version-specific version_name = bill_version["version"] version_link = base_url+bill_version["pdfDownloadLink"] if version_link.endswith("pdf"): mimetype = "application/pdf" else: mimetype = "application/octet-stream" bill.add_version_link(version_name, version_link, media_type=mimetype) # Need to sort bill actions, since they may be jumbled yield bill