def scrape_bill(self, row, chamber, session):
    """Scrape a single Delaware bill from an API row plus its HTML detail page.

    Yields VoteEvents from scrape_votes, then the Bill itself.
    Amendment rows (SA/HA) are skipped entirely.
    """
    bill_id = row['LegislationNumber']

    # TODO: re-evaluate if these should be separate bills
    if 'SA' in bill_id or 'HA' in bill_id:
        self.warning('skipping amendment %s', bill_id)
        return

    bill_type = self.classify_bill(bill_id)
    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)
    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId'])
    bill.add_source(html_url, note='text/html')
    html = self.lxmlize(html_url)

    # Additional sponsors are classified as primary, matching the site's grouping.
    additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                     '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    # FIX: this query previously reused the "Additional Sponsor(s):" label,
    # so real co-sponsors were never collected.
    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        # on_duplicate='error'
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)
    yield bill
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape one Hawaii bill detail page; yields action events, then the Bill."""
    page = lxml.html.fromstring(self.get(url).text)

    # The bill id is encoded in the query string: ?billtype=HB&billnumber=123
    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])

    versions_table = page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    # "Report Title" is a semicolon-separated subject list; drop an empty entry.
    subjects = [s.strip() for s in meta["Report Title"].split(";")]
    if "" in subjects:
        subjects.remove("")

    b = Bill(
        bill_id,
        session,
        meta["Measure Title"],
        chamber=chamber,
        classification=bill_type,
    )
    if meta["Description"]:
        b.add_abstract(meta["Description"], "description")
    for subject in subjects:
        b.add_subject(subject)
    if url:
        b.add_source(url)

    # Companion / carried-over bills are attributed to the prior year's session.
    prior_session = "{} Regular Session".format(int(session[:4]) - 1)

    companion = meta["Companion"].strip()
    if companion:
        b.add_related_bill(
            identifier=companion.replace(u"\xa0", " "),
            legislative_session=prior_session,
            relation_type="companion",
        )

    status_entries = bill_page_status = page.xpath(
        "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
    )
    if status_entries:
        last_status = status_entries[-1]
        if "carried over" in last_status.lower():
            b.add_related_bill(
                identifier=bill_id.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )

    for sponsor in meta["Introducer(s)"]:
        b.add_sponsorship(sponsor, "primary", "person", True)

    self.parse_bill_versions_table(b, versions_table)
    yield from self.parse_bill_actions_table(b, action_table, bill_id, session, url, chamber)
    yield b
def scrape_bill(self, session, bill_id, chamber):
    """Scrape a single Massachusetts bill page.

    Yields vote events from scrape_actions, then the Bill.
    Returns False early (ending the generator) on request/server errors.
    """
    # https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)
    bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

    try:
        response = requests.get(bill_url)
    except requests.exceptions.RequestException:
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    html = response.text
    page = lxml.html.fromstring(html)

    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

    # FIX: old pattern was [^S|H|D|\d]; inside a character class "|" is a
    # literal, so pipe characters survived the scrub. Keep only S/H/D/digits.
    bill_id = re.sub(r'[^SHD\d]', '', bill_id)
    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification='bill')

    bill_summary = None
    if page.xpath('//p[@id="pinslip"]/text()'):
        bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')

    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
    if sponsor:
        sponsor = sponsor[0].strip()
        bill.add_sponsorship(sponsor, classification='primary', primary=True,
                             entity_type='person')

    self.scrape_cosponsors(bill, bill_url)

    version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                         "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

    # yield back votes and bill
    yield from self.scrape_actions(bill, bill_url, session)
    yield bill
def scrape_bills(self, session):
    """Scrape all Oregon measures for a session via the OLIS API; yields Bills."""
    session_key = SESSION_KEYS[session]
    measures_response = self.api_client.get('measures', page=500, session=session_key)

    legislators = index_legislators(self, session_key)

    for measure in measures_response:
        bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])
        chamber = self.chamber_code[bid[0]]
        bill = Bill(
            bid.replace(' ', ''),
            legislative_session=session,
            chamber=chamber,
            title=measure['RelatingTo'],
            classification=self.bill_types[measure['MeasurePrefix'][1:]]
        )
        bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

        for sponsor in measure['MeasureSponsors']:
            legislator_code = sponsor['LegislatoreCode']  # typo in API
            if legislator_code:
                try:
                    legislator = legislators[legislator_code]
                except KeyError:
                    # FIX: logger.warn is a deprecated alias of logger.warning.
                    logger.warning('Legislator {} not found in session {}'.format(
                        legislator_code, session))
                    # Fall back to the raw code so the sponsorship is still recorded.
                    legislator = legislator_code
                bill.add_sponsorship(
                    name=legislator,
                    classification={'Chief': 'primary', 'Regular': 'cosponsor'}[
                        sponsor['SponsorLevel']],
                    entity_type='person',
                    primary=(sponsor['SponsorLevel'] == 'Chief')
                )

        bill.add_source(
            "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                session=session_key, bid=bid.replace(' ', ''))
        )

        for document in measure['MeasureDocuments']:
            # TODO: probably mixing documents & versions here - should revisit
            try:
                bill.add_version_link(document['VersionDescription'],
                                      document['DocumentUrl'],
                                      media_type='application/pdf')
            except ValueError:
                logger.warning('Duplicate link found for {}'.format(
                    document['DocumentUrl']))

        for action in measure['MeasureHistoryActions']:
            classifiers = self.determine_action_classifiers(action['ActionText'])
            when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
            when = self.tz.localize(when)
            bill.add_action(action['ActionText'], when,
                            chamber=self.chamber_code[action['Chamber']],
                            classification=classifiers)

        yield bill
def scrape_bill(self, bill_url, bill_id, session_id):
    """Build and return a Bill from a detail page's vertical data table."""
    doc = self.lxmlize(bill_url)

    # The first <em> on the page holds the bill title.
    bill = Bill(identifier=bill_id,
                legislative_session=session_id,
                title=doc.xpath("//em/text()")[0])
    bill.add_source(bill_url, note="detail")

    # Everything else lives in one vertical data table.
    table = doc.xpath("//table[@class='data vertical_table']")[0]

    # Primary sponsor.
    bill.add_sponsorship(name=table.xpath(self.bill_table_query("Sponsor"))[0],
                         classification="Primary",
                         entity_type="person",
                         primary=True)

    # Abstract — not every bill has a summary row.
    try:
        bill.add_abstract(abstract=table.xpath(self.bill_table_query("Summary"))[0],
                          note="summary")
        # TODO trim whitespace from summary
    except IndexError:
        print("No summary for bill {} in session {}".format(bill_id, session_id))

    # Actions: each line may expand into several (date, type) pairs.
    for line in table.xpath(self.bill_table_query("Actions")):
        try:
            for date_str, action_type in self.parse_actions(line):
                bill.add_action(date=date_str,
                                description=action_type,
                                classification=action_type)
                print("added action: {}".format(action_type))
        except ValueError:
            print("failed to parse these actions: {}".format([line]))

    # Co-sponsors: keep only non-blank names.
    cosponsor_names = [n.strip()
                       for n in table.xpath(self.bill_table_query("Co-Sponsors"))
                       if n.strip()]
    for name in cosponsor_names:
        bill.add_sponsorship(name=name,
                             classification="co-sponsor",
                             entity_type="person",
                             primary=False)

    return bill
def scrape_bill(self, session, session_slug, chamber, url):
    """Scrape one Nevada (NELIS) bill overview page; yields the Bill."""
    overview = lxml.html.fromstring(self.get(url).text)
    bill_no = overview.xpath('//*[@id="item-header"]/text()')[0].strip()

    # NELIS internal key, needed for the AJAX data-tab endpoint below.
    internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)

    # Bill detail is served by a second, cache-busted AJAX request.
    tab_url = (
        "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
        "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}"
    ).format(session_slug, internal_id, time.time() * 1000)
    bill_page = lxml.html.fromstring(self.get(tab_url).text)

    short_title = self.get_header_field(bill_page, "Summary:").text
    short_title = short_title.replace("\u00a0", " ")

    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber,
    )

    long_title = self.get_header_field(bill_page, "Title:").text
    if long_title is not None:
        bill.add_abstract(long_title, "Summary")

    # Attach primary sponsors and co-sponsors when those sections exist.
    for label, classification in (("Primary Sponsor", "primary"),
                                  ("Co-Sponsor", "cosponsor")):
        div = self.get_header_field(bill_page, label)
        if div is not None:
            self.add_sponsors(div, bill, classification)

    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)

    bill.subject = list(set(self.subject_mapping[bill_no]))

    # Bill Draft Request number, when present in the summary.
    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras["BDR"] = bdr

    bill.extras["NV_ID"] = internal_id

    bill.add_source(url)
    yield bill
def scrape_bill(self, session, session_slug, chamber, url):
    """Scrape a single NELIS bill overview page and yield the populated Bill."""
    listing = lxml.html.fromstring(self.get(url).text)
    bill_no = listing.xpath('//*[@id="item-header"]/text()')[0].strip()

    # State-internal bill key parsed out of the overview URL.
    internal_id = re.search(r'\/Bill\/(\d+)\/Overview', url).group(1)

    # The overview tab content comes from a separate, timestamped request.
    data_url = ('https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/'
                'FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}').format(
                    session_slug, internal_id, time.time() * 1000)
    detail = lxml.html.fromstring(self.get(data_url).text)

    summary_text = self.get_header_field(detail, 'Summary:').text
    summary_text = summary_text.replace(u'\u00a0', ' ')

    bill = Bill(identifier=bill_no,
                legislative_session=session,
                title=summary_text,
                chamber=chamber)

    title_text = self.get_header_field(detail, 'Title:').text
    if title_text is not None:
        bill.add_abstract(title_text, 'Summary')

    primary_div = self.get_header_field(detail, 'Primary Sponsor')
    if primary_div is not None:
        self.add_sponsors(primary_div, bill, 'primary')

    cosponsor_div = self.get_header_field(detail, 'Co-Sponsor')
    if cosponsor_div is not None:
        self.add_sponsors(cosponsor_div, bill, 'cosponsor')

    self.add_actions(detail, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)

    bill.subject = list(set(self.subject_mapping[bill_no]))

    # Record the Bill Draft Request number when one appears in the summary.
    bdr = self.extract_bdr(summary_text)
    if bdr:
        bill.extras['BDR'] = bdr

    bill.extras['NV_ID'] = internal_id

    bill.add_source(url)
    yield bill
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape one Hawaii bill detail page; yields action events, then the Bill."""
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)

    # Bill id comes from the query string (?billtype=HB&billnumber=123).
    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = '{}{}'.format(qs['billtype'], qs['billnumber'])

    versions = bill_page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    # "Report Title" holds a semicolon-separated subject list.
    subs = [s.strip() for s in meta['Report Title'].split(";")]
    if "" in subs:
        subs.remove("")

    b = Bill(bill_id, session, meta['Measure Title'],
             chamber=chamber,
             classification=bill_type)
    if meta['Description']:
        b.add_abstract(meta['Description'], 'description')
    for subject in subs:
        b.add_subject(subject)
    if url:
        b.add_source(url)

    prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))
    companion = meta['Companion'].strip()
    if companion:
        b.add_related_bill(identifier=companion.replace(u'\xa0', ' '),
                           legislative_session=prior_session,
                           relation_type="companion")

    # FIX: guard the status lookup — indexing [-1] on an empty xpath result
    # raised IndexError for bills with no status rows.
    status_rows = bill_page.xpath(
        "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")
    if status_rows:
        prior = status_rows[-1]
        if 'carried over' in prior.lower():
            b.add_related_bill(identifier=bill_id.replace(u'\xa0', ' '),
                               legislative_session=prior_session,
                               relation_type="companion")

    for sponsor in meta['Introducer(s)']:
        b.add_sponsorship(sponsor, 'primary', 'person', True)

    versions = self.parse_bill_versions_table(b, versions)
    yield from self.parse_bill_actions_table(b, action_table, bill_id, session, url, chamber)
    yield b
def scrape_bill(self, session, chamber, bill_url):
    """Scrape one Colorado bill page; yields the Bill, then its vote events."""
    full_url = '{}{}'.format(CO_URL_BASE, bill_url)
    try:
        page = self.lxmlize(full_url)
    except scrapelib.HTTPError as e:
        # The site intermittently 503s individual bill pages; skip those.
        if e.response.status_code != 503:
            raise
        self.error('Skipping %s w/ 503', bill_url)
        return

    number_xpath = ('//div[contains(@class,"field-name-field-bill-number")]'
                    '//div[contains(@class,"field-item even")][1]/text()')
    bill_number = page.xpath(number_xpath)[0].strip()
    bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]
    summary = page.xpath(
        'string(//div[contains(@class,"field-name-field-bill-summary")])').strip()

    bill = Bill(
        bill_number,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
    )
    if summary:
        bill.add_abstract(summary, 'summary')
    bill.add_source(full_url)

    # Each sub-scraper attaches its own data to the bill in place.
    self.scrape_sponsors(bill, page)
    self.scrape_actions(bill, page)
    self.scrape_versions(bill, page)
    self.scrape_research_notes(bill, page)
    self.scrape_fiscal_notes(bill, page)
    self.scrape_committee_report(bill, page)
    self.scrape_amendments(bill, page)

    yield bill
    yield from self.scrape_votes(bill, page)
def scrape_bill(self, session, chamber, bill_url):
    """Scrape a Colorado bill detail page.

    Yields the Bill first, then any VoteEvents found on the page.
    """
    try:
        page = self.lxmlize('{}{}'.format(CO_URL_BASE, bill_url))
    except scrapelib.HTTPError as exc:
        if exc.response.status_code == 503:
            # Transient server error — log and move on to the next bill.
            self.error('Skipping %s w/ 503', bill_url)
            return
        raise

    bill_number = page.xpath(
        '//div[contains(@class,"field-name-field-bill-number")]'
        '//div[contains(@class,"field-item even")][1]/text()')[0].strip()
    bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]
    bill_summary = page.xpath(
        'string(//div[contains(@class,"field-name-field-bill-summary")])')
    bill_summary = bill_summary.strip()

    bill = Bill(bill_number,
                legislative_session=session,
                chamber=chamber,
                title=bill_title)
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')
    bill.add_source('{}{}'.format(CO_URL_BASE, bill_url))

    # Sponsors, actions, documents and amendments are attached in place.
    self.scrape_sponsors(bill, page)
    self.scrape_actions(bill, page)
    self.scrape_versions(bill, page)
    self.scrape_research_notes(bill, page)
    self.scrape_fiscal_notes(bill, page)
    self.scrape_committee_report(bill, page)
    self.scrape_amendments(bill, page)

    yield bill
    yield from self.scrape_votes(bill, page)
def scrape_bill_2012(self, chamber, session, bill_id, url):
    """Scrape a 2012-format Maryland bill page.

    Yields vote events from parse_bill_votes, then saves the bill.
    Raises ValueError for bill ids that are neither bills nor joint resolutions.
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # find <a name="Title">, get parent dt, get parent dl, then dd n dl
    title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()
    summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        # FIX: previously fell through with _type unbound, raising a confusing
        # NameError at the Bill() call below.
        raise ValueError('unrecognized bill id: {}'.format(bill_id))

    bill = Bill(
        bill_id,
        legislative_session=session,
        classification=_type,
        chamber=chamber,
        title=title,
    )
    bill.add_abstract(summary, note='summary')
    bill.add_source(url)

    self.parse_bill_sponsors(doc, bill)     # sponsors
    self.parse_bill_actions(doc, bill)      # actions
    self.parse_bill_documents(doc, bill)    # documents and versions
    yield from self.parse_bill_votes(doc, bill)  # votes

    # subjects
    subjects = []
    for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
        subjects.append(subj.text.split('-see also-')[0])
    bill.subject = subjects

    # add bill to collection
    self.save_bill(bill)
def scrape_bill_2012(self, chamber, session, bill_id, url):
    """Scrape a pre-2013 style Maryland bill page and save the resulting bill."""
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    # Title lives in the dd following the <a name="Title"> anchor's parent dl.
    title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()
    summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

    # Classify from the bill id letter (B = bill, J = joint resolution).
    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']

    bill = Bill(bill_id,
                legislative_session=session,
                classification=_type,
                chamber=chamber,
                title=title)
    bill.add_abstract(summary, note='summary')
    bill.add_source(url)

    self.parse_bill_sponsors(doc, bill)     # sponsors
    self.parse_bill_actions(doc, bill)      # actions
    self.parse_bill_documents(doc, bill)    # documents and versions
    yield from self.parse_bill_votes(doc, bill)  # votes

    # Subjects come from the "-see also-" subject index links.
    bill.subject = [a.text.split('-see also-')[0]
                    for a in doc.xpath('//a[contains(@href, "/subjects/")]')]

    # add bill to collection
    self.save_bill(bill)
def scrape_bill(self, bill_url, bill_id, session_id):
    """Build and return a Bill from a detail page (title, abstract, sponsors,
    actions, co-sponsors, committees)."""
    page = self.lxmlize(bill_url)

    # create bill
    title = page.xpath("//h1/text()")[0]
    bill = Bill(identifier=bill_id, legislative_session=session_id, title=title)
    bill.add_source(bill_url, note="detail")

    # abstract: the paragraph directly above <h2>Legislative History</h2>
    try:
        leg_his = page.xpath("//h2[text()='Legislative History']")[0]
        abstract = leg_his.xpath("preceding-sibling::p/text()")[0]
        bill.add_abstract(abstract=abstract.strip(), note="summary")
    except IndexError:
        print("No abstract for bill {} in session {}".format(bill_id, session_id))

    # the rest of the fields are found inside this <table>
    data_table = page.xpath("//table[contains(@class, 'data')]")[0]

    # sponsor
    sponsor_name = data_table.xpath(self.bill_table_query("Sponsor") + "/text()")[0]
    bill.add_sponsorship(name=sponsor_name,
                         classification="Primary",
                         entity_type="person",
                         primary=True)

    # actions
    action_lines = data_table.xpath(self.bill_table_query("Actions") + "/text()")
    for line in action_lines:
        # FIX: removed `line = line.join('')` — str.join over an empty string
        # joins zero elements and always returns '', so every action line was
        # blanked before parsing and no actions were ever recorded.
        try:
            for date_str, action_type in self.parse_actions(line):
                bill.add_action(date=date_str,
                                description=action_type,
                                classification=action_type)
        except ValueError:
            print("failed to parse these actions: {}".format([line]))

    # co-sponsors
    co_sponsors = data_table.xpath(self.bill_table_query("Co-Sponsors") + "/text()")
    co_sponsors = [name.strip() for name in co_sponsors if name.strip()]
    for name in co_sponsors:
        bill.add_sponsorship(name=name,
                             classification="co-sponsor",
                             entity_type="person",
                             primary=False)

    # committee (stored as another sponsorship in OCD)
    committees = data_table.xpath(self.bill_table_query("Committee") + "/a/text()")
    for comm in committees:
        bill.add_sponsorship(name=comm,
                             classification="secondary",  # classification ?
                             entity_type="organization",
                             primary=False)

    return bill
def get_bill(self, matter):
    '''Make Bill object from given matter.

    Returns None (skipping the matter entirely) when required fields are
    missing, the matter is a known duplicate, or any of several Legistar API
    lookups fail — callers must handle a None result.
    '''
    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    # All three of these are required to build a meaningful bill.
    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']
    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]
    bill_session = self.sessions(self.toTime(date))
    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])
    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    # A KeyError from the sponsorship lookup is treated as a version/data
    # error: record the URL and skip the whole bill.
    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        # NOTE(review): hitting this known-duplicate attachment id abandons
        # the *entire* bill, not just the attachment — confirm intended.
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None
        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint('/matters/{0}',
                                         relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            return None
        else:
            date = related_bill['MatterIntroDate']
            # NOTE(review): self.session here vs self.sessions above — confirm
            # both methods exist and this is not a typo.
            related_bill_session = self.session(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    # Like sponsorships, a KeyError in the text lookup skips the bill.
    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        # Strip NUL characters, which break downstream storage.
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(u'\u0000', '')
        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

    return bill
def _parse_senate_billpage(self, bill_url, year):
    """Parse a Missouri Senate bill page; yields the constructed Bill."""
    bill_page = self.lxmlize(bill_url)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    # The first three characters of the id determine the bill type.
    bill_type = bill_types.get(bill_id[:3], "bill")

    bid = bill_id.replace(" ", "")
    subs = []
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    bill = Bill(
        bill_id,
        title=bill_desc,
        legislative_session=year,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)
    if bill_title:
        bill.add_title(bill_title)

    # Get the primary sponsor
    sponsor_node = bill_page.xpath('//a[@id="hlSponsor"]')[0]
    # bill_sponsor_link = sponsor_node.attrib.get('href')
    bill.add_sponsorship(
        sponsor_node.text_content(),
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
    if cosponsor_tag and cosponsor_tag[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # get the actions
    action_links = bill_page.xpath('//a[@id="hlAllActions"]')
    if action_links:
        self._parse_senate_actions(bill, action_links[0].attrib['href'])

    # full bill text versions are stored on a separate page
    versions_links = bill_page.xpath('//a[@id="hlFullBillText"]')
    if versions_links and versions_links[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, versions_links[0].attrib['href'])

    yield bill
def scrape_bill(self, row, session):
    """Scrape one Delaware bill from an API row plus its HTML detail page.

    Yields VoteEvents from scrape_votes, then the Bill itself.
    """
    bill_id = row["LegislationDisplayCode"]

    # Display codes like "HB 10 w/ HA 1" or "SS 1 for SB 5" carry amendment /
    # substitute markers; strip them off before classifying the bill.
    amendment = None
    substitute = None
    if bill_id.count(" ") > 1:
        if " w/ " in bill_id:
            self.info("Found amended bill `{}`".format(bill_id))
            bill_id, amendment = bill_id.split(" w/ ")
        # A bill can _both_ be amended and be substituted
        if " for " in bill_id:
            self.info("Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(" for ")
        if amendment is None and substitute is None:
            raise ValueError("unknown bill_id format: " + bill_id)

    bill_type = self.classify_bill(bill_id)
    chamber = "upper" if bill_id.startswith("S") else "lower"

    bill = Bill(
        identifier=bill_id,
        legislative_session=session,
        chamber=chamber,
        title=row["LongTitle"],
        classification=bill_type,
    )
    if row["Synopsis"]:
        bill.add_abstract(row["Synopsis"], "synopsis")
    if row["ShortTitle"]:
        bill.add_title(row["ShortTitle"], "short title")
    if row["SponsorPersonId"]:
        self.add_sponsor_by_legislator_id(bill, row["SponsorPersonId"], "primary")
    if substitute:
        bill.extras["substitute"] = substitute
    if amendment:
        bill.extras["amendment"] = amendment

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = "https://legis.delaware.gov/BillDetail?LegislationId={}".format(
        row["LegislationId"]
    )
    bill.add_source(html_url, note="text/html")
    html = self.lxmlize(html_url)

    # Sponsor links all share the LegislatorDetail URL prefix; the remainder
    # is the person id.
    person_prefix = "https://legis.delaware.gov/LegislatorDetail?personId="

    for href in html.xpath('//label[text()="Additional Sponsor(s):"]'
                           "/following-sibling::div/a/@href"):
        self.add_sponsor_by_legislator_id(
            bill, href.replace(person_prefix, ""), "primary")

    for href in html.xpath('//label[text()="Co-Sponsor(s):"]/'
                           "following-sibling::div/a/@href"):
        self.add_sponsor_by_legislator_id(
            bill, href.replace(person_prefix, ""), "cosponsor")

    for version_url in html.xpath(
            '//label[text()="Original Text:"]/following-sibling::div/a/@href'):
        bill.add_version_link("Bill Text", version_url,
                              media_type=self.mime_from_link(version_url))

    for fiscal in html.xpath('//div[contains(@class,"fiscalNote")]/a/@href'):
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row["LegislationId"])

    if row["HasAmendments"] is True:
        self.scrape_amendments(bill, row["LegislationId"])

    yield from self.scrape_votes(bill, row["LegislationId"], session)
    yield bill
def scrape(self, session=None, chamber=None):
    """Scrape Georgia legislation for a session via the legislature's SOAP API.

    Yields VoteEvent objects as they are encountered, then each Bill.
    ``chamber`` is accepted for interface compatibility but not used here.
    """
    # DocumentType letter suffix -> OCD bill classification.
    bill_type_map = {
        "B": "bill",
        "R": "resolution",
        "JR": "joint resolution",
        "CR": "concurrent resolution",
    }
    # First letter of a DocumentType / action code -> chamber.
    chamber_map = {
        "H": "lower",
        "S": "upper",
        "J": "joint",
        "E": "legislature",  # Effective date
    }
    # Site-specific action codes -> OCD action classifications
    # (None = recognized code with no classification).
    action_code_map = {
        "HI": None,
        "SI": None,
        "HH": None,
        "SH": None,
        "HPF": ["introduction"],
        "HDSAS": None,
        "SPF": ["introduction"],
        "HSR": ["reading-2"],
        "SSR": ["reading-2"],
        "HFR": ["reading-1"],
        "SFR": ["reading-1"],
        "HRECM": ["withdrawal", "referral-committee"],
        "SRECM": ["withdrawal", "referral-committee"],
        "SW&C": ["withdrawal", "referral-committee"],
        "HW&C": ["withdrawal", "referral-committee"],
        "HRA": ["passage"],
        "SRA": ["passage"],
        "HPA": ["passage"],
        "HRECO": None,
        "SPA": ["passage"],
        "HTABL": None,  # 'House Tabled' - what is this?
        "SDHAS": None,
        "HCFR": ["committee-passage-favorable"],
        "SCFR": ["committee-passage-favorable"],
        "HRAR": ["referral-committee"],
        "SRAR": ["referral-committee"],
        "STR": ["reading-3"],
        "SAHAS": None,
        "SE": ["passage"],
        "SR": ["referral-committee"],
        "HTRL": ["reading-3", "failure"],
        "HTR": ["reading-3"],
        "S3RLT": ["reading-3", "failure"],
        "HASAS": None,
        "S3RPP": None,
        "STAB": None,
        "SRECO": None,
        "SAPPT": None,
        "HCA": None,
        "HNOM": None,
        "HTT": None,
        "STT": None,
        "SRECP": None,
        "SCRA": None,
        "SNOM": None,
        "S2R": ["reading-2"],
        "H2R": ["reading-2"],
        "SENG": ["passage"],
        "HENG": ["passage"],
        "HPOST": None,
        "HCAP": None,
        "SDSG": ["executive-signature"],
        "SSG": ["executive-receipt"],
        "Signed Gov": ["executive-signature"],
        "HDSG": ["executive-signature"],
        "HSG": ["executive-receipt"],
        "EFF": None,
        "HRP": None,
        "STH": None,
        "HTS": None,
    }

    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)
    sid = SESSION_SITE_IDS[session]

    # backoff() retries the SOAP call; see its definition elsewhere in the file.
    legislation = backoff(self.lservice.GetLegislationForSession, sid)["LegislationIndex"]
    for leg in legislation:
        lid = leg["Id"]
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = [x for x in instrument["StatusHistory"][0]]

        # Reversed so actions are emitted in chronological order
        # (presumably the API returns newest-first — TODO confirm).
        actions = reversed([{
            "code": x["Code"],
            "action": x["Description"],
            "_guid": x["Id"],
            "date": x["Date"],
        } for x in history])

        guid = instrument["Id"]

        # A little bit hacky: split "HB"-style prefixes into chamber + type.
        bill_prefix = instrument["DocumentType"]
        bill_chamber = chamber_map[bill_prefix[0]]
        bill_type = bill_type_map[bill_prefix[1:]]

        bill_id = "%s %s" % (bill_prefix, instrument["Number"])
        if instrument["Suffix"]:
            bill_id += instrument["Suffix"]

        title = instrument["Caption"]
        description = instrument["Summary"]

        # Untitled instruments cannot be recorded; skip them.
        if title is None:
            continue

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_abstract(description, note="description")
        bill.extras = {"guid": guid}

        if instrument["Votes"]:
            for vote_ in instrument["Votes"]:
                # Each entry is a (key, value) pair; the value holds the VoteId.
                _, vote_ = vote_
                vote_ = backoff(self.vservice.GetVote, vote_[0]["VoteId"])

                vote = VoteEvent(
                    start_date=vote_["Date"].strftime("%Y-%m-%d"),
                    motion_text=vote_["Caption"] or "Vote on Bill",
                    chamber={
                        "House": "lower",
                        "Senate": "upper"
                    }[vote_["Branch"]],
                    result="pass" if vote_["Yeas"] > vote_["Nays"] else "fail",
                    classification="passage",
                    bill=bill,
                )
                vote.set_count("yes", vote_["Yeas"])
                vote.set_count("no", vote_["Nays"])
                vote.set_count("other", vote_["Excused"] + vote_["NotVoting"])

                vote.add_source(self.vsource)

                methods = {"Yea": "yes", "Nay": "no"}

                for vdetail in vote_["Votes"][0]:
                    whom = vdetail["Member"]
                    how = vdetail["MemberVoted"]
                    if whom["Name"] == "VACANT":
                        continue
                    # vote_name_pattern splits a member string into
                    # (name, district) groups.
                    name, district = vote_name_pattern.search(
                        whom["Name"]).groups()
                    vote.vote(methods.get(how, "other"), name, note=district)

                yield vote

        # Committees keyed by chamber, for relating committee actions below.
        ccommittees = defaultdict(list)
        committees = instrument["Committees"]
        if committees:
            for committee in committees[0]:
                ccommittees[{
                    "House": "lower",
                    "Senate": "upper"
                }[committee["Type"]]].append(committee["Name"])

        for action in actions:
            action_chamber = chamber_map[action["code"][0]]

            try:
                action_types = action_code_map[action["code"]]
            except KeyError:
                # Unknown code: log it and record the action unclassified.
                error_msg = "Code {code} for action {action} not recognized.".format(
                    code=action["code"], action=action["action"])
                self.logger.warning(error_msg)
                action_types = None

            committees = []
            if action_types and any(("committee" in x for x in action_types)):
                committees = [
                    str(x) for x in ccommittees.get(action_chamber, [])
                ]

            act = bill.add_action(
                action["action"],
                action["date"].strftime("%Y-%m-%d"),
                classification=action_types,
                chamber=action_chamber,
            )
            for committee in committees:
                act.add_related_entity(committee, "organization")
            act.extras = {"code": action["code"], "guid": action["_guid"]}

        # Authors are primary sponsors; plain Sponsors are secondary.
        sponsors = []
        if instrument["Authors"]:
            sponsors = instrument["Authors"]["Sponsorship"]
        if "Sponsors" in instrument and instrument["Sponsors"]:
            sponsors += instrument["Sponsors"]["Sponsorship"]

        sponsors = [(x["Type"], self.get_member(x["MemberId"])) for x in sponsors]

        for typ, sponsor in sponsors:
            name = "{First} {Last}".format(**dict(sponsor["Name"]))
            bill.add_sponsorship(
                name,
                entity_type="person",
                classification="primary" if "Author" in typ else "secondary",
                primary="Author" in typ,
            )

        for version in instrument["Versions"]["DocumentDescription"]:
            name, url, doc_id, version_id = [
                version[x] for x in ["Description", "Url", "Id", "Version"]
            ]
            link = bill.add_version_link(name, url, media_type="application/pdf")
            link["extras"] = {
                "_internal_document_id": doc_id,
                "_version_id": version_id,
            }

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(
            SOURCE_URL.format(**{
                "session": session,
                "bid": guid
            }))

        yield bill
def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
    """Scrape a single Illinois bill status page.

    Yields the Bill (metadata, sponsors, actions, versions) first, then
    any VoteEvents produced by ``scrape_votes``.

    :param chamber: originating chamber ('upper'/'lower')
    :param session: legislative session identifier
    :param doc_type: bill-id prefix, e.g. 'HB'; maps into DOC_TYPES
    :param url: status-page URL (contains DocNum=<n>)
    :param bill_type: optional override of the DOC_TYPES classification
    """
    try:
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
    except scrapelib.HTTPError as e:
        # Anything other than a 500 is unexpected; crash loudly so it gets noticed.
        assert (
            "500" in e.args[0]
        ), "Unexpected error when accessing page: {}".format(e)
        self.warning("500 error for bill page; skipping bill")
        return

    # bill id, title, summary
    bill_num = re.findall(r"DocNum=(\d+)", url)[0]
    bill_type = bill_type or DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num

    title = doc.xpath(
        '//span[text()="Short Description:"]/following-sibling::span[1]/'
        "text()"
    )[0].strip()
    summary = doc.xpath(
        '//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/'
        "text()"
    )[0].strip()

    bill = Bill(
        identifier=bill_id,
        legislative_session=session,
        title=title,
        classification=bill_type,
        chamber=chamber,
    )

    bill.add_abstract(summary, note="")
    bill.add_source(url)

    # sponsors
    sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
    # don't add just yet; we can make them better using action data
    # committee_actors maps committee source URLs to names; currently only
    # populated by the (disabled) committee-actor resolution below, but still
    # passed through to scrape_votes.
    committee_actors = {}

    # actions
    action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
    for date, actor, action_elem in group(action_tds, 3):
        date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y")
        date = self.localize(date).date()

        actor = actor.text_content()
        if actor == "House":
            actor_id = {"classification": "lower"}
        elif actor == "Senate":
            actor_id = {"classification": "upper"}

        action = action_elem.text_content()
        classification, related_orgs = _categorize_action(action)

        # NOTE(review): committee-actor resolution (rewriting actor_id to the
        # linked committee and recording it in committee_actors) was commented
        # out here; re-enable once the committee links are reliable.
        bill.add_action(
            action,
            date,
            organization=actor_id,
            classification=classification,
            related_entities=related_orgs,
        )
        if action.lower().find("sponsor") != -1:
            self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

    # now add sponsors
    # Renamed the loop variable from `chamber` so it no longer shadows the
    # method parameter of the same name.
    for spontype, sponsor, sponsor_chamber, official_type in sponsor_list:
        primary = official_type == "primary"
        if sponsor_chamber:
            bill.add_sponsorship(
                sponsor, spontype, "person", primary=primary, chamber=sponsor_chamber
            )
        else:
            # BUG FIX: the name and classification arguments were swapped here
            # (spontype, sponsor), unlike the call above; add_sponsorship takes
            # (name, classification, entity_type, ...).
            bill.add_sponsorship(sponsor, spontype, "person", primary=primary)

    # versions
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)
    yield bill

    votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
    yield from self.scrape_votes(session, bill, votes_url, committee_actors)
def scrape_bill(self, row, chamber, session):
    """Scrape one Delaware bill from an API row plus its HTML detail page.

    Skips substituted bills and stand-alone amendments; records a
    substitute's original id in ``bill.extras``. Yields VoteEvents (via
    scrape_votes) and finally the Bill.
    """
    bill_id = row['LegislationDisplayCode']

    # hack for empty StatusName
    statusless_bills = ['HA 2 to SS 1 for SB 5', 'HA 3 to SS 1 for SB 5']
    is_force_substitute = bill_id in statusless_bills \
        and row['StatusName'] is None

    # BUG FIX: a stray trailing backslash here used to glue this statement
    # onto the following `if`, which is a syntax error. Also guard against a
    # None StatusName so the `in` test cannot raise TypeError.
    is_substituted = is_force_substitute \
        or 'Substituted' in (row['StatusName'] or '')

    if is_substituted:
        # skip substituted bills, the replacement is picked up instead
        self.warning('skipping %s: %s', bill_id, row['StatusName'])
        return

    substitute = None

    if bill_id.count(' ') > 1:
        if 'w/' in bill_id or 'SA' in bill_id or 'HA' in bill_id:
            # TODO: re-evaluate if these should be separate bills
            self.warning('skipping amendment %s', bill_id)
            return
        elif ' for ' in bill_id:
            self.info(
                "Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(' for ')
        else:
            raise ValueError('unknown bill_id format: ' + bill_id)

    bill_type = self.classify_bill(bill_id)
    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)
    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')
    if substitute:
        bill.extras['substitute'] = substitute

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId'])
    bill.add_source(html_url, note='text/html')
    html = self.lxmlize(html_url)

    # Additional Sponsors: '//label[text()="Additional Sponsor(s):"]/following-sibling::div/a'
    additional_sponsors = html.xpath(
        '//label[text()="Additional Sponsor(s):"]'
        '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace(
            'https://legis.delaware.gov/LegislatorDetail?'
            'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    # CoSponsors: '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a'
    # BUG FIX: this xpath previously reused the "Additional Sponsor(s):" label
    # (a copy-paste error), so co-sponsors were never collected and additional
    # sponsors were added twice.
    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace(
            'https://legis.delaware.gov/LegislatorDetail?'
            'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath(
        '//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        # on_duplicate='error'
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)
    yield bill
def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
    """Scrape a single Illinois bill status page.

    Returns a ``(bill, votes)`` tuple where ``votes`` is the (lazy) result
    of ``scrape_votes``; returns None on a 500 from the status page.
    """
    try:
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
    except scrapelib.HTTPError as e:
        # Anything other than a 500 is unexpected; crash loudly.
        assert '500' in e.args[0], "Unexpected error when accessing page: {}".format(e)
        self.warning("500 error for bill page; skipping bill")
        return

    # bill id, title, summary
    # BUG FIX: pattern is now a raw string; '\d' in a plain string is an
    # invalid escape sequence (DeprecationWarning, an error in newer Pythons).
    bill_num = re.findall(r'DocNum=(\d+)', url)[0]
    bill_type = bill_type or DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num

    title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/'
                      'text()')[0].strip()
    summary = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/'
                        'text()')[0].strip()

    bill = Bill(identifier=bill_id,
                legislative_session=session,
                title=title,
                classification=bill_type,
                chamber=chamber)

    bill.add_abstract(summary, note='')
    bill.add_source(url)

    # sponsors
    sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
    # don't add just yet; we can make them better using action data

    # actions
    action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
    for date, actor, action_elem in group(action_tds, 3):
        date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y")
        date = self.localize(date).date()

        actor = actor.text_content()
        if actor == 'House':
            actor_id = {'classification': 'lower'}
        elif actor == 'Senate':
            actor_id = {'classification': 'upper'}

        action = action_elem.text_content()
        classification, related_orgs = _categorize_action(action)

        # Committee actions are attributed to the linked committee itself
        # rather than the whole chamber.
        if (related_orgs and any(c.startswith('committee') for c in classification)):
            source, = [a.get('href') for a in action_elem.xpath('a')
                       if 'committee' in a.get('href')]
            actor_id = {'sources__url': canonicalize_url(source),
                        'classification': 'committee'}

        bill.add_action(action, date,
                        organization=actor_id,
                        classification=classification,
                        related_entities=related_orgs)
        if action.lower().find('sponsor') != -1:
            self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

    # now add sponsors
    # Loop variable renamed from `chamber` so it no longer shadows the
    # method parameter.
    for spontype, sponsor, sponsor_chamber, official_type in sponsor_list:
        primary = official_type == 'primary'
        if sponsor_chamber:
            bill.add_sponsorship(sponsor, spontype, 'person',
                                 primary=primary,
                                 chamber=sponsor_chamber)
        else:
            # BUG FIX: name and classification were swapped here (spontype
            # first), unlike the call above; add_sponsorship takes
            # (name, classification, entity_type, ...).
            bill.add_sponsorship(sponsor, spontype, 'person',
                                 primary=primary)

    # versions
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)

    votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
    votes = self.scrape_votes(session, bill, votes_url)
    return bill, votes
def scrape(self, session=None, chamber=None):
    """Scrape Georgia legislation via the state's SOAP API.

    Fetches the session's legislation index, then the detail record for
    each instrument, yielding VoteEvents as they are encountered and one
    Bill per instrument. The ``chamber`` parameter is accepted but unused
    here — presumably kept for interface compatibility; TODO confirm.
    """
    # First letter of the DocumentType suffix -> OCD bill classification.
    bill_type_map = {
        'B': 'bill',
        'R': 'resolution',
        'JR': 'joint resolution',
        'CR': 'concurrent resolution',
    }
    # First letter of DocumentType / action code -> OCD chamber.
    chamber_map = {
        'H': 'lower',
        'S': 'upper',
        'J': 'joint',
        'E': 'legislature',  # Effective date
    }
    # State action codes -> OCD action classifications (None = unclassified).
    action_code_map = {
        'HI': None,
        'SI': None,
        'HH': None,
        'SH': None,
        'HPF': ['introduction'],
        'HDSAS': None,
        'SPF': ['introduction'],
        'HSR': ['reading-2'],
        'SSR': ['reading-2'],
        'HFR': ['reading-1'],
        'SFR': ['reading-1'],
        'HRECM': ['withdrawal', 'referral-committee'],
        'SRECM': ['withdrawal', 'referral-committee'],
        'SW&C': ['withdrawal', 'referral-committee'],
        'HW&C': ['withdrawal', 'referral-committee'],
        'HRA': ['passage'],
        'SRA': ['passage'],
        'HPA': ['passage'],
        'HRECO': None,
        'SPA': ['passage'],
        'HTABL': None,  # 'House Tabled' - what is this?
        'SDHAS': None,
        'HCFR': ['committee-passage-favorable'],
        'SCFR': ['committee-passage-favorable'],
        'HRAR': ['referral-committee'],
        'SRAR': ['referral-committee'],
        'STR': ['reading-3'],
        'SAHAS': None,
        'SE': ['passage'],
        'SR': ['referral-committee'],
        'HTRL': ['reading-3', 'failure'],
        'HTR': ['reading-3'],
        'S3RLT': ['reading-3', 'failure'],
        'HASAS': None,
        'S3RPP': None,
        'STAB': None,
        'SRECO': None,
        'SAPPT': None,
        'HCA': None,
        'HNOM': None,
        'HTT': None,
        'STT': None,
        'SRECP': None,
        'SCRA': None,
        'SNOM': None,
        'S2R': ['reading-2'],
        'H2R': ['reading-2'],
        'SENG': ['passage'],
        'HENG': ['passage'],
        'HPOST': None,
        'HCAP': None,
        'SDSG': ['executive-signature'],
        'SSG': ['executive-receipt'],
        'Signed Gov': ['executive-signature'],
        'HDSG': ['executive-signature'],
        'HSG': ['executive-receipt'],
        'EFF': None,
        'HRP': None,
        'STH': None,
        'HTS': None,
    }
    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)
    sid = SESSION_SITE_IDS[session]
    # backoff() retries the SOAP call on transient failures.
    legislation = backoff(
        self.lservice.GetLegislationForSession,
        sid
    )['LegislationIndex']
    for leg in legislation:
        lid = leg['Id']
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = [x for x in instrument['StatusHistory'][0]]

        # API returns newest-first; reverse into chronological order.
        actions = reversed([{
            'code': x['Code'],
            'action': x['Description'],
            '_guid': x['Id'],
            'date': x['Date']
        } for x in history])

        guid = instrument['Id']

        # A little bit hacky.
        # DocumentType is e.g. 'HB': first char is the chamber, the rest the type.
        bill_prefix = instrument['DocumentType']
        bill_chamber = chamber_map[bill_prefix[0]]
        bill_type = bill_type_map[bill_prefix[1:]]

        bill_id = '%s %s' % (
            bill_prefix,
            instrument['Number'],
        )
        if instrument['Suffix']:
            bill_id += instrument['Suffix']

        title = instrument['Caption']
        description = instrument['Summary']

        # Instruments without a caption cannot be recorded as Bills.
        if title is None:
            continue

        bill = Bill(
            bill_id, legislative_session=session,
            chamber=bill_chamber, title=title,
            classification=bill_type)
        bill.add_abstract(description, note='description')
        bill.extras = {'guid': guid}

        if instrument['Votes']:
            for vote_ in instrument['Votes']:
                # Each entry is a (key, value) pair; only the value is used,
                # and its first element carries the VoteId to fetch.
                _, vote_ = vote_
                vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                vote = VoteEvent(
                    start_date=vote_['Date'].strftime('%Y-%m-%d'),
                    motion_text=vote_['Caption'] or 'Vote on Bill',
                    chamber={'House': 'lower', 'Senate': 'upper'}[vote_['Branch']],
                    result='pass' if vote_['Yeas'] > vote_['Nays'] else 'fail',
                    classification='passage',
                    bill=bill,
                )
                vote.set_count('yes', vote_['Yeas'])
                vote.set_count('no', vote_['Nays'])
                vote.set_count('other', vote_['Excused'] + vote_['NotVoting'])
                vote.add_source(self.vsource)

                methods = {'Yea': 'yes', 'Nay': 'no'}

                # Individual member votes; anything not Yea/Nay counts as 'other'.
                for vdetail in vote_['Votes'][0]:
                    whom = vdetail['Member']
                    how = vdetail['MemberVoted']
                    vote.vote(methods.get(how, 'other'), whom['Name'])

                yield vote

        # Map chamber -> list of committee names the instrument sits in.
        ccommittees = defaultdict(list)
        committees = instrument['Committees']
        if committees:
            for committee in committees[0]:
                ccommittees[{
                    'House': 'lower',
                    'Senate': 'upper',
                }[committee['Type']]].append(committee['Name'])

        for action in actions:
            action_chamber = chamber_map[action['code'][0]]

            try:
                action_types = action_code_map[action['code']]
            except KeyError:
                error_msg = 'Code {code} for action {action} not recognized.'.format(
                    code=action['code'], action=action['action'])
                self.logger.warning(error_msg)
                action_types = None

            # NOTE: reuses (shadows) the `committees` variable from above;
            # here it holds the committees related to this single action.
            committees = []
            if action_types and any(('committee' in x for x in action_types)):
                committees = [str(x) for x in ccommittees.get(
                    action_chamber, [])]

            act = bill.add_action(
                action['action'], action['date'].strftime('%Y-%m-%d'),
                classification=action_types,
                chamber=action_chamber)
            for committee in committees:
                act.add_related_entity(committee, 'organization')
            act.extras = {
                'code': action['code'],
                'guid': action['_guid'],
            }

        sponsors = []
        if instrument['Authors']:
            sponsors = instrument['Authors']['Sponsorship']
        if 'Sponsors' in instrument and instrument['Sponsors']:
            sponsors += instrument['Sponsors']['Sponsorship']

        sponsors = [
            (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
        ]

        for typ, sponsor in sponsors:
            name = '{First} {Last}'.format(**dict(sponsor['Name']))
            bill.add_sponsorship(
                name,
                entity_type='person',
                classification='primary' if 'Author' in typ else 'secondary',
                primary='Author' in typ,
            )

        for version in instrument['Versions']['DocumentDescription']:
            name, url, doc_id, version_id = [
                version[x] for x in [
                    'Description',
                    'Url',
                    'Id',
                    'Version'
                ]
            ]
            # NOTE(review): version links are currently NOT added — the loop
            # only unpacks the record. The disabled code below is kept to show
            # the intended behavior; confirm before re-enabling.
            # link = bill.add_version_link(
            #     name, url, media_type='application/pdf')
            # link['extras'] = {
            #     '_internal_document_id': doc_id,
            #     '_version_id': version_id
            # }

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(SOURCE_URL.format(**{
            'session': session,
            'bid': guid,
        }))

        yield bill
def scrape_bill(self, session, bill_id, chamber):
    """Scrape one Massachusetts bill detail page and yield the Bill.

    Example page: https://malegislature.gov/Bills/189/SD2739
    Returns False (yielding nothing) on server errors, missing titles,
    or unsupported bill types.
    """
    bill_url = "https://malegislature.gov/Bills/{}/{}".format(
        self.replace_non_digits(session), bill_id
    )

    try:
        response = self.get(bill_url)
        self.info("GET (with `requests`) - {}".format(bill_url))
    except requests.exceptions.RequestException:
        self.warning(u"Server Error on {}".format(bill_url))
        return False

    page = lxml.html.fromstring(response.text)

    # A page without the "followable" header is an error page in disguise.
    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u"Server Error on {}".format(bill_url))
        return False

    # The state website will periodically miss a few bills' titles for a few days
    # These titles will be extant on the bill list page, but missing on the bill detail page
    # The titles are eventually populated
    title_nodes = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')
    if not title_nodes:
        self.warning("Couldn't find title for {}; skipping".format(bill_id))
        return False
    bill_title = title_nodes[0]

    # Only plain House/Senate bills and drafts are supported.
    if re.sub("[0-9]", "", bill_id) not in ["H", "HD", "S", "SD", "SRes"]:
        self.warning("Unsupported bill type for {}; skipping".format(bill_id))
        return False

    if "SRes" in bill_id:
        bill_id = bill_id.replace("SRes", "SR")

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification="bill",
    )

    # The "pinslip" paragraph, when present, serves as the bill summary.
    pinslip = page.xpath('//p[@id="pinslip"]/text()')
    if pinslip and pinslip[0]:
        bill.add_abstract(pinslip[0], "summary")

    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor_texts = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        "following-sibling::dd/descendant-or-self::*/text()[normalize-space()]"
    )
    if sponsor_texts:
        bill.add_sponsorship(
            sponsor_texts[0].strip(),
            classification="primary",
            primary=True,
            entity_type="person",
        )

    self.scrape_cosponsors(bill, bill_url)

    version = page.xpath(
        "//div[contains(@class, 'modalBtnGroup')]/"
        "a[contains(text(), 'Download PDF') and not(@disabled)]/@href"
    )
    if version:
        bill.add_version_link(
            "Bill Text",
            "https://malegislature.gov{}".format(version[0]),
            media_type="application/pdf",
        )

    # yield back votes and bill
    # XXX
    yield from self.scrape_actions(bill, bill_url, session)
    yield bill
def scrape(self, session=None):
    """Scrape Indiana bills for a session from the IGA API.

    Fetches the paginated bill list, then per-bill detail, actions,
    subjects, abstract, and versions (versions also yield vote events via
    ``deal_with_version``). Yields one Bill per API record.
    """
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # Bill-id prefix -> OCD classification + website URL path segment.
    self._bill_prefix_map = {
        "HB": {
            "type": "bill",
            "url_segment": "bills/house"
        },
        "HR": {
            "type": "resolution",
            "url_segment": "resolutions/house/simple"
        },
        "HCR": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/house/concurrent",
        },
        "HJR": {
            "type": "joint resolution",
            "url_segment": "resolutions/house/joint",
        },
        "HC": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/house/concurrent",
        },
        "HJ": {
            "type": "joint resolution",
            "url_segment": "resolutions/house/joint",
        },
        "SB": {
            "type": "bill",
            "url_segment": "bills/senate"
        },
        "SR": {
            "type": "resolution",
            "url_segment": "resolutions/senate/simple"
        },
        "SCR": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/senate/concurrent",
        },
        "SJR": {
            "type": "joint resolution",
            "url_segment": "resolutions/senate/joint",
        },
        "SC": {
            "type": "concurrent resolution",
            "url_segment": "resolutions/senate/concurrent",
        },
        "SJ": {
            "type": "joint resolution",
            "url_segment": "resolutions/senate/joint",
        },
    }

    api_base_url = "https://api.iga.in.gov"
    proxy = {"url": "http://in-proxy.openstates.org"}

    # ah, indiana. it's really, really hard to find
    # pdfs in their web interface. Super easy with
    # the api, but a key needs to be passed
    # in the headers. To make these documents
    # viewable to the public and our scrapers,
    # sunlight's put up a proxy service at this link
    # using our api key for pdf document access.

    client = ApiClient(self)
    r = client.get("bills", session=session)
    all_pages = client.unpaginate(r)

    for b in all_pages:
        bill_id = b["billName"]

        # Build the display id by inserting a space before the numeric part
        # and stripping leading zeros, e.g. "HB1001" -> "HB 1001".
        # ROBUSTNESS FIX: fall back to the raw id if there is no digit at all
        # (previously disp_bill_id would be unbound -> NameError).
        disp_bill_id = bill_id
        for idx, char in enumerate(bill_id):
            if char.isdigit():
                disp_bill_id = bill_id[:idx] + " " + str(int(bill_id[idx:]))
                break

        bill_link = b["link"]
        api_source = api_base_url + bill_link
        try:
            bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
        except scrapelib.HTTPError:
            self.logger.warning("Bill could not be accessed. Skipping.")
            continue

        title = bill_json["description"]
        if title == "NoneNone":
            title = None
        # sometimes description is blank
        # if that's the case, we can check to see if
        # the latest version has a short description
        if not title:
            title = bill_json["latestVersion"]["shortDescription"]
        # and if that doesn't work, use the bill_id but throw a warning
        if not title:
            title = bill_id
            self.logger.warning(
                "Bill is missing a title, using bill id instead.")

        bill_prefix = self._get_bill_id_components(bill_id)[0]
        original_chamber = ("lower" if bill_json["originChamber"].lower() ==
                            "house" else "upper")
        bill_type = self._bill_prefix_map[bill_prefix]["type"]
        bill = Bill(
            disp_bill_id,
            legislative_session=session,
            chamber=original_chamber,
            title=title,
            classification=bill_type,
        )

        bill.add_source(self._get_bill_url(session, bill_id))
        bill.add_source(api_source)

        # sponsors
        for s in bill_json["authors"]:
            bill.add_sponsorship(
                classification="author",
                name=self._get_name(s),
                entity_type="person",
                primary=True,
            )
        for s in bill_json["coauthors"]:
            bill.add_sponsorship(
                classification="coauthor",
                name=self._get_name(s),
                entity_type="person",
                primary=False,
            )
        for s in bill_json["sponsors"]:
            bill.add_sponsorship(
                classification="sponsor",
                name=self._get_name(s),
                entity_type="person",
                primary=True,
            )
        for s in bill_json["cosponsors"]:
            bill.add_sponsorship(
                classification="cosponsor",
                name=self._get_name(s),
                entity_type="person",
                primary=False,
            )

        # actions
        action_link = bill_json["actions"]["link"]
        api_source = api_base_url + action_link
        try:
            actions = client.get("bill_actions", session=session,
                                 bill_id=bill_id.lower())
        except scrapelib.HTTPError:
            self.logger.warning("Could not find bill actions page")
            actions = {"items": []}

        for a in actions["items"]:
            action_desc = a["description"]
            if "governor" in action_desc.lower():
                action_chamber = "executive"
            elif a["chamber"]["name"].lower() == "house":
                action_chamber = "lower"
            else:
                action_chamber = "upper"

            date = a["date"]
            if not date:
                self.logger.warning("Action has no date, skipping")
                continue
            # convert time to pupa fuzzy time
            date = date.replace("T", " ")
            # TODO: if we update pupa to accept datetimes we can drop this line
            date = date.split()[0]

            # Classify the action by keyword; multiple classifications may apply.
            action_type = []
            d = action_desc.lower()
            committee = None
            reading = False
            if "first reading" in d:
                action_type.append("reading-1")
                reading = True
            if "second reading" in d or "reread second time" in d:
                action_type.append("reading-2")
                reading = True
            if "third reading" in d or "reread third time" in d:
                action_type.append("reading-3")
                if "passed" in d:
                    action_type.append("passage")
                if "failed" in d:
                    action_type.append("failure")
                reading = True
            if "adopted" in d and reading:
                action_type.append("passage")
            if ("referred" in d and "committee on" in d
                    or "reassigned" in d and "committee on" in d):
                committee = d.split("committee on")[-1].strip()
                action_type.append("referral-committee")
            if "committee report" in d:
                if "pass" in d:
                    action_type.append("committee-passage")
                if "fail" in d:
                    action_type.append("committee-failure")
            if "amendment" in d and "without amendment" not in d:
                if "pass" in d or "prevail" in d or "adopted" in d:
                    action_type.append("amendment-passage")
                # BUG FIX: this condition was `if "fail" or "out of order" in d:`
                # — a non-empty literal is always truthy, so *every* amendment
                # action was tagged amendment-failure.
                if "fail" in d or "out of order" in d:
                    action_type.append("amendment-failure")
                if "withdraw" in d:
                    action_type.append("amendment-withdrawal")
            if "signed by the governor" in d:
                action_type.append("executive-signature")

            if len(action_type) == 0:
                # calling it other and moving on with a warning
                self.logger.warning(
                    "Could not recognize an action in '{}'".format(action_desc))
                action_type = None

            a = bill.add_action(
                chamber=action_chamber,
                description=action_desc,
                date=date,
                classification=action_type,
            )
            if committee:
                a.add_related_entity(committee, entity_type="organization")

        # subjects
        subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
        for subject in subjects:
            bill.add_subject(subject)

        # Abstract
        if bill_json["latestVersion"]["digest"]:
            bill.add_abstract(bill_json["latestVersion"]["digest"], note="Digest")

        # versions and votes
        for version in bill_json["versions"][::-1]:
            try:
                version_json = client.get(
                    "bill_version",
                    session=session,
                    bill_id=version["billName"],
                    version_id=version["printVersionName"],
                )
            except scrapelib.HTTPError:
                self.logger.warning("Bill version does not seem to exist.")
                continue

            yield from self.deal_with_version(version_json, bill, bill_id,
                                              original_chamber, session, proxy)

        yield bill
def _parse_senate_billpage(self, bill_url, year):
    """Parse one Missouri Senate bill detail page and yield the Bill.

    Reads the bill number/title/description labels from the page,
    classifies the bill by its identifier prefix, attaches any subjects
    collected earlier, then delegates cosponsors, actions, and version
    links to the dedicated helper parsers.
    """
    page = self.lxmlize(bill_url)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    number = page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    long_title = page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    brief_desc = page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    # Classify by the first three characters of the identifier, falling
    # back to a plain bill for unknown prefixes.
    classification = bill_types.get(number[:3], "bill")

    subjects = []
    compact_id = number.replace(" ", "")
    if compact_id in self._subjects:
        subjects = self._subjects[compact_id]
        self.info("With subjects for this bill")
        self.info(compact_id)

    bill = Bill(
        number,
        title=brief_desc,
        chamber='upper',
        legislative_session=year,
        classification=classification,
    )
    bill.subject = subjects
    bill.add_abstract(brief_desc, note='abstract')
    bill.add_source(bill_url)
    if long_title:
        bill.add_title(long_title)

    # Get the primary sponsor
    sponsor_name = page.xpath('//a[@id="hlSponsor"]')[0].text_content()
    bill.add_sponsorship(
        sponsor_name,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_links = page.xpath('//a[@id="hlCoSponsors"]')
    if cosponsor_links and cosponsor_links[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_links[0].attrib['href'])

    # get the actions
    action_links = page.xpath('//a[@id="hlAllActions"]')
    if action_links:
        self._parse_senate_actions(bill, action_links[0].attrib['href'])

    # full bill text versions are stored on a separate page
    version_links = page.xpath('//a[@id="hlFullBillText"]')
    if version_links and version_links[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, version_links[0].attrib['href'])

    yield bill
def scrape_matter(self, matter_link, sess):
    """Scrape one legislative "matter" detail page into a Bill.

    Maps the site's file types onto OCD classifications, records sponsors,
    subjects, the long title as an abstract, and the action table; yields
    the Bill (or returns early if the matter has no usable title).
    """
    # Site "File Type" -> OCD classification.
    matter_types = {
        "Additions": "other",
        "Administrative Order": "order",
        "Annual Evaluation": "other",
        "Bid Advertisement": "other",
        "Bid Awards": "other",
        "Bid Contract": "contract",
        "Bid Protest": "other",
        "Bid Rejection": "other",
        "Birthday Scroll": "commemoration",
        "Certificate of Appreciation": "commemoration",
        "Change Order": "order",
        "Citizen's Presentation": "other",
        "Commendation": "commemoration",
        "Conflict Waiver": "other",
        "Congratulatory Certificate": "commemoration",
        "Deferrals": "other",
        "Discussion Item": "other",
        "Distinguished Visitor": "other",
        "Joint Meeting/Workshop": "other",
        "Mayoral Veto": "other",
        "Miscellaneous": "other",
        "Nomination": "nomination",
        "Oath of Office": "other",
        "Omnibus Reserve": "bill",
        "Ordinance": "ordinance",
        "Plaque": "commemoration",
        "Presentation": "other",
        "Proclamation": "proclamation",
        "Professional Service Agreement": "contract",
        "Public Hearing": "other",
        "Report": "other",
        "Request for Proposals": "other",
        "Request for Qualifications": "other",
        "Request to Advertise": "other",
        "Resolution": "resolution",
        "Resolution of Sympathy": "resolution",
        "Service Awards": "commemoration",
        "Special Item": "other",
        "Special Presentation": "other",
        "Supplement": "other",
        "Swearing-In": "other",
        "Time Sensitive Items": "other",
        "Withdrawals": "other",
        "Workshop Item": "other",
        "Zoning": "other",
        "Zoning Resolution": "resolution"
    }
    matter_doc = self.lxmlize(matter_link)
    info_dict = self.matter_table_to_dict(matter_doc)

    # we're going to use the year of the intro date as the session
    # until/unless we come up with something better
    # (the parse also validates the "Introduced" date format).
    intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
    session = sess["identifier"]
    category = matter_types[info_dict["File Type"]]

    if 'File Name' in info_dict:
        title = info_dict["File Name"]
    elif "Title" in info_dict and info_dict["Title"].strip():
        title = info_dict["Title"].strip()
    else:
        self.warning("bill has no title")
        return

    # 'other' is not a valid OCD classification, so omit it and let Bill
    # use its default; anything else is passed through.
    if category == 'other':
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title)
    else:
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title,
                    classification=category)

    for spons in info_dict["Sponsors"]:
        if spons == "NONE":
            continue
        # Sponsors are usually "Name, Type"; a bare name means plain sponsor.
        try:
            name, spons_type = spons.rsplit(",", 1)
        except ValueError:
            name = spons
            spons_type = "Sponsor"
        primary = "Prime Sponsor" in spons_type
        entity = "person"
        if "committee" in name:
            # BUG FIX: this assigned the undefined name `committee`
            # (NameError at runtime); committees are organizations.
            entity = "organization"
        bill.add_sponsorship(name, spons_type, entity, primary)

    if "Indexes" in info_dict:
        for subj in info_dict["Indexes"]:
            if subj.strip() and subj.strip() != "NONE":
                bill.add_subject(subj.strip())

    if "Title" in info_dict and info_dict["Title"].strip():
        # Typo fix: note string had a stray trailing apostrophe.
        note = "bill's long title"
        if ("Note" in info_dict and info_dict["Note"].strip()):
            note = info_dict["Note"]
        bill.add_abstract(abstract=info_dict["Title"], note=note)

    self.process_action_table(matter_doc, bill)

    bill.add_source(matter_link, note='web')

    yield bill
def scrape(self):
    """Scrape NYC Council legislation (2014 onward) from Legistar.

    Yields VoteEvents as actions with votes are encountered, then the
    Bill itself. The legislative session is derived from the earliest
    action date rather than taken from the listing.
    """
    for leg_summary in self.legislation(created_after=datetime.datetime(2014, 1, 1)):
        leg_type = BILL_TYPES[leg_summary['Type']]

        # NOTE: Legistar column headers use non-breaking spaces, hence the
        # \xa0 in keys like 'File\xa0#' and 'Action\xa0By'.
        bill = Bill(identifier=leg_summary['File\xa0#'],
                    title=leg_summary['Title'],
                    legislative_session=None,  # filled in from history below
                    classification=leg_type,
                    from_organization={"name": "New York City Council"})
        bill.add_source(leg_summary['url'])

        leg_details = self.legDetails(leg_summary['url'])
        history = self.history(leg_summary['url'])

        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')

        if 'Summary' in leg_details:
            bill.add_abstract(leg_details['Summary'], note='')

        if leg_details['Law number']:
            bill.add_identifier(leg_details['Law number'],
                                note='law number')

        for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
            sponsor, sponsorship_type, primary = sponsorship
            bill.add_sponsorship(sponsor, sponsorship_type,
                                 'person', primary,
                                 entity_id=make_pseudo_id(name=sponsor))

        for attachment in leg_details.get('Attachments', []):
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")

        # Materialize the history generator: we need it twice (session
        # derivation and the action loop).
        history = list(history)

        if history:
            earliest_action = min(self.toTime(action['Date'])
                                  for action in history)
            bill.legislative_session = self.sessions(earliest_action)
        else:
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for action in history:
            action_description = action['Action']
            if not action_description:
                continue

            action_class = ACTION_CLASSIFICATION[action_description]
            action_date = self.toDate(action['Date'])
            responsible_org = action['Action\xa0By']

            # Normalize actor names to the canonical organizations.
            if responsible_org == 'City Council':
                responsible_org = 'New York City Council'
            elif responsible_org == 'Administration':
                responsible_org = 'Mayor'

            # Town-hall meetings are not actions by a real organization.
            if responsible_org == 'Town Hall Meeting':
                continue
            else:
                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=action_class)

            if 'url' in action['Action\xa0Details']:
                action_detail_url = action['Action\xa0Details']['url']
                if action_class == 'committee-referral':
                    # Resolve which committee the bill was referred to from
                    # the action detail text ("... to the <Committee>").
                    action_details = self.actionDetails(action_detail_url)
                    referred_committee = \
                        action_details['Action text'].rsplit(' to the ', 1)[-1]
                    act.add_related_entity(
                        referred_committee, 'organization',
                        entity_id=make_pseudo_id(name=referred_committee))

                result, votes = self.extractVotes(action_detail_url)
                if votes:
                    action_vote = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action_description,
                        organization={'name': responsible_org},
                        classification=action_class,
                        start_date=action_date,
                        result=result,
                        bill=bill)
                    action_vote.add_source(action_detail_url)
                    for option, voter in votes:
                        action_vote.vote(option, voter)
                    yield action_vote

        text = self.text(leg_summary['url'])
        if text:
            bill.extras = {'local_classification': leg_summary['Type'],
                           'full_text': text}
        else:
            bill.extras = {'local_classification': leg_summary['Type']}

        yield bill
def scrape(self):
    """Yield OCD Bill objects for Toronto council agenda items in the configured date window."""
    for agenda_item in self.agendaItems(date_from=self.start_date, date_to=self.end_date):
        # TODO: Add agenda_item type to OCD
        leg_type = "bill"

        title = agenda_item["Title"].replace("\n", " ")
        # An optional " - by Mayor X, seconded by Councillor Y" suffix names
        # the mover and seconder; all four trailing groups are None when the
        # suffix is absent.
        title_re = re.compile(
            "^(.+?)(?: - (?:by )?((?:Deputy )?Mayor|Councillor) (.+), seconded by ((?:Deputy )?Mayor|Councillor) (.+))?$"
        )
        title, primary_role, primary_sponsor, secondary_role, secondary_sponsor = re.match(
            title_re, title).groups()

        b = Bill(
            identifier=agenda_item["Item No."],
            title=title,
            legislative_session=None,
            classification=leg_type,
            from_organization={"name": self.jurisdiction.name},
        )
        b.add_source(agenda_item["url"], note="web")

        if primary_sponsor and secondary_sponsor:
            b.add_sponsorship(primary_sponsor, "mover", "person", True)
            b.add_sponsorship(secondary_sponsor, "seconder", "person", False)

        # TODO: Fake session for now
        b.legislative_session = "2014-2018"

        agenda_item_versions = self.agendaItemVersions(agenda_item["url"])
        # Use one version's full_text (will be most recent)
        b.extras["full_text"] = agenda_item_versions[0]["full_text"]

        for version in agenda_item_versions:
            action_date = self.toDate(version["date"])

            if "Summary" in version["sections"]:
                # TODO: Investigate whether these vary between versions, as
                # we perhaps don't need to add one for each
                b.add_abstract(version["sections"]["Summary"], note="", date=action_date)

            if not version["action"]:
                continue
            # Skip pseudo-actions that are just a timestamp, e.g. "9:30 AM".
            if re.match(r"\d+:\d+ [A|P]M", version["action"]):
                continue

            action_description = version["action"]
            responsible_org = version["responsible_org"]
            action_class = ACTION_CLASSIFICATION.get(version["action"])

            def is_recommendation(version):
                # Any section whose name mentions "Recommendations" marks a
                # committee recommendation version.
                return any("Recommendations" in s
                           for s in version["sections"].keys())

            if responsible_org == "City Council":
                responsible_org = self.jurisdiction.name
            else:
                # Non-council actors are committees; refine the classification.
                if action_class == "passage":
                    action_class = "committee-passage"
                if is_recommendation(version):
                    action_class = "committee-passage-favorable"

            b.add_action(
                action_description,
                action_date,
                organization={"name": responsible_org},
                classification=action_class,
            )

        yield b
def scrape_bill(self, bill_id):
    """Convert one legacy OpenStates API bill into OCD Bill/VoteEvent objects.

    Pops every key off the legacy payload as it is consumed; the trailing
    ``assert not old`` guarantees no field was silently dropped. Yields each
    converted VoteEvent, then the Bill.
    """
    old = self.api('bills/' + bill_id + '?')

    # not needed
    old.pop('id')
    old.pop('state')
    old.pop('level', None)
    old.pop('country', None)
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('action_dates')
    old.pop('+bill_type', None)
    old.pop('+subject', None)
    old.pop('+scraped_subjects', None)
    old.pop('subjects', [])

    classification = old.pop('type')

    # ca weirdness: these tags ride along in 'type' but are not classifications.
    if 'fiscal committee' in classification:
        classification.remove('fiscal committee')
    if 'urgency' in classification:
        classification.remove('urgency')
    if 'local program' in classification:
        classification.remove('local program')
    if 'tax levy' in classification:
        classification.remove('tax levy')

    # These categories have no OCD equivalent; skip the bill entirely.
    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return

    # Per-state renames onto valid OCD classifications.
    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if classification == ['concurrent memorial resolution'] and self.state == 'ar':
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']
    if classification == ['address'] and self.state == 'nh':
        classification = ['resolution']

    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'

    chamber = old.pop('chamber')
    if self.state in ('ne', 'dc'):
        chamber = 'legislature'
    elif chamber in ('joint', 'conference'):
        chamber = 'legislature'

    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)

    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')

    for title in old.pop('alternate_titles'):
        new.add_title(title)

    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'],
                             media_type=doc.pop('mimetype', ''))

    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)

    for spon in old.pop('sponsors'):
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')

    for act in old.pop('actions'):
        actor = act['actor']
        if actor.lower() in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif actor.lower() == 'house' or (actor.lower().startswith('lower (') and
                                          self.state == 'ca'):
            actor = 'lower'
        # BUG FIX: the second literal used to be 'upper`' (stray backtick),
        # so a plain 'Upper'/'upper' actor never matched this branch.
        elif actor.lower() in ('senate', 'upper') or (actor.lower().startswith('upper (') and
                                                      self.state == 'ca'):
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst', 'Became Law w',
                       'conference') or (actor.lower().startswith('legislature (') and
                                         self.state == 'ca'):
            actor = 'legislature'

        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'

        # nebraska & DC
        if actor in ('upper', 'council') and self.state in ('ne', 'dc'):
            actor = 'legislature'

        if act['action']:
            newact = new.add_action(act['action'], act['date'][:10], chamber=actor,
                                    classification=[action_types[c] for c in act['type']
                                                    if c != 'other'])
            # renamed from 're' to avoid shadowing the re module
            for entity in act.get('related_entities', []):
                if entity['type'] == 'committee':
                    entity['type'] = 'organization'
                elif entity['type'] == 'legislator':
                    entity['type'] = 'person'
                newact.add_related_entity(entity['name'], entity['type'])

    for comp in old.pop('companions', []):
        # Only these states have trustworthy companion data.
        if self.state in ('nj', 'ny', 'mn'):
            rtype = 'companion'
            new.add_related_bill(comp['bill_id'], comp['session'], rtype)

    for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids', []):
        new.add_identifier(abid)

    # generic OpenStates stuff (renamed from 'id' to avoid shadowing the builtin)
    for os_id in old.pop('all_ids'):
        new.add_identifier(os_id, scheme='openstates')

    for source in old.pop('sources'):
        source.pop('retrieved', None)
        new.add_source(**source)

    ext_title = old.pop('+extended_title', None)
    if ext_title:
        new.add_title(ext_title, note='Extended Title')
    official_title = old.pop('+official_title', None)
    if official_title:
        new.add_title(official_title, note='Official Title')

    # Catch-all fields preserved verbatim in extras (leading '+' stripped).
    to_extras = ['+status', '+final_disposition', '+volume_chapter', '+ld_number',
                 '+referral', '+companion', '+description', '+fiscal_note_probable:',
                 '+preintroduction_required:', '+drafter', '+category:', '+chapter',
                 '+requester', '+transmittal_date:', '+by_request_of',
                 '+bill_draft_number:', '+bill_lr', '+bill_url', '+rcs_num',
                 '+fiscal_note', '+impact_clause', '+fiscal_notes', '+short_title',
                 '+type_', '+conference_committee', 'conference_committee',
                 '+companion_bill_ids', '+additional_information']
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # votes
    vote_no = 1
    for vote in old.pop('votes'):
        vote.pop('id')
        vote.pop('state')
        vote.pop('bill_id')
        vote.pop('bill_chamber', None)
        vote.pop('+state', None)
        vote.pop('+country', None)
        vote.pop('+level', None)
        vote.pop('+vacant', None)
        vote.pop('+not_voting', None)
        vote.pop('+amended', None)
        vote.pop('+excused', None)
        vote.pop('+NV', None)
        vote.pop('+AB', None)
        vote.pop('+P', None)
        vote.pop('+V', None)
        vote.pop('+E', None)
        vote.pop('+EXC', None)
        vote.pop('+EMER', None)
        vote.pop('+present', None)
        vote.pop('+absent', None)
        vote.pop('+seconded', None)
        vote.pop('+moved', None)
        vote.pop('+vote_type', None)
        vote.pop('+actual_vote', None)
        vote.pop('+skip_votes', None)
        vote.pop('vote_id')
        vote.pop('+bill_chamber', None)
        vote.pop('+session', None)
        vote.pop('+bill_id', None)
        vote.pop('+bill_session', None)
        vote.pop('committee', None)
        vote.pop('committee_id', None)

        vtype = vote.pop('type', 'passage')
        if vtype == 'veto_override':
            vtype = ['veto-override']
        elif vtype == 'amendment':
            vtype = ['amendment-passage']
        elif vtype == 'other':
            vtype = ''
        else:
            vtype = ['bill-passage']

        # most states need identifiers for uniqueness, just do it everywhere
        identifier = vote['date'] + '-' + str(vote_no)
        vote_no += 1

        chamber = vote.pop('chamber')
        if chamber == 'upper' and self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber == 'joint':
            chamber = 'legislature'

        newvote = VoteEvent(legislative_session=vote.pop('session'),
                            motion_text=vote.pop('motion'),
                            result='pass' if vote.pop('passed') else 'fail',
                            chamber=chamber,
                            start_date=vote.pop('date'),
                            classification=vtype,
                            bill=new,
                            identifier=identifier)
        for vt in ('yes', 'no', 'other'):
            newvote.set_count(vt, vote.pop(vt + '_count'))
            for name in vote.pop(vt + '_votes'):
                newvote.vote(vt, name['name'])

        for source in vote.pop('sources'):
            source.pop('retrieved', None)
            newvote.add_source(**source)

        if not newvote.sources:
            newvote.sources = new.sources

        to_extras = ['+record', '+method', 'method', '+filename', 'record',
                     '+action', '+location', '+rcs_num', '+type_', '+threshold',
                     '+other_vote_detail', '+voice_vote']
        for k in to_extras:
            v = vote.pop(k, None)
            if v:
                newvote.extras[k.replace('+', '')] = v

        # Anything left in the vote dict is an unhandled field.
        assert not vote, vote.keys()

        yield newvote

    # Likewise, anything left on the bill payload is an unhandled field.
    assert not old, old.keys()

    yield new
def _parse_senate_billpage(self, bill_url, year):
    """Scrape one Senate bill detail page and yield the resulting Bill."""
    page = self.lxmlize(bill_url)

    # Core fields come from labelled elements on the detail page.
    # TODO probably still needs to be fixed
    bill_id = page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    # The first three characters (e.g. "SB ", "SJR") select the bill type.
    bill_type = bill_types.get(bill_id[:3], "bill")

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    # Placeholder rows are junk; drop them entirely.
    if bid == 'XXXXXX':
        self.info("Skipping Junk Bill")
        return

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)
    if bill_title:
        bill.add_title(bill_title)

    # Primary sponsor is always present on the page.
    bill.add_sponsorship(
        page.xpath('//a[@id="hlSponsor"]')[0].text_content(),
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # Cosponsors live on their own page, when one is linked.
    cosponsor_links = page.xpath('//a[@id="hlCoSponsors"]')
    if cosponsor_links and cosponsor_links[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_links[0].attrib['href'])

    # Actions.
    action_links = page.xpath('//a[@id="hlAllActions"]')
    if action_links:
        self._parse_senate_actions(bill, action_links[0].attrib['href'])

    # Bill text versions are stored on a separate page.
    version_links = page.xpath('//a[@id="hlFullBillText"]')
    if version_links and version_links[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, version_links[0].attrib['href'])

    # Adopted amendments are recorded as additional versions.
    for link in page.xpath('//a[contains(@href,"ShowAmendment.asp")]'):
        label = link.xpath('string(.)').strip()
        if 'adopted' in label.lower():
            bill.add_version_link(label, link.xpath('@href')[0],
                                  media_type='application/pdf',
                                  on_duplicate='ignore')

    yield bill
def scrape_bill(self, bill_num, session):
    """Scrape one Wyoming bill from the LSO JSON API.

    Builds the Bill with its titles, actions, versions, documents,
    amendments, sponsors and extras, yields any roll-call VoteEvents,
    then yields the Bill.
    """
    chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}

    # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
        '{}?calendarDate='.format(
            session, bill_num)
    response = self.get(bill_json_url)
    bill_json = json.loads(response.content.decode('utf-8'))

    # BUG FIX: bill numbers look like 'HB0001' / 'SF0050', so the first
    # character identifies the chamber. The previous test
    # (`if bill_json['bill'][0]`) was always truthy for a non-empty id,
    # which classified every bill — Senate files included — as 'lower'.
    chamber = 'lower' if bill_json['bill'][0] == 'H' else 'upper'

    bill = Bill(identifier=bill_json['bill'],
                legislative_session=session,
                title=bill_json['catchTitle'],
                chamber=chamber,
                classification="bill",
                )

    bill.add_title(bill_json['billTitle'])

    source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, bill_json['bill'])
    bill.add_source(source_url)

    for action_json in bill_json['billActions']:
        utc_action_date = self.parse_local_date(action_json['statusDate'])

        actor = None
        if action_json['location'] and action_json['location'] in chamber_map:
            actor = chamber_map[action_json['location']]

        action = bill.add_action(
            chamber=actor,
            description=action_json['statusMessage'],
            date=utc_action_date,
            classification=categorize_action(action_json['statusMessage']),
        )

        action.extras = {
            'billInformationID': action_json['billInformationID']}

    if bill_json['introduced']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])
        bill.add_version_link(note="Introduced",
                              url=url,
                              media_type="application/pdf"  # optional but useful!
                              )

    if bill_json['enrolledAct']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])
        bill.add_version_link(note="Enrolled",
                              url=url,
                              media_type="application/pdf"  # optional but useful!
                              )

    if bill_json['fiscalNote']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])
        bill.add_document_link(note="Fiscal Note",
                               url=url,
                               media_type="application/pdf"  # optional but useful!
                               )

    if bill_json['digest']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])
        bill.add_document_link(note="Bill Digest",
                               url=url,
                               media_type="application/pdf"  # optional but useful!
                               )

    if bill_json['vetoes']:
        for veto in bill_json['vetoes']:
            url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
            bill.add_version_link(note=veto['vetoLinkText'],
                                  url=url,
                                  media_type="application/pdf"  # optional but useful!
                                  )

    for amendment in bill_json['amendments']:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
            session, amendment['amendmentNumber'])

        if amendment['sponsor'] and amendment['status']:
            title = 'Amendment {} ({}) - {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
                amendment['sponsor'],
                amendment['status'],
            )
        else:
            title = 'Amendment {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
            )

        # add versions of the bill text
        version = bill.add_version_link(
            note=title,
            url=url,
            media_type="application/pdf",
        )
        version['extras'] = {
            'amendmentNumber': amendment['amendmentNumber'],
            'sponsor': amendment['sponsor'],
        }

    for sponsor in bill_json['sponsors']:
        status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
        sponsor_type = 'person' if sponsor['sponsorTitle'] else 'organization'

        bill.add_sponsorship(
            name=sponsor['name'],
            classification=status,
            entity_type=sponsor_type,
            primary=sponsor['primarySponsor']
        )

    if bill_json['summary']:
        bill.add_abstract(
            note="summary",
            abstract=bill_json['summary'],
        )

    if bill_json['enrolledNumber']:
        bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

    if bill_json['chapter']:
        bill.extras['chapter'] = bill_json['chapter']

    if bill_json['effectiveDate']:
        eff = datetime.datetime.strptime(
            bill_json['effectiveDate'], '%m/%d/%Y')
        bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

    bill.extras['wy_bill_id'] = bill_json['id']

    for vote_json in bill_json['rollCalls']:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
def _scrape_bill(self, session, bill_data):
    """Convert one NY Open Legislation API payload into an OCD Bill.

    Yields Senate VoteEvents (upper chamber) or Assembly page artifacts
    (lower chamber) before yielding the Bill itself.
    """
    details = self._parse_bill_details(bill_data)

    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (prefix, number, active_version)) = details

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=bill_chamber,
        title=title or bill_data['summary'],
        classification=bill_type,
    )

    if bill_data['summary']:
        bill.add_abstract(bill_data['summary'], note='')

    bill_active_version = bill_data['amendments']['items'][active_version]

    # Parse sponsors.
    if bill_data['sponsor'] is not None:
        if bill_data['sponsor']['rules'] is True:
            bill.add_sponsorship(
                'Rules Committee',
                entity_type='organization',
                classification='primary',
                primary=True,
            )
        elif not bill_data['sponsor']['budget']:
            primary_sponsor = bill_data['sponsor']['member']
            bill.add_sponsorship(
                primary_sponsor['shortName'],
                entity_type='person',
                classification='primary',
                primary=True,
            )

            # There *shouldn't* be cosponsors if there is no sponsor.
            cosponsors = bill_active_version['coSponsors']['items']
            for cosponsor in cosponsors:
                bill.add_sponsorship(
                    cosponsor['shortName'],
                    entity_type='person',
                    classification='cosponsor',
                    primary=False,
                )

    # List companion bill.
    same_as = bill_active_version.get('sameAs', {})
    # BUG FIX: 'sameAs' may be missing (hence the .get default above), so
    # 'items' must be looked up defensively too — same_as['items'] raised
    # KeyError whenever the {} default was used.
    if same_as.get('items'):
        # Get companion bill ID.
        companion_bill_id = same_as['items'][0]['basePrintNo']

        # Build companion bill session.
        start_year = same_as['items'][0]['session']
        end_year = start_year + 1
        companion_bill_session = '-'.join([str(start_year), str(end_year)])

        # Attach companion bill data.
        bill.add_related_bill(
            companion_bill_id,
            companion_bill_session,
            relation_type='companion',
        )

    # Parse actions.
    chamber_map = {
        'senate': 'upper',
        'assembly': 'lower',
    }

    for action in bill_data['actions']['items']:
        chamber = chamber_map[action['chamber'].lower()]
        action_datetime = datetime.datetime.strptime(
            action['date'], '%Y-%m-%d')
        action_date = action_datetime.date()

        types, _ = NYBillScraper.categorizer.categorize(action['text'])

        bill.add_action(
            action['text'],
            action_date.strftime('%Y-%m-%d'),
            chamber=chamber,
            classification=types,
        )

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    api_url = self.api_client.root + self.api_client.resources[
        'bill'].format(
        session_year=session,
        bill_id=bill_id,
        summary='',
        detail='')
    bill.add_source(api_url)
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    # Chamber-specific processing.
    if bill_chamber == 'upper':
        # Collect votes.
        for vote_data in bill_data['votes']['items']:
            yield self._parse_senate_votes(vote_data, bill, api_url)
    elif bill_chamber == 'lower':
        assembly = AssemblyBillPage(self, session, bill, details)
        yield from assembly.build()

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data['amendments']['items']
    for key, amendment in amendments.items():
        version = amendment['printNo']

        html_version = version + ' HTML'
        html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
            '{}&term={}&Text=Y'.format(bill_id, self.term_start_year)
        bill.add_version_link(
            html_version,
            html_url,
            on_duplicate='ignore',
            media_type='text/html',
        )

        pdf_version = version + ' PDF'
        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
            .format(self.term_start_year, bill_id)
        bill.add_version_link(
            pdf_version,
            pdf_url,
            on_duplicate='ignore',
            media_type='application/pdf',
        )

    yield bill
def _parse_senate_billpage(self, bill_url, year):
    """Build and yield a Bill from a single Senate bill detail page."""
    doc = self.lxmlize(bill_url)

    def labelled_text(element_id):
        # Each labelled field is a single element with a well-known id.
        return doc.xpath('//*[@id="%s"]' % element_id)[0].text_content()

    # TODO probably still needs to be fixed
    bill_id = labelled_text("lblBillNum")
    bill_title = labelled_text("lblBillTitle")
    bill_desc = labelled_text("lblBriefDesc")
    # bill_lr = labelled_text("lblLRNum")

    # Bill type is keyed on the three-character prefix ("SB ", "SJR", ...).
    prefix = bill_id[:3]
    bill_type = bill_types[prefix] if prefix in bill_types else "bill"

    normalized_id = bill_id.replace(" ", "")
    subjects = []
    if normalized_id in self._subjects:
        subjects = self._subjects[normalized_id]
        self.info("With subjects for this bill")
        self.info(normalized_id)

    # Sentinel ids mark junk rows that must not produce a Bill.
    if normalized_id == 'XXXXXX':
        self.info("Skipping Junk Bill")
        return

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subjects
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)
    if bill_title:
        bill.add_title(bill_title)

    # Primary sponsor.
    sponsor_name = doc.xpath('//a[@id="hlSponsor"]')[0].text_content()
    bill.add_sponsorship(
        sponsor_name,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # Cosponsors, when present, are on a separate page.
    cosponsors = doc.xpath('//a[@id="hlCoSponsors"]')
    if len(cosponsors) > 0 and cosponsors[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsors[0].attrib['href'])

    # Actions.
    actions = doc.xpath('//a[@id="hlAllActions"]')
    if len(actions) > 0:
        self._parse_senate_actions(bill, actions[0].attrib['href'])

    # Full-text versions are stored on a separate page.
    versions = doc.xpath('//a[@id="hlFullBillText"]')
    if len(versions) > 0 and versions[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, versions[0].attrib['href'])

    # Only amendments whose link text says "adopted" become versions.
    for amendment in doc.xpath('//a[contains(@href,"ShowAmendment.asp")]'):
        description = amendment.xpath('string(.)').strip()
        if 'adopted' not in description.lower():
            continue
        bill.add_version_link(description,
                              amendment.xpath('@href')[0],
                              media_type='application/pdf',
                              on_duplicate='ignore')

    yield bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape every CA bill of one measure type from the local DB mirror.

    For each bill: builds the Bill with versions/titles/sponsors/actions,
    yields its VoteEvents, then the Bill itself.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                # Re-insert the space that the XML flattening drops after ')'.
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(version_name, version_url_pdf,
                                    media_type='application/pdf',
                                    date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # NOTE: relies on `version` still being bound to the last entry
        # of bill.versions from the loop above.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # BUG FIX: this used to be `filter(None, committees)`, a
                # one-shot iterator in Python 3 — the len(list(...)) assert
                # below exhausted it, so both the zip() substitution loop and
                # the later kwargs['committees'] iteration saw nothing.
                committees = list(filter(None, committees))
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)

                if not act_str.endswith('.'):
                    act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            # renamed from 'action' to avoid clobbering the loop variable
            new_action = fsbill.add_action(
                act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                new_action.add_related_entity(
                    committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ', '',
                            motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                            motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            # 'name': vote_location,
            # 'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = ('http://leginfo.legislature.ca.gov/faces'
                          '/billVotesClient.xhtml?bill_id={}').format(
                fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill
        self.session.expire_all()
def _scrape_bills(self): """ Does the following 1) Scrapes bill data from unitedstates project and saves the data to path specified in UnitedStates module 2) Iterates over bill data and converts each one to an OCD-compliant bill model. 3) Yields the OCD-compliant bill model instance @return: generator for federal US bills in OCD-compliant format @rtype: generator """ # run scraper first to pull in all the bill data self._run_unitedstates_bill_scraper() # iterate over all the files and build and yield Bill objects for filename in find_files(settings.SCRAPED_DATA_DIR, '.*/data/[0-9]+/bills/[^\/]+/[^\/]+/data.json'): try: with open(filename) as json_file: json_data = json.load(json_file) # Initialize Object bill = Bill(constants.TYPE_MAP[json_data['bill_type']]['canonical'] + ' ' + json_data['number'], json_data['congress'], json_data['official_title'], chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'] ) # add source of data bill.add_source(json_data['url'], note='all') # add subjects for subject in json_data['subjects']: bill.add_subject(subject) # add summary if 'summary' in json_data and json_data['summary'] is not None: bill.add_abstract(json_data['summary']['text'], json_data['summary']['as'], json_data['summary']['date']) # add titles for item in json_data['titles']: bill.add_title(item['title'], item['type']) # add other/related Bills for b in json_data['related_bills']: if 'type' in b and b['type'] == 'bill': split = b['bill_id'].split('-') m = UnitedStatesBillScraper.BILL_SPLIT.match(split[0]) bill.add_related_bill(constants.TYPE_MAP[m.group(1)]['canonical'] + ' ' + m.group(2), legislative_session=split[1], relation_type='companion') # add sponsor bill.add_sponsorship_by_identifier(json_data['sponsor']['name'], 'person', 'person', True, scheme='thomas_id', identifier=json_data['sponsor']['thomas_id'], chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber']) # add cosponsors for cs in json_data['cosponsors']: 
bill.add_sponsorship_by_identifier(cs['name'], 'person', 'person', False, scheme='thomas_id', identifier=cs['thomas_id'], chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber']) # add introduced_at and actions bill.add_action('date of introduction', datetime_to_date(json_data['introduced_at']), chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'], related_entities=[]) # add other actions for action in json_data['actions']: bill.actions.append({'date': datetime_to_date(action['acted_at']), 'type': [action['type']], 'description': action['text'], 'actor': constants.TYPE_MAP[json_data['bill_type']]['chamber'], 'related_entities': [] }) # add bill versions for version_path in find_files(os.path.join(settings.SCRAPED_DATA_DIR, 'data', bill.legislative_session, 'bills', json_data['bill_type'], json_data['bill_type'] + json_data['number'], 'text-versions'), '/.*/*\.json'): try: with open(version_path) as version_file: version_json_data = json.load(version_file) for k, v in version_json_data['urls'].items(): bill.versions.append({'date': datetime_to_date(version_json_data['issued_on']), 'type': version_json_data['version_code'], 'name': constants.VERSION_MAP[version_json_data['version_code']], 'links': [{'mimetype': k, 'url': v}]}) except IOError: print("Unable to open or parse file with path " + version_path) continue # finally yield bill object yield bill except IOError: print("Unable to open file with path " + filename) print(traceback.format_exc()) continue except KeyError: print("Unable to parse file with path " + filename) print(traceback.format_exc()) continue except: print('Unknown error with ' + filename) print(traceback.format_exc()) continue
def test_full_bill():
    """End-to-end test: build a fully-populated scraped bill, import it, and
    verify every field round-trips into the database models."""
    create_jurisdiction()
    sp = ScrapePerson('Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                             parent_id=org._id)

    # an earlier-session bill for the related-bill resolution check below
    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899", relation_type="prior-session")
    # one sponsorship pre-linked via entity_id, one left to name resolution
    bill.add_sponsorship('Adam Smith', classification='extra sponsor', entity_type='person',
                         primary=False, entity_id=sp._id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor', entity_type='person',
                         primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.', note="official",
                      date='1969-10-20')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html', media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1', media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill (orgs and person must be imported first so the bill importer
    # can resolve from_organization, related entities, and sponsors)
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp.as_dict()])

    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'
    assert b.abstracts.get().date == '1969-10-20'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'
    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name='Adam Smith')
    for ss in sponsorships:
        if ss.primary:
            # 'Jane Smith' has no matching Person/Organization record
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def test_full_bill():
    """Variant of the full-bill import test where the sponsor Person already
    exists in the database and is linked by a faked json-id mapping rather
    than imported through PersonImporter."""
    create_jurisdiction()
    person = Person.objects.create(id='person-id', name='Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                             parent_id=org._id)

    # earlier-session bill so related-bill resolution can be asserted below
    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899", relation_type="prior-session")
    # entity_id points at the pre-created Person row
    bill.add_sponsorship('Adam Smith', classification='extra sponsor', entity_type='person',
                         primary=False, entity_id=person.id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor', entity_type='person',
                         primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.', note="official")
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html', media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1', media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.json_to_db_id['person-id'] = 'person-id'
    # Since we have to create this person behind the back of the import
    # transaction, we'll fake the json-id to db-id, since they match in this
    # case. This is *really* getting at some implementation detail, but it's
    # the cleanest way to ensure we short-circut the json id lookup.
    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'
    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    for ss in sponsorships:
        if ss.primary:
            # 'Jane Smith' is unresolved — no Person row exists for her
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def scrape_bills(self, session):
    """Scrape Oregon bills for *session* from the OLIS measures API and
    yield a Bill per measure, with sponsors, versions, amendments, and
    classified actions attached."""
    session_key = SESSION_KEYS[session]
    measures_response = self.api_client.get("measures", page=500, session=session_key)

    # map of legislator code -> display name for sponsor resolution
    legislators = index_legislators(self, session_key)

    for measure in measures_response:
        bid = "{} {}".format(measure["MeasurePrefix"], measure["MeasureNumber"])

        # first letter of the prefix ('H'/'S') determines the chamber
        chamber = self.chamber_code[bid[0]]
        bill = Bill(
            bid.replace(" ", ""),
            legislative_session=session,
            chamber=chamber,
            title=measure["RelatingTo"],
            classification=self.bill_types[measure["MeasurePrefix"][1:]],
        )
        bill.add_abstract(measure["MeasureSummary"].strip(), note="summary")

        for sponsor in measure["MeasureSponsors"]:
            legislator_code = sponsor["LegislatoreCode"]  # typo in API
            if legislator_code:
                try:
                    legislator = legislators[legislator_code]
                except KeyError:
                    # fall back to the raw code as the sponsor name
                    # NOTE(review): logger.warn is deprecated — logger.warning preferred
                    logger.warn(
                        "Legislator {} not found in session {}".format(
                            legislator_code, session))
                    legislator = legislator_code

                bill.add_sponsorship(
                    name=legislator,
                    classification={
                        "Chief": "primary",
                        "Regular": "cosponsor"
                    }[sponsor["SponsorLevel"]],
                    entity_type="person",
                    primary=True if sponsor["SponsorLevel"] == "Chief" else False,
                )

        bill.add_source(
            "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}"
            .format(session=session_key, bid=bid.replace(" ", "")))
        for document in measure["MeasureDocuments"]:
            # TODO: probably mixing documents & versions here - should revisit
            try:
                bill.add_version_link(
                    document["VersionDescription"],
                    document["DocumentUrl"],
                    media_type="application/pdf",
                )
            except ValueError:
                # add_version_link raises on duplicate URLs by default
                logger.warn("Duplicate link found for {}".format(
                    document["DocumentUrl"]))
        for agenda_item in measure["CommitteeAgendaItems"]:
            for document in agenda_item["CommitteeProposedAmendments"]:
                # only adopted amendments become part of the bill text
                if "adopted" in document["Meaning"].lower():
                    amd_name = "{} Amendment {}".format(
                        document["CommitteeCode"], document["AmendmentNumber"])
                    bill.add_version_link(
                        amd_name,
                        document["ProposedAmendmentUrl"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )
        for action in measure["MeasureHistoryActions"]:
            classifiers = self.determine_action_classifiers(
                action["ActionText"])
            when = datetime.datetime.strptime(action["ActionDate"],
                                              "%Y-%m-%dT%H:%M:%S")
            when = self.tz.localize(when)
            bill.add_action(
                action["ActionText"],
                when,
                chamber=self.chamber_code[action["Chamber"]],
                classification=classifiers,
            )

        yield bill
def scrape_bill(self, row, session):
    """Scrape one Delaware bill from an API result *row*, enriching it with
    sponsors/versions/fiscal notes scraped from the bill's HTML detail page.

    Yields VoteEvents for the bill followed by the Bill itself.
    """
    bill_id = row['LegislationDisplayCode']

    amendment = None
    substitute = None

    # Display codes with more than one space embed amendment and/or
    # substitute info, e.g. "SB 5 w/ SA 1" or "SS 1 for SB 5".
    if bill_id.count(' ') > 1:
        if ' w/ ' in bill_id:
            self.info('Found amended bill `{}`'.format(bill_id))
            bill_id, amendment = bill_id.split(' w/ ')

        # A bill can _both_ be amended and be substituted
        if ' for ' in bill_id:
            self.info("Found substitute to use instead: `{}`".format(bill_id))
            substitute, bill_id = bill_id.split(' for ')

        # neither marker matched: unrecognized multi-space format
        if amendment is None and substitute is None:
            raise ValueError('unknown bill_id format: ' + bill_id)

    bill_type = self.classify_bill(bill_id)
    chamber = 'upper' if bill_id.startswith('S') else 'lower'
    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)
    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')
    if substitute:
        bill.extras['substitute'] = substitute
    if amendment:
        bill.extras['amendment'] = amendment

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId']
    )
    bill.add_source(html_url, note='text/html')

    html = self.lxmlize(html_url)

    # additional sponsors are classed 'primary' alongside the lead sponsor
    additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                     '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        # legislator id is the tail of the LegislatorDetail URL
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])

    if row['HasAmendments'] is True:
        self.scrape_amendments(bill, row['LegislationId'])

    yield from self.scrape_votes(bill, row['LegislationId'], session)

    yield bill
def scrape(self, session=None, chamber=None):
    """Scrape Georgia legislation for *session* via the state's SOAP services,
    yielding VoteEvents as they are encountered and a Bill per instrument.

    ``chamber`` is accepted for interface compatibility but unused: the
    chamber is derived from each instrument's DocumentType prefix.
    """
    # suffix of the DocumentType prefix ('B', 'R', ...) -> OCD bill type
    bill_type_map = {
        'B': 'bill',
        'R': 'resolution',
        'JR': 'joint resolution',
        'CR': 'concurrent resolution',
    }

    # first letter of a code -> OCD chamber
    chamber_map = {
        'H': 'lower',
        'S': 'upper',
        'J': 'joint',
        'E': 'legislature',  # Effective date
    }

    # state action codes -> OCD action classifications (None = unclassified)
    action_code_map = {
        'HI': None,
        'SI': None,
        'HH': None,
        'SH': None,
        'HPF': ['introduction'],
        'HDSAS': None,
        'SPF': ['introduction'],
        'HSR': ['reading-2'],
        'SSR': ['reading-2'],
        'HFR': ['reading-1'],
        'SFR': ['reading-1'],
        'HRECM': ['withdrawal', 'referral-committee'],
        'SRECM': ['withdrawal', 'referral-committee'],
        'SW&C': ['withdrawal', 'referral-committee'],
        'HW&C': ['withdrawal', 'referral-committee'],
        'HRA': ['passage'],
        'SRA': ['passage'],
        'HPA': ['passage'],
        'HRECO': None,
        'SPA': ['passage'],
        'HTABL': None,  # 'House Tabled' - what is this?
        'SDHAS': None,
        'HCFR': ['committee-passage-favorable'],
        'SCFR': ['committee-passage-favorable'],
        'HRAR': ['referral-committee'],
        'SRAR': ['referral-committee'],
        'STR': ['reading-3'],
        'SAHAS': None,
        'SE': ['passage'],
        'SR': ['referral-committee'],
        'HTRL': ['reading-3', 'failure'],
        'HTR': ['reading-3'],
        'S3RLT': ['reading-3', 'failure'],
        'HASAS': None,
        'S3RPP': None,
        'STAB': None,
        'SRECO': None,
        'SAPPT': None,
        'HCA': None,
        'HNOM': None,
        'HTT': None,
        'STT': None,
        'SRECP': None,
        'SCRA': None,
        'SNOM': None,
        'S2R': ['reading-2'],
        'H2R': ['reading-2'],
        'SENG': ['passage'],
        'HENG': ['passage'],
        'HPOST': None,
        'HCAP': None,
        'SDSG': ['executive-signature'],
        'SSG': ['executive-receipt'],
        'Signed Gov': ['executive-signature'],
        'HDSG': ['executive-signature'],
        'HSG': ['executive-receipt'],
        'EFF': None,
        'HRP': None,
        'STH': None,
        'HTS': None,
    }

    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)
    sid = SESSION_SITE_IDS[session]

    # backoff() retries the flaky SOAP endpoints
    legislation = backoff(self.lservice.GetLegislationForSession, sid)['LegislationIndex']

    for leg in legislation:
        lid = leg['Id']
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = [x for x in instrument['StatusHistory'][0]]

        # history arrives newest-first; reversed() yields chronological order
        actions = reversed([{
            'code': x['Code'],
            'action': x['Description'],
            '_guid': x['Id'],
            'date': x['Date']
        } for x in history])

        guid = instrument['Id']

        # A little bit hacky.
        bill_prefix = instrument['DocumentType']
        bill_chamber = chamber_map[bill_prefix[0]]
        bill_type = bill_type_map[bill_prefix[1:]]

        bill_id = '%s %s' % (
            bill_prefix,
            instrument['Number'],
        )
        if instrument['Suffix']:
            bill_id += instrument['Suffix']

        title = instrument['Caption']
        description = instrument['Summary']

        # instruments without a caption are skipped entirely
        if title is None:
            continue

        bill = Bill(
            bill_id, legislative_session=session, chamber=bill_chamber,
            title=title, classification=bill_type)
        bill.add_abstract(description, note='description')
        bill.extras = {'guid': guid}

        if instrument['Votes']:
            for vote_ in instrument['Votes']:
                _, vote_ = vote_

                # fetch the full roll call for this vote id
                vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                vote = VoteEvent(
                    start_date=vote_['Date'].strftime('%Y-%m-%d'),
                    motion_text=vote_['Caption'] or 'Vote on Bill',
                    chamber={
                        'House': 'lower',
                        'Senate': 'upper'
                    }[vote_['Branch']],
                    result='pass' if vote_['Yeas'] > vote_['Nays'] else 'fail',
                    classification='passage',
                    bill=bill,
                )
                vote.set_count('yes', vote_['Yeas'])
                vote.set_count('no', vote_['Nays'])
                vote.set_count('other', vote_['Excused'] + vote_['NotVoting'])

                vote.add_source(self.vsource)

                methods = {'Yea': 'yes', 'Nay': 'no'}

                for vdetail in vote_['Votes'][0]:
                    whom = vdetail['Member']
                    how = vdetail['MemberVoted']
                    vote.vote(methods.get(how, 'other'), whom['Name'])

                yield vote

        # chamber -> committee names, used to attach related entities to actions
        ccommittees = defaultdict(list)
        committees = instrument['Committees']
        if committees:
            for committee in committees[0]:
                ccommittees[{
                    'House': 'lower',
                    'Senate': 'upper',
                }[committee['Type']]].append(committee['Name'])

        for action in actions:
            action_chamber = chamber_map[action['code'][0]]

            try:
                action_types = action_code_map[action['code']]
            except KeyError:
                error_msg = 'Code {code} for action {action} not recognized.'.format(
                    code=action['code'], action=action['action'])

                self.logger.warning(error_msg)

                action_types = None

            committees = []
            if action_types and any(('committee' in x for x in action_types)):
                committees = [
                    str(x) for x in ccommittees.get(action_chamber, [])
                ]

            act = bill.add_action(action['action'],
                                  action['date'].strftime('%Y-%m-%d'),
                                  classification=action_types,
                                  chamber=action_chamber)
            for committee in committees:
                act.add_related_entity(committee, 'organization')
            act.extras = {
                'code': action['code'],
                'guid': action['_guid'],
            }

        sponsors = []
        if instrument['Authors']:
            sponsors = instrument['Authors']['Sponsorship']
        if 'Sponsors' in instrument and instrument['Sponsors']:
            sponsors += instrument['Sponsors']['Sponsorship']

        sponsors = [(x['Type'], self.get_member(x['MemberId'])) for x in sponsors]

        for typ, sponsor in sponsors:
            name = '{First} {Last}'.format(**dict(sponsor['Name']))
            bill.add_sponsorship(
                name,
                entity_type='person',
                classification='primary' if 'Author' in typ else 'secondary',
                primary='Author' in typ,
            )

        for version in instrument['Versions']['DocumentDescription']:
            name, url, doc_id, version_id = [
                version[x] for x in ['Description', 'Url', 'Id', 'Version']
            ]
            link = bill.add_version_link(name, url, media_type='application/pdf')
            link['extras'] = {
                '_internal_document_id': doc_id,
                '_version_id': version_id
            }

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(
            SOURCE_URL.format(**{
                'session': session,
                'bid': guid,
            }))

        yield bill
def scrape_bill(self, bill_num, session):
    """Scrape one Wyoming bill from the LSO JSON API.

    Fetches the bill's JSON record, attaches actions, versions, documents,
    amendments, sponsors, and extras, then yields a VoteEvent per roll call
    followed by the Bill itself.
    """
    chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}

    # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
        '{}?calendarDate='.format(
            session, bill_num)
    response = self.get(bill_json_url)
    bill_json = json.loads(response.content.decode('utf-8'))

    # Bill numbers look like 'HB0001' / 'SF0012': a leading 'H' marks a House
    # bill. (The previous check only tested the truthiness of the first
    # character — always true for a non-empty id — so every bill was
    # classified 'lower'.)
    chamber = 'lower' if bill_json['bill'][0] == 'H' else 'upper'

    bill = Bill(
        identifier=bill_json['bill'],
        legislative_session=session,
        title=bill_json['catchTitle'],
        chamber=chamber,
        classification="bill",
    )

    bill.add_title(bill_json['billTitle'])

    source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, bill_json['bill'])
    bill.add_source(source_url)

    for action_json in bill_json['billActions']:
        utc_action_date = self.parse_local_date(action_json['statusDate'])

        # map the action location onto a chamber when we recognize it
        actor = None
        if action_json['location'] and action_json[
                'location'] in chamber_map:
            actor = chamber_map[action_json['location']]

        action = bill.add_action(
            chamber=actor,
            description=action_json['statusMessage'],
            date=utc_action_date,
            classification=categorize_action(action_json['statusMessage']),
        )

        action.extras = {
            'billInformationID': action_json['billInformationID']
        }

    if bill_json['introduced']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])

        bill.add_version_link(
            note="Introduced",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['enrolledAct']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])

        bill.add_version_link(
            note="Enrolled",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['fiscalNote']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])

        bill.add_document_link(
            note="Fiscal Note",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['digest']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])

        bill.add_document_link(
            note="Bill Digest",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['vetoes']:
        for veto in bill_json['vetoes']:
            url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
            bill.add_version_link(
                note=veto['vetoLinkText'],
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

    for amendment in bill_json['amendments']:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
            session, amendment['amendmentNumber'])

        if amendment['sponsor'] and amendment['status']:
            title = 'Amendment {} ({}) - {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
                amendment['sponsor'],
                amendment['status'],
            )
        else:
            title = 'Amendment {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
            )

        # add versions of the bill text
        version = bill.add_version_link(
            note=title,
            url=url,
            media_type="application/pdf",
        )
        version['extras'] = {
            'amendmentNumber': amendment['amendmentNumber'],
            'sponsor': amendment['sponsor'],
        }

    for sponsor in bill_json['sponsors']:
        status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
        # committee sponsors carry no title in the API payload
        sponsor_type = 'person' if sponsor[
            'sponsorTitle'] else 'organization'

        bill.add_sponsorship(name=sponsor['name'],
                             classification=status,
                             entity_type=sponsor_type,
                             primary=sponsor['primarySponsor'])

    if bill_json['summary']:
        bill.add_abstract(
            note="summary",
            abstract=bill_json['summary'],
        )

    if bill_json['enrolledNumber']:
        bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

    if bill_json['chapter']:
        bill.extras['chapter'] = bill_json['chapter']

    if bill_json['effectiveDate']:
        eff = datetime.datetime.strptime(bill_json['effectiveDate'],
                                         '%m/%d/%Y')
        bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

    bill.extras['wy_bill_id'] = bill_json['id']

    for vote_json in bill_json['rollCalls']:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
def _scrape_bill(self, session, bill_data):
    """Convert one NY Open Legislation API bill record into a Bill, yielding
    Senate VoteEvents (upper chamber only) before the Bill itself."""
    details = self._parse_bill_details(bill_data)

    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (prefix, number, active_version)) = details

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=bill_chamber,
        title=title or bill_data['summary'],
        classification=bill_type,
    )

    if bill_data['summary']:
        bill.add_abstract(bill_data['summary'], note='')

    # sponsor/companion info is keyed off the currently active amendment
    bill_active_version = bill_data['amendments']['items'][active_version]

    # Parse sponsors.
    if bill_data['sponsor'] is not None:
        if bill_data['sponsor']['rules'] is True:
            bill.add_sponsorship(
                'Rules Committee',
                entity_type='organization',
                classification='primary',
                primary=True,
            )
        elif not bill_data['sponsor']['budget']:
            primary_sponsor = bill_data['sponsor']['member']
            bill.add_sponsorship(
                primary_sponsor['shortName'],
                entity_type='person',
                classification='primary',
                primary=True,
            )

            # There *shouldn't* be cosponsors if there is no sponsor.
            cosponsors = bill_active_version['coSponsors']['items']
            for cosponsor in cosponsors:
                bill.add_sponsorship(
                    cosponsor['shortName'],
                    entity_type='person',
                    classification='cosponsor',
                    primary=False,
                )

    # List companion bill.
    same_as = bill_active_version.get('sameAs', {})
    # Check whether "sameAs" property is populated with at least one bill.
    if same_as['items']:
        # Get companion bill ID.
        companion_bill_id = same_as['items'][0]['basePrintNo']

        # Build companion bill session.
        start_year = same_as['items'][0]['session']
        end_year = start_year + 1
        companion_bill_session = '-'.join([str(start_year), str(end_year)])

        # Attach companion bill data.
        bill.add_related_bill(
            companion_bill_id,
            companion_bill_session,
            relation_type='companion',
        )

    # Parse actions.
    chamber_map = {
        'senate': 'upper',
        'assembly': 'lower',
    }

    for action in bill_data['actions']['items']:
        chamber = chamber_map[action['chamber'].lower()]
        action_datetime = datetime.datetime.strptime(action['date'], '%Y-%m-%d')
        action_date = action_datetime.date()
        types, _ = NYBillScraper.categorizer.categorize(action['text'])

        bill.add_action(
            action['text'],
            action_date.strftime('%Y-%m-%d'),
            chamber=chamber,
            classification=types,
        )

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    api_url = self.api_client.root + self.api_client.resources['bill'].format(
        session_year=session,
        bill_id=bill_id,
        summary='',
        detail='')
    bill.add_source(api_url)
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    # Chamber-specific processing.
    if bill_chamber == 'upper':
        # Collect votes.
        for vote_data in bill_data['votes']['items']:
            yield self._parse_senate_votes(vote_data, bill, api_url)
    elif bill_chamber == 'lower':
        assembly = AssemblyBillPage(self, session, bill, details)
        assembly.build()

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data['amendments']['items']
    for key, amendment in amendments.items():
        version = amendment['printNo']

        html_version = version + ' HTML'
        html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
            '{}&term={}'.format(bill_id, self.term_start_year)
        bill.add_version_link(
            html_version,
            html_url,
            on_duplicate='ignore',
            media_type='text/html',
        )

        pdf_version = version + ' PDF'
        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
            .format(self.term_start_year, bill_id)
        bill.add_version_link(
            pdf_version,
            pdf_url,
            on_duplicate='ignore',
            media_type='application/pdf',
        )

    yield bill
def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
    """Scrape one Illinois bill status page into a Bill.

    Yields the Bill first, then delegates to scrape_votes for its
    VoteEvents. Returns early (yielding nothing) on a 500 from the site.
    """
    try:
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
    except scrapelib.HTTPError as e:
        # the site 500s on some bill pages; anything else is unexpected
        assert '500' in e.args[0], "Unexpected error when accessing page: {}".format(e)
        self.warning("500 error for bill page; skipping bill")
        return

    # bill id, title, summary
    bill_num = re.findall(r'DocNum=(\d+)', url)[0]
    bill_type = bill_type or DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num

    title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/'
                      'text()')[0].strip()
    summary = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/'
                        'text()')[0].strip()

    bill = Bill(identifier=bill_id,
                legislative_session=session,
                title=title,
                classification=bill_type,
                chamber=chamber)

    bill.add_abstract(summary, note='')
    bill.add_source(url)

    # sponsors
    sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
    # don't add just yet; we can make them better using action data

    # source-url -> committee name, passed along to the vote scraper
    committee_actors = {}

    # actions
    action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
    for date, actor, action_elem in group(action_tds, 3):
        date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y")
        date = self.localize(date).date()

        actor = actor.text_content()
        if actor == 'House':
            actor_id = {'classification': 'lower'}
        elif actor == 'Senate':
            actor_id = {'classification': 'upper'}
        # NOTE(review): an actor other than House/Senate would leave actor_id
        # unbound (or stale from a prior iteration) — presumably never occurs
        # on this page; verify against the source markup

        action = action_elem.text_content()
        classification, related_orgs = _categorize_action(action)

        # committee actions get a committee actor resolved from the action's link
        if (related_orgs and any(c.startswith('committee')
                                 for c in classification)):
            (name, source), = [(a.text, a.get('href')) for a in action_elem.xpath('a')
                               if 'committee' in a.get('href')]
            source = canonicalize_url(source)
            actor_id = {'sources__url': source, 'classification': 'committee'}
            committee_actors[source] = name

        bill.add_action(action, date,
                        organization=actor_id,
                        classification=classification,
                        related_entities=related_orgs)

        if action.lower().find('sponsor') != -1:
            self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

    # now add sponsors
    for spontype, sponsor, chamber, official_type in sponsor_list:
        primary = official_type == 'primary'
        if chamber:
            bill.add_sponsorship(sponsor, spontype, 'person',
                                 primary=primary, chamber=chamber)
        else:
            # BUG FIX: name and classification were swapped in this branch
            # (the original passed spontype as the sponsor name), making it
            # inconsistent with the chamber branch above.
            bill.add_sponsorship(sponsor, spontype, 'person',
                                 primary=primary)

    # versions
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)
    yield bill

    votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
    yield from self.scrape_votes(session, bill, votes_url, committee_actors)
def scrape_bill(self, bill_num, session):
    """Scrape a Wyoming bill from the LSO JSON API, yielding votes then the Bill.

    :param bill_num: bill number, e.g. 'HB0001'
    :param session: legislative session identifier
    """
    chamber_map = {"House": "lower", "Senate": "upper", "LSO": "executive"}
    # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = ("http://wyoleg.gov/LsoService/api/BillInformation/{}/"
                     "{}?calendarDate=".format(session, bill_num))
    response = self.get(bill_json_url)
    bill_json = json.loads(response.content.decode("utf-8"))

    # BUGFIX: the identifier starts with 'H' or 'S'; the old test
    # (`if bill_json["bill"][0]`) was always truthy, so every bill was
    # classified as 'lower'.
    chamber = "lower" if bill_json["bill"][0] == "H" else "upper"

    bill = Bill(
        identifier=bill_json["bill"],
        legislative_session=session,
        title=bill_json["catchTitle"],
        chamber=chamber,
        classification="bill",
    )

    bill.add_title(bill_json["billTitle"])

    source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format(
        session, bill_json["bill"])
    bill.add_source(source_url)

    for action_json in bill_json["billActions"]:
        utc_action_date = self.parse_local_date(action_json["statusDate"])

        actor = None
        if action_json["location"] and action_json[
                "location"] in chamber_map:
            actor = chamber_map[action_json["location"]]

        action = bill.add_action(
            chamber=actor,
            description=action_json["statusMessage"],
            date=utc_action_date,
            classification=categorize_action(action_json["statusMessage"]),
        )
        action.extras = {
            "billInformationID": action_json["billInformationID"]
        }

    if bill_json["introduced"]:
        url = "http://wyoleg.gov/{}".format(bill_json["introduced"])
        bill.add_version_link(
            note="Introduced",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )

    if bill_json["enrolledAct"]:
        url = "http://wyoleg.gov/{}".format(bill_json["enrolledAct"])
        bill.add_version_link(
            note="Enrolled",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )

    if bill_json["fiscalNote"]:
        url = "http://wyoleg.gov/{}".format(bill_json["fiscalNote"])
        bill.add_document_link(
            note="Fiscal Note",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )

    if bill_json["digest"]:
        url = "http://wyoleg.gov/{}".format(bill_json["digest"])
        bill.add_document_link(
            note="Bill Digest",
            url=url,
            media_type="application/pdf",  # optional but useful!
        )

    if bill_json["vetoes"]:
        for veto in bill_json["vetoes"]:
            url = "http://wyoleg.gov/{}".format(veto["vetoLinkPath"])
            bill.add_version_link(
                note=veto["vetoLinkText"],
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

    for amendment in bill_json["amendments"]:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        url = "http://wyoleg.gov/{}/Amends/{}.pdf".format(
            session, amendment["amendmentNumber"])

        if amendment["sponsor"] and amendment["status"]:
            title = "Amendment {} ({}) - {} ({})".format(
                amendment["amendmentNumber"],
                amendment["order"],
                amendment["sponsor"],
                amendment["status"],
            )
        else:
            title = "Amendment {} ({})".format(
                amendment["amendmentNumber"], amendment["order"])

        # add versions of the bill text
        version = bill.add_version_link(note=title, url=url,
                                        media_type="application/pdf")
        version["extras"] = {
            "amendmentNumber": amendment["amendmentNumber"],
            "sponsor": amendment["sponsor"],
        }

    for sponsor in bill_json["sponsors"]:
        status = "primary" if sponsor["primarySponsor"] else "cosponsor"
        # a sponsorTitle is only present for individual legislators
        sponsor_type = "person" if sponsor[
            "sponsorTitle"] else "organization"
        bill.add_sponsorship(
            name=sponsor["name"],
            classification=status,
            entity_type=sponsor_type,
            primary=sponsor["primarySponsor"],
        )

    if bill_json["summary"]:
        bill.add_abstract(note="summary", abstract=bill_json["summary"])

    if bill_json["enrolledNumber"]:
        bill.extras["wy_enrolled_number"] = bill_json["enrolledNumber"]

    if bill_json["chapter"]:
        bill.extras["chapter"] = bill_json["chapter"]

    if bill_json["effectiveDate"]:
        eff = datetime.datetime.strptime(bill_json["effectiveDate"],
                                         "%m/%d/%Y")
        bill.extras["effective_date"] = eff.strftime("%Y-%m-%d")

    bill.extras["wy_bill_id"] = bill_json["id"]

    for vote_json in bill_json["rollCalls"]:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
def get_bill(self, matter):
    '''Make Bill object from given matter.

    Returns None (and possibly records a version error) whenever the
    matter should be skipped entirely.

    Currently, NYC Legistar does not have conventional "Types" for three
    newly added committees:
    https://legistar.council.nyc.gov/Departments.aspx
    We communicated the issue to NYC, and until we learn more, we will
    skip the bills attached to those committees.
    '''
    orgs_without_type = [
        'Charter Revision Commission 2019',
        'New York City Advisory Commission on Property Tax Reform',
        'Democratic Conference of the Council of the City of New York'
    ]
    if matter['MatterBodyName'].strip() in orgs_without_type:
        return None

    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']

    # a bill without an intro date, name, or file number is unusable
    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]
    bill_session = self.sessions(self.toTime(date))

    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])

    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            # this attachment marks the whole matter as a duplicate,
            # so the entire bill is dropped, not just the attachment
            return None
        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint(
                '/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            return None
        else:
            date = related_bill['MatterIntroDate']
            # BUGFIX: was self.session(...); the session lookup helper is
            # self.sessions(...), as used for bill_session above.
            related_bill_session = self.sessions(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        # strip NUL characters, which the data store rejects
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                u'\u0000', '')
        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                u'\u0000', '')

    return bill
def scrape_bill(self, bill_url, bill_id, session_id):
    """Build and return a Bill scraped from a single bill detail page."""
    doc = self.lxmlize(bill_url)

    # The page's <em> element holds the bill title.
    bill = Bill(identifier=bill_id,
                legislative_session=session_id,
                title=doc.xpath("//em/text()")[0])
    bill.add_source(bill_url, note="detail")

    # Abstract: the paragraph directly above <h2>Legislative History</h2>.
    try:
        history_heading = doc.xpath("//h2[text()='Legislative History']")[0]
        summary_text = history_heading.xpath("preceding-sibling::p/text()")[0]
        bill.add_abstract(abstract=summary_text.strip(), note="summary")
    except IndexError:
        print("No abstract for bill {} in session {}".format(
            bill_id, session_id))

    # All remaining fields live inside this vertical data table.
    data_table = doc.xpath("//table[@class='data vertical_table']")[0]

    # Primary sponsor.
    primary_sponsor = data_table.xpath(
        self.bill_table_query("Sponsor") + "/text()")[0]
    bill.add_sponsorship(name=primary_sponsor,
                         classification="Primary",
                         entity_type="person",
                         primary=True)

    # Actions: each table line may expand into several (date, action) pairs.
    for action_line in data_table.xpath(
            self.bill_table_query("Actions") + "/text()"):
        try:
            for date_str, action_type in self.parse_actions(action_line):
                bill.add_action(date=date_str,
                                description=action_type,
                                classification=action_type)
        except ValueError:
            print("failed to parse these actions: {}".format([action_line]))

    # Co-sponsors, with blank entries discarded.
    raw_cosponsors = data_table.xpath(
        self.bill_table_query("Co-Sponsors") + "/text()")
    for cosponsor in (name.strip() for name in raw_cosponsors
                      if name.strip()):
        bill.add_sponsorship(name=cosponsor,
                             classification="co-sponsor",
                             entity_type="person",
                             primary=False)

    # Committees are recorded as organization sponsorships in OCD.
    for committee_name in data_table.xpath(
            self.bill_table_query("Committee") + "/a/text()"):
        bill.add_sponsorship(
            name=committee_name,
            classification="secondary",  # classification ?
            entity_type="organization",
            primary=False)

    return bill
def scrape_matter(self, matter_link, sess):
    """Scrape one matter page and yield it as a Bill.

    :param matter_link: URL of the matter detail page
    :param sess: session dict; sess['identifier'] is the legislative session
    """
    # map the site's "File Type" values to OCD bill classifications
    matter_types = {
        "Additions": "other",
        "Administrative Order": "order",
        "Annual Evaluation": "other",
        "Bid Advertisement": "other",
        "Bid Awards": "other",
        "Bid Contract": "contract",
        "Bid Protest": "other",
        "Bid Rejection": "other",
        "Birthday Scroll": "commemoration",
        "Certificate of Appreciation": "commemoration",
        "Change Order": "order",
        "Citizen's Presentation": "other",
        "Commendation": "commemoration",
        "Conflict Waiver": "other",
        "Congratulatory Certificate": "commemoration",
        "Deferrals": "other",
        "Discussion Item": "other",
        "Distinguished Visitor": "other",
        "Joint Meeting/Workshop": "other",
        "Mayoral Veto": "other",
        "Miscellaneous": "other",
        "Nomination": "nomination",
        "Oath of Office": "other",
        "Omnibus Reserve": "bill",
        "Ordinance": "ordinance",
        "Plaque": "commemoration",
        "Presentation": "other",
        "Proclamation": "proclamation",
        "Professional Service Agreement": "contract",
        "Public Hearing": "other",
        "Report": "other",
        "Request for Proposals": "other",
        "Request for Qualifications": "other",
        "Request to Advertise": "other",
        "Resolution": "resolution",
        "Resolution of Sympathy": "resolution",
        "Service Awards": "commemoration",
        "Special Item": "other",
        "Special Presentation": "other",
        "Supplement": "other",
        "Swearing-In": "other",
        "Time Sensitive Items": "other",
        "Withdrawals": "other",
        "Workshop Item": "other",
        "Zoning": "other",
        "Zoning Resolution": "resolution"
    }
    matter_doc = self.lxmlize(matter_link)
    info_dict = self.matter_table_to_dict(matter_doc)

    # we're going to use the year of the intro date as the session
    # until/unless we come up with something better; parsing it here
    # also validates the date format even though `session` comes from
    # the caller-supplied sess dict
    intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
    session = sess["identifier"]
    category = matter_types[info_dict["File Type"]]

    if 'File Name' in info_dict:
        title = info_dict["File Name"]
    elif "Title" in info_dict and info_dict["Title"].strip():
        title = info_dict["Title"].strip()
    else:
        self.warning("bill has no title")
        return

    if category == 'other':
        # Bill rejects classification='other', so omit it entirely
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title
                    )
    else:
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title,
                    classification=category
                    )

    for spons in info_dict["Sponsors"]:
        if spons == "NONE":
            continue
        try:
            name, spons_type = spons.rsplit(",", 1)
        except ValueError:
            name = spons
            spons_type = "Sponsor"
        primary = True if "Prime Sponsor" in spons_type else False
        entity = "person"
        if "committee" in name:
            # BUGFIX: was `entity = committee` (a bare, undefined name,
            # raising NameError); committees are organizations.
            entity = "organization"
        bill.add_sponsorship(name, spons_type, entity, primary)

    if "Indexes" in info_dict:
        for subj in info_dict["Indexes"]:
            if subj.strip() and subj.strip() != "NONE":
                bill.add_subject(subj.strip())

    if "Title" in info_dict and info_dict["Title"].strip():
        note = "bill's long title'"
        if ("Note" in info_dict and info_dict["Note"].strip()):
            note = info_dict["Note"]
        bill.add_abstract(abstract=info_dict["Title"], note=note)

    self.process_action_table(matter_doc, bill)
    bill.add_source(matter_link, note='web')

    yield bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all CA bills of one measure type for a session.

    Reads bills from the CA SQL mirror (self.session is an SQLAlchemy
    session), yielding a VoteEvent for each roll call followed by the
    Bill itself.

    Note: committee_abbr_regex's default is evaluated once at import
    time; the compiled regex is never mutated, so that is safe.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)

        # Senate measures must not be attributed to the lower chamber
        # and vice versa.
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()
        summary = ''  # defensive default; only populated when versions exist

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                # re-insert the space dropped after a closing paren
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name, version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]
            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)

        # sponsors come from the most recent version seen above
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # BUGFIX: this was `filter(None, committees)`; in Python 3
                # the filter iterator was exhausted by the len() assert
                # below, leaving the zip() and kwargs['committees']
                # consumers empty. A list can be iterated repeatedly.
                committees = [c for c in committees if c]
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            # re-categorize with the committee-substituted action string
            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(
                    committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            # Strip session / chamber / bill-number noise from the motion.
            motion = motion.strip()
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = (
                'http://leginfo.legislature.ca.gov/faces'
                '/billVotesClient.xhtml?bill_id={}'
            ).format(fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill

    self.session.expire_all()
def scrape(self):
    """Scrape NYC Council legislation created after 2014-01-01.

    Yields a VoteEvent for each action that carries votes, then the
    Bill itself. The bill's legislative_session is assigned late, from
    the earliest action date (or SESSION_STARTS[0] when the bill has no
    history yet).
    """
    for leg_summary in self.legislation(
            created_after=datetime.datetime(2014, 1, 1)):
        leg_type = BILL_TYPES[leg_summary['Type']]

        # legislative_session is filled in below once history is known
        bill = Bill(identifier=leg_summary['File\xa0#'],
                    title=leg_summary['Title'],
                    legislative_session=None,
                    classification=leg_type,
                    from_organization={"name": "New York City Council"})
        bill.add_source(leg_summary['url'])

        leg_details = self.legDetails(leg_summary['url'])
        history = self.history(leg_summary['url'])

        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')

        if 'Summary' in leg_details:
            bill.add_abstract(leg_details['Summary'], note='')

        if leg_details['Law number']:
            bill.add_identifier(leg_details['Law number'],
                                note='law number')

        for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
            sponsor, sponsorship_type, primary = sponsorship
            bill.add_sponsorship(sponsor, sponsorship_type,
                                 'person', primary,
                                 entity_id=_make_pseudo_id(name=sponsor))

        for attachment in leg_details.get('Attachments', []):
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")

        # materialize so the history can be iterated twice (min + loop)
        history = list(history)

        if history:
            earliest_action = min(self.toTime(action['Date'])
                                  for action in history)
            bill.legislative_session = self.sessions(earliest_action)
        else:
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for action in history:
            action_description = action['Action']
            if not action_description:
                continue

            action_class = ACTION_CLASSIFICATION[action_description]

            action_date = self.toDate(action['Date'])
            responsible_org = action['Action\xa0By']
            # normalize actor names to OCD organization names
            if responsible_org == 'City Council':
                responsible_org = 'New York City Council'
            elif responsible_org == 'Administration':
                responsible_org = 'Mayor'

            if responsible_org == 'Town Hall Meeting':
                continue
            else:
                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=action_class)

            if 'url' in action['Action\xa0Details']:
                action_detail_url = action['Action\xa0Details']['url']
                # referrals name the target committee in the detail text
                if action_class == 'committee-referral':
                    action_details = self.actionDetails(action_detail_url)
                    referred_committee = action_details[
                        'Action text'].rsplit(' to the ', 1)[-1]
                    act.add_related_entity(
                        referred_committee,
                        'organization',
                        entity_id=_make_pseudo_id(name=referred_committee))

                result, votes = self.extractVotes(action_detail_url)
                if votes:
                    action_vote = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action_description,
                        organization={'name': responsible_org},
                        classification=action_class,
                        start_date=action_date,
                        result=result,
                        bill=bill)
                    action_vote.add_source(action_detail_url)

                    for option, voter in votes:
                        action_vote.vote(option, voter)

                    yield action_vote

        text = self.text(leg_summary['url'])

        if text:
            bill.extras = {
                'local_classification': leg_summary['Type'],
                'full_text': text
            }
        else:
            bill.extras = {'local_classification': leg_summary['Type']}

        yield bill
def get_bill(self, matter):
    '''Make Bill object from given matter.'''

    # NOTE(review): this second string literal is a statement, not part
    # of the docstring, but is kept verbatim.
    '''
    Currently, NYC Legistar does not have conventional "Types" for three newly
    added committees: https://legistar.council.nyc.gov/Departments.aspx
    We communicated the issue to NYC, and until we learn more, we will skip
    the bills attached to those committees.
    '''
    orgs_without_type = ['Charter Revision Commission 2019',
                         'New York City Advisory Commission on Property Tax Reform',
                         'Democratic Conference of the Council of the City of New York']
    if matter['MatterBodyName'].strip() in orgs_without_type:
        return None

    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']

    # a bill without an intro date, name, or file number is unusable
    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]

    bill_session = self.sessions(self.toTime(date))

    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])

    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        # sponsorship data not yet consistent with this matter version
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        # NOTE(review): returning None here drops the ENTIRE bill when
        # this attachment id is seen, not just the attachment — confirm
        # that is the intent of the "Duplicate" marker.
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None
        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint(
                '/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            return None
        else:
            date = related_bill['MatterIntroDate']
            # NOTE(review): `self.session(...)` — sibling code above uses
            # `self.sessions(...)` for the same date→session lookup;
            # suspected typo, confirm before relying on this path.
            related_bill_session = self.session(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        # strip NUL characters before storing the bill text
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                u'\u0000', '')
        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                u'\u0000', '')

    return bill