def scrape2003(self, url, year, chamberName, session, number): "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm" with self.lxml_context(url) as page: # Grab the interesting tables on the page. tables = page.cssselect('center table') # Bill name = tables[0].text_content().split('-', 1)[1] bill = Bill(session, chamberName, number, name) # Sponsorships for a in tables[1].cssselect('a'): bill.add_sponsor('', a.text_content().strip()) # Actions center = page.cssselect('center table center')[0] for row in center.cssselect('table')[-2].cssselect('tr')[2:]: date = row[0].text_content().strip() action_text = row[1].text_content().strip() if '/' not in date: continue if action_text.startswith('Senate'): bill.add_action('upper', action_text, date) elif action_text.startswith('House'): bill.add_action('lower', action_text, date) # Versions for a in center.cssselect('table')[-1].cssselect('a'): bill.add_version(a.text_content(), urlparse.urljoin(url, a.get('href'))) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type): url = '%s?r=%s' % (self.base_url, bill_id) with self.urlopen(url) as html: doc = lxml.html.fromstring(html) # search for Titulo, accent over i messes up lxml, so use 'tulo' title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()') if not title: raise NoSuchBill() bill = Bill(session, chamber, bill_id, title[0], type=bill_type) author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0] bill.add_sponsor('primary', author.strip()) action_table = doc.xpath('//table')[-1] for row in action_table[1:]: tds = row.xpath('td') # ignore row missing date if len(tds) != 2: continue date = datetime.datetime.strptime(tds[0].text_content(), "%m/%d/%Y") action = tds[1].text_content() bill.add_action(chamber, action, date) # also has an associated version if tds[1].xpath('a'): bill.add_version(action, tds[1].xpath('a/@href')[0]) bill.add_source(url) self.save_bill(bill)
def scrape1999(self, url, year, chamberName, session, number): "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm" with self.lxml_context(url) as page: # Grab the interesting tables on the page. tables = page.cssselect('table') # Bill name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1] bill = Bill(session, chamberName, number, name) # Versions bill.add_version('Current', url.replace('/sum/', '/fulltext/')) # Sponsorships for a in tables[2].cssselect('a'): bill.add_sponsor('', a.text_content().strip()) # Actions for row in tables[-1].cssselect('tr'): senate_date = row[0].text_content().strip() action_text = row[1].text_content().strip() house_date = row[2].text_content().strip() if '/' not in senate_date and '/' not in house_date: continue if senate_date: bill.add_action('upper', action_text, senate_date) if house_date: bill.add_action('lower', action_text, house_date) self.save_bill(bill)
def scrape1999(self, url, year, chamberName, session, number): "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm" with self.lxml_context(url) as page: # Grab the interesting tables on the page. tables = page.cssselect("table") # Bill name = tables[1].cssselect("a")[0].text_content().split("-", 1)[1] bill = Bill(session, chamberName, number, name) # Versions bill.add_version("Current", url.replace("/sum/", "/fulltext/")) # Sponsorships for a in tables[2].cssselect("a"): bill.add_sponsor("", a.text_content().strip()) # Actions for row in tables[-1].cssselect("tr"): senate_date = row[0].text_content().strip() action_text = row[1].text_content().strip() house_date = row[2].text_content().strip() if "/" not in senate_date and "/" not in house_date: continue if senate_date: bill.add_action("upper", action_text, senate_date) if house_date: bill.add_action("lower", action_text, house_date) self.save_bill(bill)
def _parse_bill(self, session, chamber, source_url, line): if line: (type, combined_id, number, title, relating_to) = line.split("\xe4") if (type == 'HB' and chamber == 'lower') or (type == 'SB' and chamber == 'upper'): # # basic bill info bill_id = "%s %s" % (type, number.zfill(4)) bill = Bill(session, chamber, bill_id, title) bill.add_source(source_url) # # add actions if self.actionsByBill.has_key(bill_id): for a in self.actionsByBill[bill_id]: bill.add_action(a['actor'], a['action'], a['date']) if self.load_versions_sponsors: # add versions and sponsors versionsSponsors = self.versionsSponsorsParser.fetch_and_parse(self, session, bill_id) #print "versionsSponsors: %s" % str(versionsSponsors) if versionsSponsors: for ver in versionsSponsors['versions']: bill.add_version(ver['name'], ver['url']) sponsorType = 'primary' if len(versionsSponsors['sponsors']) > 1: sponsorType = 'cosponsor' for name in versionsSponsors['sponsors']: bill.add_sponsor(sponsorType, name) # save - writes out JSON self.save_bill(bill)
def scrape2009(self, url, year, chamberName, session, number): "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm" with self.lxml_context(url) as page: # Bill name = page.cssselect('#legislation h1')[0].text_content().strip() bill_id = name.split(' - ')[0].strip() bill = Bill(session, chamberName, bill_id, name) # Sponsorships for a in page.cssselect("#sponsors a"): bill.add_sponsor('', a.text_content().strip()) # Actions for row in page.cssselect('#history tr')[1:]: date = row[0].text_content().strip() action_text = row[1].text_content().strip() if '/' not in date: continue date = datetime.datetime.strptime(date, '%m/%d/%Y') if action_text.startswith('Senate'): bill.add_action('upper', action_text, date) elif action_text.startswith('House'): bill.add_action('lower', action_text, date) # Versions for a in page.cssselect('#versions a'): bill.add_version(a.text_content(), urlparse.urljoin(url, a.get('href'))) self.save_bill(bill)
def scrape_bill(self, chamber, bill): bill_id = bill['id'].replace('w/','with ') page = lxml.html.fromstring(self.urlopen(bill['url'])) page.make_links_absolute(bill['url']) title_row = page.xpath('//tr[td/b[contains(font,"Long Title")]]')[0] # text_content() == make sure any tags in the title don't cause issues title = title_row.xpath('td[@width="79%"]/font')[0].text_content() # now we can create a bill object b = Bill(bill['session'], bill['chamber'], bill_id, title) b.add_source(bill['url']) sponsors_row = page.xpath('//tr[td/b[contains(font,"Primary Sponsor")]]')[0] sponsor = sponsors_row.xpath('td[@width="31%"]/font')[0].text if sponsor != None: b.add_sponsor('primary', sponsor) # scraping these and co-sponsors, but not doing anything with them until # it's decided whether or not to attempt to split 'em up additional = sponsors_row.xpath('td[@width="48%"]/font') additional_sponsors = additional[0].text if len(additional) > 0 else "" additional_sponsors = additional_sponsors.replace('   ','') cosponsors_row = page.xpath('//tr[td/b[contains(font,"CoSponsors")]]')[0] cosponsors = cosponsors_row.xpath('td[@width="79%"]/font')[0].text cosponsors = cosponsors if cosponsors != '{ NONE...}' else '' introduced_row = page.xpath('//tr[td/b[contains(font,"Introduced On")]]') if len(introduced_row) > 0: introduced = introduced_row[0].xpath('td[@width="31%"]/font')[0].text introduced = datetime.strptime(introduced, '%b %d, %Y') b.add_action(bill['chamber'], 'introduced', introduced, 'bill:introduced') actions = page.xpath('//table[preceding-sibling::b[contains(font,"Actions History:")]]/tr/td[@width="79%"]/font') if len(actions) > 0: actions = actions[0].text_content().split('\n') for act in actions: act = act.partition(' - ') date = datetime.strptime(act[0], '%b %d, %Y') b.add_action(bill['chamber'], act[2], date) # resources = page.xpath('//tr[td/b[contains(font, "Full text of Legislation")]]') # save vote urls for scraping later vote_urls = [] voting_reports = page.xpath('//tr[td/b[contains(font, "Voting Reports")]]') if(len(voting_reports) > 0): for report in voting_reports[0].xpath('td/font/a'): vote_urls.append(report.attrib['href']) # Scrape votes for url in vote_urls: vote = self.scrape_votes(chamber, title, bill_id, url) b.add_vote(vote) # Save bill self.save_bill(b)
def scrape_bill(self, chamber, session, doc_type, url, bill_type=None): try: doc = self.lxmlize(url) except scrapelib.HTTPError as e: assert '500' in e.args[0], "Unexpected error when accessing page: {}".format(e) self.warning("500 error for bill page; skipping bill") return # bill id, title, summary bill_num = re.findall('DocNum=(\d+)', url)[0] bill_type = bill_type or DOC_TYPES[doc_type[1:]] bill_id = doc_type + bill_num title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip() summary = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip() bill = Bill(session, chamber, bill_id, title, type=bill_type, summary=summary) bill.add_source(url) # sponsors sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]')) # don't add just yet; we can make them better using action data # actions action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td') for date, actor, action in group(action_tds, 3): date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y") actor = actor.text_content() if actor == 'House': actor = 'lower' elif actor == 'Senate': actor = 'upper' action = action.text_content() bill.add_action(actor, action, date, **_categorize_action(action)) if action.lower().find('sponsor') != -1: self.refine_sponsor_list(actor, action, sponsor_list, bill_id) # now add sponsors for spontype, sponsor, chamber, official_type in sponsor_list: if chamber: bill.add_sponsor(spontype, sponsor, official_type=official_type, chamber=chamber) else: bill.add_sponsor(spontype, sponsor, official_type=official_type) # versions version_url = doc.xpath('//a[text()="Full Text"]/@href')[0] self.scrape_documents(bill, version_url) # if there's more than 1 votehistory link, there are votes to grab if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1: votes_url = doc.xpath('//a[text()="Votes"]/@href')[0] self.scrape_votes(session, bill, votes_url) self.save_bill(bill)
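# The Illinois scrapers above unpack the flat list of action-table cells with
# group(action_tds, 3); a minimal sketch of such a chunking helper, assuming it only
# needs to yield fixed-size tuples from a flat sequence (the project's real helper
# may differ):
def group(iterable, n):
    # pass the same iterator to zip() n times so each tuple holds one table row of n cells
    return zip(*[iter(iterable)] * n)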
def scrape(self, session, chambers): urlified_session_id = session.replace(':', '-') url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/projets-loi-%s.html' % urlified_session_id html = self.urlopen(url) doc = lxml.html.fromstring(html) doc.make_links_absolute(url) # scrape all the actions for this session actions = self.scrape_actions(urlified_session_id) for row in doc.xpath('//table[@id="tblListeProjetLoi"]/tbody/tr'): id_td, details_td = row.xpath('td')[:2] bill_id = clean_spaces(id_td.text_content()) pdf_link = details_td.xpath('p[@class="lienAssocie"]//a')[0] bill_name = clean_spaces(pdf_link.text_content()) pdf_url = pdf_link.xpath('@href')[0] detail_url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/projet-loi-%s-%s.html' % (bill_id, urlified_session_id) bill = Bill(session, 'lower', bill_id, bill_name) bill.add_source(url) bill.add_source(detail_url) bill.add_source(pdf_url) # add actions for action in actions[bill_id]: bill.add_action('lower', action['name'], action['date']) # get sponsors self.scrape_details(bill, detail_url) self.save_bill(bill)
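# clean_spaces() above normalizes the text pulled from table cells; a minimal sketch,
# assuming it only collapses whitespace runs and trims the ends (hypothetical helper
# shown for illustration; the real implementation may also handle non-breaking spaces):
import re

def clean_spaces(text):
    # collapse any whitespace run to a single space, then strip leading/trailing space
    return re.sub(r'\s+', ' ', text).strip()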
def scrape2001(self, url, year, chamberName, session, number): "e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm" with self.lxml_context(url) as page: # Grab the interesting tables on the page. tables = page.cssselect("table center table") # Bill name = tables[0].text_content().split("-", 1)[1] bill = Bill(session, chamberName, number, name) # Sponsorships for a in tables[1].cssselect("a"): bill.add_sponsor("", a.text_content().strip()) # Actions center = page.cssselect("table center")[-1] for row in center.cssselect("table table")[0].cssselect("tr")[2:]: date = row[0].text_content().strip() action_text = row[1].text_content().strip() if "/" not in date: continue if action_text.startswith("Senate"): action_text = action_text.split(" ", 1)[1].strip() bill.add_action("upper", action_text, date) elif action_text.startswith("House"): action_text = action_text.split(" ", 1)[1].strip() bill.add_action("lower", action_text, date) # Versions for a in center.cssselect("table table")[1].cssselect("a"): bill.add_version(a.text_content(), urlparse.urljoin(url, a.get("href"))) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url): try: page = lxml.html.fromstring(self.urlopen(url)) except scrapelib.HTTPError as e: self.warning("error (%s) fetching %s, skipping" % (e, url)) return title = page.xpath("string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip() if "JR" in bill_id: bill_type = ["joint resolution"] elif "CR" in bill_id: bill_type = ["concurrent resolution"] elif "R" in bill_id: bill_type = ["resolution"] else: bill_type = ["bill"] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) bill["subjects"] = self.subject_map[bill_id] for link in page.xpath("//a[contains(@id, 'Auth')]"): name = link.xpath("string()").strip() if "otherAuth" in link.attrib["id"]: bill.add_sponsor("coauthor", name) else: bill.add_sponsor("author", name) act_table = page.xpath("//table[contains(@id, 'Actions')]")[0] for tr in act_table.xpath("tr")[2:]: action = tr.xpath("string(td[1])").strip() if not action or action == "None": continue date = tr.xpath("string(td[3])").strip() date = datetime.datetime.strptime(date, "%m/%d/%Y").date() actor = tr.xpath("string(td[4])").strip() if actor == "H": actor = "lower" elif actor == "S": actor = "upper" bill.add_action(actor, action, date, type=action_type(action)) version_table = page.xpath("//table[contains(@id, 'Versions')]")[0] for link in version_table.xpath(".//a[contains(@href, '.DOC')]"): version_url = link.attrib["href"] if "COMMITTEE REPORTS" in version_url: continue name = link.text.strip() bill.add_version(name, version_url) for link in page.xpath(".//a[contains(@href, '_VOTES')]"): self.scrape_votes(bill, urlescape(link.attrib["href"])) self.save_bill(bill)
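# urlescape() above guards the vote-history hrefs, which can contain characters such as
# spaces; a minimal sketch, assuming all it needs to do is percent-encode an already-built
# URL while leaving URL structure characters alone (an assumption; the real helper may
# behave differently):
import urllib

def urlescape(url):
    # quote unsafe characters (e.g. spaces) but keep ':/?&=#' so the URL structure survives
    return urllib.quote(url, safe=":/?&=#")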
def scrape_current(self, chamber, term): chamber_name = 'Senate' if chamber == 'upper' else 'House' chamber_letter = chamber_name[0] # perhaps we should save this data so we can make one request for both? with self.urlopen(ksapi.url + 'bill_status/') as bill_request: bill_request_json = json.loads(bill_request) bills = bill_request_json['content'] for bill_data in bills: bill_id = bill_data['BILLNO'] # filter other chambers if not bill_id.startswith(chamber_letter): continue if 'CR' in bill_id: btype = 'concurrent resolution' elif 'R' in bill_id: btype = 'resolution' elif 'B' in bill_id: btype = 'bill' # main bill = Bill(term, chamber, bill_id, bill_data['SHORTTITLE'], type=btype, status=bill_data['STATUS']) bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower()) if bill_data['LONGTITLE']: bill.add_title(bill_data['LONGTITLE']) for sponsor in bill_data['SPONSOR_NAMES']: stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1 else 'cosponsor') bill.add_sponsor(stype, sponsor) # history is backwards for event in reversed(bill_data['HISTORY']): actor = ('upper' if event['chamber'] == 'Senate' else 'lower') date = datetime.datetime.strptime(event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S") # append committee names if present if 'committee_names' in event: action = (event['status'] + ' ' + ' and '.join(event['committee_names'])) else: action = event['status'] if event['action_code'] not in ksapi.action_codes: self.warning('unknown action code on %s: %s %s' % (bill_id, event['action_code'], event['status'])) atype = 'other' else: atype = ksapi.action_codes[event['action_code']] bill.add_action(actor, action, date, type=atype) self.scrape_html(bill) self.save_bill(bill)
def process_bill(self, data): chamber = parse_psuedo_id(data['from_organization'])['classification'] if chamber == 'legislature': chamber = 'upper' bill = Bill(data['legislative_session'], chamber, data['identifier'], data['title'], subjects=data['subject'], type=data['classification']) if data['abstracts']: bill['summary'] = data['abstracts'][0]['abstract'] bill.update(**data['extras']) for action in data['actions']: actor = parse_psuedo_id(action['organization_id'])['classification'] legislators = [] committees = [] for rel in action['related_entities']: if rel['entity_type'] == 'organization': committees.append(rel['name']) elif rel['entity_type'] == 'person': legislators.append(rel['name']) bill.add_action(actor, action['description'], parse_date(action['date']), type=_action_categories(action['classification']), committees=committees, legislators=legislators, **action.get('extras', {}), ) for source in data['sources']: bill.add_source(source['url']) for sponsor in data['sponsorships']: bill.add_sponsor(sponsor['classification'], sponsor['name'], ) for version in data['versions']: for link in version['links']: bill.add_version(version['note'], link['url'], mimetype=link['media_type'], date=parse_date(version['date']), **version.get('extras', {})) for doc in data['documents']: for link in doc['links']: bill.add_document(doc['note'], link['url'], mimetype=link['media_type'], date=parse_date(doc['date']), **doc.get('extras', {})) for title in data['other_titles']: bill.add_title(title['title']) for related in data['related_bills']: bill.add_companion(related['identifier'], related['legislative_session'], chamber ) self.save_bill(bill)
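# parse_psuedo_id() above turns an organization pseudo-id into a dict with a
# 'classification' key; a minimal sketch, assuming pupa-style pseudo ids of the form
# '~{"classification": "lower"}' (a tilde followed by a JSON object) -- an assumption,
# since the real helper may accept other shapes as well:
import json

def parse_psuedo_id(psuedo_id):
    # strip the leading '~' marker and decode the JSON payload that follows it
    return json.loads(psuedo_id.lstrip('~'))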
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url): """Extracts all the requested info for a given bill. Calls the parent's methods to enter the results into JSON files. """ if chamber == "House": chamber = 'lower' else: chamber = 'upper' with self.urlopen(bill_detail_url) as bill_html: doc = lxml.html.fromstring(bill_html) bill_id = doc.xpath('//title/text()')[0].split()[0] bill_title = doc.xpath('//font[@size=-1]/text()')[0] bill_type = {'F': 'bill', 'R':'resolution', 'C': 'concurrent resolution'}[bill_id[1]] bill = Bill(session, chamber, bill_id, bill_title, type=bill_type) bill['subjects'] = self._subject_mapping[bill_id] bill.add_source(bill_detail_url) # grab sponsors sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()') if sponsors: primary_sponsor = sponsors[0].strip() bill.add_sponsor('primary', primary_sponsor, chamber=chamber) cosponsors = sponsors[1:] for leg in cosponsors: bill.add_sponsor('cosponsor', leg.strip(), chamber=chamber) # Add Actions performed on the bill. bill_actions = self.extract_bill_actions(doc, chamber) for action in bill_actions: kwargs = {} if 'committee' in action: kwargs['committees'] = action['committees'] bill.add_action(action['action_chamber'], action['action_text'], action['action_date'], type=action['action_type'], **kwargs) # Get all versions of the bill. # Versions of a bill are on a separate page, linked to from the column # labeled, "Bill Text", on the search results page. with self.urlopen(version_list_url) as version_html: if 'resolution' in version_html.response.url: bill.add_version('resolution text', version_html.response.url, mimetype='text/html') else: version_doc = lxml.html.fromstring(version_html) for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'): version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href')) bill.add_version(v.text.strip(), version_url, mimetype='text/html') self.save_bill(bill)
def scrape(self, session, chambers): # Get the progress table. url = 'http://www.assembly.nl.ca/business/bills/ga47session1.htm' doc = lxml.html.fromstring(self.urlopen(url)) doc.make_links_absolute(url) for tr in doc.xpath('//table[@class="bills"]/tr')[1:]: bill_id = clean_spaces(tr[0].text_content()).strip('*') if not bill_id: break # empty rows extend past actual list of bills if bill_id.endswith("."): bill_id = bill_id[:-1] title = clean_spaces(tr[1].text_content()) chapter = tr[-1].text_content() bill = Bill(session, 'lower', bill_id, title, type='bill') if chapter: bill['chapter'] = chapter # FIXME need to do more work to figure out what # version the text *really* is td = tr[1] bill_url = td.xpath('a/@href') if bill_url: bill.add_version(url=bill_url.pop(), name='First Reading', mimetype='text/html') # Actions and version urls. data = zip([ 'First Reading', 'Second Reading', 'Committee', 'Amendments', 'Third Reading', 'Royal Assent', 'Act'], tr[2:-1]) for action, td in data: date_text = td.text_content() date = None fmt = r'%b. %d/%Y' try: date = datetime.datetime.strptime(date_text, fmt) except ValueError: continue else: break if date is None: continue attrs = dict(action=action, date=date, actor='lower') attrs.update(self.categorizer.categorize(action)) bill.add_action(**attrs) bill.add_source(url) self.save_bill(bill)
def scrape_xml(self, chamber, session): start_letter = "S" if chamber == "upper" else "H" sponsor_type_dict = {"3": "senate cosponsor", "4": "sponsor", "5": "sponsor"} version_url = "http://www1.legis.ga.gov/legis/%s/versions/" % session summary_url = "http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml" % session xml = self.urlopen(summary_url) doc = lxml.etree.fromstring(xml) for bxml in doc.xpath("//Bill"): type = bxml.get("Type") # if this is from the other chamber skip it if not type.startswith(start_letter): continue bill_id = type + bxml.get("Num") + bxml.get("Suffix") if type in ("HB", "SB"): type = "bill" elif type in ("HR", "SR"): type = "resolution" else: raise ValueError("unknown type: %s" % type) # use short_title as title and long as description title = bxml.xpath("Short_Title/text()")[0] description = bxml.xpath("Title/text()")[0] bill = Bill(session, chamber, bill_id, title, type=type, description=description) bill.add_source(summary_url) for sponsor in bxml.xpath("Sponsor"): sponsor_name, code = sponsor.text.rsplit(" ", 1) sponsor_name = sponsor_name.replace(",", ", ") bill.add_sponsor(sponsor_type_dict[sponsor.get("Type")], sponsor_name, _code=code) for version in bxml.xpath("Versions/Version"): # NOTE: it is possible to get PDF versions by using .get('Id') # ex. URL: legis.ga.gov/Legislation/20112012/108025.pdf # for now we just get HTML description, file_id = version.xpath("*/text()") bill.add_version(description, version_url + file_id) for action in bxml.xpath("StatusHistory/Status"): date = datetime.datetime.strptime(action.get("StatusDate"), "%Y-%m-%dT%H:%M:%S") code = action.get("StatusCode") if code in ("EFF", "Signed Gov"): actor = "executive" elif code[0] == "S": actor = "upper" elif code[0] == "H": actor = "lower" atype = self._action_codes[code] bill.add_action(actor, action.text, date, atype) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url): page = lxml.html.fromstring(self.urlopen(url)) title = page.xpath( "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip() if 'JR' in bill_id: bill_type = ['joint resolution'] elif 'CR' in bill_id: bill_type = ['concurrent resolution'] elif 'R' in bill_id: bill_type = ['resolution'] else: bill_type = ['bill'] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) bill['subjects'] = self.subject_map[bill_id] for link in page.xpath("//a[contains(@id, 'Auth')]"): name = link.xpath("string()").strip() if 'otherAuth' in link.attrib['id']: bill.add_sponsor('coauthor', name) else: bill.add_sponsor('author', name) act_table = page.xpath("//table[contains(@id, 'Actions')]")[0] for tr in act_table.xpath("tr")[2:]: action = tr.xpath("string(td[1])").strip() if not action or action == 'None': continue date = tr.xpath("string(td[3])").strip() date = datetime.datetime.strptime(date, "%m/%d/%Y").date() actor = tr.xpath("string(td[4])").strip() if actor == 'H': actor = 'lower' elif actor == 'S': actor = 'upper' bill.add_action(actor, action, date, type=action_type(action)) version_table = page.xpath("//table[contains(@id, 'Versions')]")[0] for link in version_table.xpath(".//a[contains(@href, '.DOC')]"): version_url = link.attrib['href'] if 'COMMITTEE REPORTS' in version_url: continue name = link.text.strip() bill.add_version(name, version_url) for link in page.xpath(".//a[contains(@href, '_VOTES')]"): self.scrape_votes(bill, urlescape(link.attrib['href'])) self.save_bill(bill)
def scrape_bill_page(self, chamber, session, bill_url, bill_type): page = self.lxmlize(bill_url) author = self.get_one_xpath(page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()") sbp = lambda x: self.scrape_bare_page(page.xpath("//a[contains(text(), '%s')]" % (x))[0].attrib["href"]) authors = [x.text for x in sbp("Authors")] try: digests = sbp("Digests") except IndexError: digests = [] try: versions = sbp("Text") except IndexError: versions = [] title = page.xpath("//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0] actions = page.xpath("//div[@id='ctl00_PageBody_PanelBillInfo']/" "/table[@style='font-size:small']/tr") bill_id = page.xpath("//span[@id='ctl00_PageBody_LabelBillID']/text()")[0] bill_type = {"B": "bill", "CR": "concurrent resolution"}[bill_type[1:]] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(bill_url) authors.remove(author) bill.add_sponsor("primary", author) for author in authors: bill.add_sponsor("cosponsor", author) for digest in digests: bill.add_document(digest.text, digest.attrib["href"], mimetype="application/pdf") for version in versions: bill.add_version(version.text, version.attrib["href"], mimetype="application/pdf") flags = {"prefiled": ["bill:filed"], "referred to the committee": ["committee:referred"]} for action in actions: date, chamber, page, text = [x.text for x in action.xpath(".//td")] date += "/%s" % (session) # Session is April --> June. Prefiles # look like they're in January at earliest. date = dt.datetime.strptime(date, "%m/%d/%Y") chamber = {"S": "upper", "H": "lower", "J": "joint"}[chamber] cat = [] for flag in flags: if flag in text.lower(): cat += flags[flag] if cat == []: cat = ["other"] bill.add_action(chamber, text, date, cat) self.save_bill(bill)
def scrape_bill(self, term, bill_url): with self.urlopen(bill_url) as page: page = lxml.html.fromstring(page) chamber1 = page.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text if len(page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')) > 0: chamber2 = page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')[0].text if '*' in chamber1: bill_id = chamber1.replace(' ', '')[1:len(chamber1)] secondary_bill_id = chamber2.replace(' ', '') else: bill_id = chamber2.replace(' ', '')[1:len(chamber2)] secondary_bill_id = chamber1.replace(' ', '') primary_chamber = 'lower' if 'H' in bill_id else 'upper' else: primary_chamber = 'lower' if 'H' in chamber1 else 'upper' bill_id = chamber1.replace(' ', '')[1:len(chamber1)] secondary_bill_id = None title = page.xpath("//span[@id='lblAbstract']")[0].text bill = Bill(term, primary_chamber, bill_id, title, secondary_bill_id=secondary_bill_id) bill.add_source(bill_url) # Primary Sponsor sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1] sponsor = sponsor.replace('*','').strip() bill.add_sponsor('primary',sponsor) # Co-sponsors unavailable for scraping (loaded into page via AJAX) # Full summary doc summary = page.xpath("//span[@id='lblBillSponsor']/a")[0] bill.add_document('Full summary', summary.get('href')) # Actions tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']") actions_table = tables[0] action_rows = actions_table.xpath("tr[position()>1]") for ar in action_rows: action_taken = ar.xpath("td")[0].text action_date = datetime.datetime.strptime(ar.xpath("td")[1].text.strip(), '%m/%d/%Y') #NEED TO ADD SECONDARY ACTIONS bill.add_action(primary_chamber, action_taken, action_date) votes_link = page.xpath("//span[@id='lblBillVotes']/a") if(len(votes_link) > 0): votes_link = votes_link[0].get('href') bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,)) self.save_bill(bill)
def scrape_current(self, chamber, term): chamber_name = "Senate" if chamber == "upper" else "House" with self.urlopen( ksapi.url + "bill_status/" ) as bill_request: # perhaps we should save this data so we can make one request for both chambers? bill_request_json = json.loads(bill_request) bills = bill_request_json["content"] for bill_data in bills: # filtering out other chambers bill_is_in_chamber = False for history in bill_data["HISTORY"]: if history["chamber"] == chamber_name: bill_is_in_chamber = True if not bill_is_in_chamber: continue # main bill = Bill(term, chamber, bill_data["BILLNO"], bill_data["SHORTTITLE"]) bill.add_source(ksapi.url + "bill_status/" + bill_data["BILLNO"].lower()) if bill_data["LONGTITLE"]: bill.add_title(bill_data["LONGTITLE"]) bill.add_document("apn", ksapi.ksleg + bill_data["apn"]) bill.add_version("Latest", ksapi.ksleg + bill_data["apn"]) for sponsor in bill_data["SPONSOR_NAMES"]: bill.add_sponsor("primary" if len(bill_data["SPONSOR_NAMES"]) == 1 else "cosponsor", sponsor) for event in bill_data["HISTORY"]: if "committee_names" in event and "conferee_names" in event: actor = " and ".join(event["committee_names"] + event["conferee_names"]) elif "committee_names" in event: actor = " and ".join(event["committee_names"]) elif "conferee_names" in event: actor = " and ".join(event["conferee_names"]) else: actor = "upper" if chamber_name == "Senate" else "lower" date = datetime.datetime.strptime(event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S") bill.add_action(actor, event["status"], date) if event["action_code"] in ksapi.voted: votes = votes_re.match(event["status"]) if votes: vote = Vote( chamber, date, votes.group(1), event["action_code"] in ksapi.passed, int(votes.group(2)), int(votes.group(3)), 0, ) vote.add_source(ksapi.ksleg + "bill_status/" + bill_data["BILLNO"].lower()) bill.add_vote(vote) self.save_bill(bill)
def scrape_bill(self, chamber, term, bill_id, url, title, subject=None): self.logger.info('GET ' + url) resp = self.get(url) html = resp.text doc = lxml.html.fromstring(html) doc.make_links_absolute(url) bill = Bill(term, chamber, bill_id, title) bill.add_source(url) if subject is not None: bill['subjects'] = [subject] # Sponsors sponsor_map = { 'author': 'primary', 'co-author': 'cosponsor', 'sponsor': 'cosponsor', 'co-sponsor': 'cosponsor', } for div in doc.xpath('//div[contains(@class, "bill-author-info")]'): name = div.xpath('string(b)').strip() sp_type = sponsor_map[div.xpath('string(p)').strip().lower()] bill.add_sponsor(sp_type, name) # Actions for li in doc.xpath('//div[@id="bill-actions"]//li')[::-1]: if li.text_content() == 'None currently available.': continue chamber_str = li.xpath('string(strong)').strip() action_chamber = dict(H='lower', S='upper')[chamber_str] action_date = li.xpath('string(span[@class="document-date"])') # Some resolution actions have no dates. if not action_date.strip(): continue action_date = datetime.datetime.strptime(action_date.strip(), '%m/%d/%Y') action_text = li.xpath('string(span[2])').strip() if not action_text.strip(): continue kwargs = dict(date=action_date, actor=action_chamber, action=action_text) kwargs.update(**self.categorizer.categorize(action_text)) bill.add_action(**kwargs) # Documents (including votes) for doc_type, doc_meta in BillDocuments(self, doc): if doc_type == 'version': bill.add_version( doc_meta.title or doc_meta.text, url=doc_meta.url, mimetype='application/pdf') elif doc_type == 'document': bill.add_document(doc_meta.title or doc_meta.text, url=doc_meta.url, mimetype='application/pdf') elif doc_type == 'rollcall': self.add_rollcall(chamber, bill, doc_meta) self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, bill_info_url): with self.urlopen(bill_info_url) as bill_info_data: bill_info = self.soup_parser(bill_info_data) version_url = '%s/bill.doc' % bill_id version_link = bill_info.find(href=version_url) if not version_link: # This bill was withdrawn return bill_title = version_link.findNext('p').contents[0].strip() bill = Bill(session, chamber, bill_id, bill_title) bill.add_version("Most Recent Version", session_url(session) + version_url) bill.add_source(bill_info_url) sponsor_links = bill_info.findAll(href=re.compile( 'legislator/[SH]\d+\.htm')) for sponsor_link in sponsor_links: bill.add_sponsor('primary', sponsor_link.contents[0].strip()) action_p = version_link.findAllNext('p')[-1] for action in action_p.findAll(text=True): action = action.strip() if (not action or action == 'last action' or 'Prefiled' in action): continue action_date = action.split('-')[0] action_date = dt.datetime.strptime(action_date, '%b %d') # Fix: action_date = action_date.replace( year=int('20' + session[2:4])) action = '-'.join(action.split('-')[1:]) if action.endswith('House') or action.endswith('(H)'): actor = 'lower' elif action.endswith('Senate') or action.endswith('(S)'): actor = 'upper' else: actor = chamber bill.add_action(actor, action, action_date) vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf')) if vote_link: bill.add_document( 'vote_history.pdf', bill_info_url.replace('.htm', '') + "/vote_history.pdf") self.save_bill(bill)
def scrape_bill(self, chamber, session, billid, histurl, year): if year[0] != 'R': session = year else: session = self.metadata['session_details'][year][ 'sub_sessions'][int(year[0]) - 1] with self.urlopen(histurl) as data: soup = BeautifulSoup(cleansource(data)) basicinfo = soup.findAll('div', id='bhistleft')[0] hist = basicinfo.table sponsor = None title = None for b in basicinfo.findAll('b'): if b.next.startswith('SUMMARY'): title = b.findNextSiblings(text=True)[0].strip() elif b.next.startswith('SPONSOR'): for a in b.findNextSiblings('a'): if not issponsorlink(a): break sponsor = cleansponsor(a.contents[0]) bill = Bill(session, chamber, billid, title) if sponsor: bill.add_sponsor('primary', sponsor) for row in hist.findAll('tr'): link = row.td.a vlink = urlbase % link['href'] vname = link.contents[0].strip() bill.add_version(vname, vlink) history = soup.findAll('div', id='bhisttab')[0].table rows = history.findAll('tr')[1:] for row in rows: tds = row.findAll('td') if len(tds) < 2: # This is not actually an action continue date, action = row.findAll('td')[:2] date = dt.datetime.strptime(date.contents[0], '%m/%d/%y') action = action.contents[0].strip() if 'House' in action: actor = 'lower' elif 'Senate' in action: actor = 'upper' else: # for lack of a better option actor = chamber bill.add_action(actor, action, date) self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_type, bill_url): with self.urlopen(bill_url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(bill_url) # split "SB1 SD2 HD2" to get SB1 bill_id = page.xpath('//a[@id="LinkButtonMeasure"]')[0].text_content().split()[0] title = page.xpath('//span[@id="ListView1_ctrl0_measure_titleLabel"]')[0].text subjects = page.xpath('//span[@id="ListView1_ctrl0_report_titleLabel"]')[0].text.split('; ') subjects = [s.strip() for s in subjects if s.strip()] description = page.xpath('//span[@id="ListView1_ctrl0_descriptionLabel"]')[0].text sponsors = page.xpath('//span[@id="ListView1_ctrl0_introducerLabel"]')[0].text referral = page.xpath('//span[contains(@id, "referral")]/text()')[0] bill = Bill(session, chamber, bill_id, title, subjects=subjects, type=bill_type, description=description, referral=referral) for sponsor in sponsors.split(', '): if sponsor.endswith(' (BR)'): sponsor = sponsor[:-5] bill.add_sponsor('primary', sponsor) # actions actions = [] table = page.xpath('//table[@id="GridViewStatus"]')[0] for row in table.xpath('tr'): action_params = {} cells = row.xpath('td') if len(cells) == 3: ch = cells[1].xpath('font')[0].text action_params['actor'] = house[ch] action_params['action'] = cells[2].xpath('font')[0].text action_date = cells[0].xpath('font')[0].text action_params['date'] = datetime.strptime(action_date, "%m/%d/%Y") action_params['type'] = categorize_action(action_params['action']) actions.append(action_params) for action_params in actions: bill.add_action(**action_params) self.parse_vote(bill, action_params['action'], action_params['actor'], action_params['date']) # add versions try: for version in page.xpath('//a[contains(@id, "StatusLink")]'): bill.add_version(version.text.replace('_', ' '), version.get('href')) except IndexError: # href not found. pass bill.add_source(bill_url) self.save_bill(bill)
def scrape_bill_status_page(self, url, params={}): """Scrapes the status page url, populating parameter dict and returns bill """ with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) params["bill_id"] = page.xpath('//h3[contains(@class, "center")]/a')[0].text.split()[0] params["title"] = page.xpath( '//div[div[contains( \ ., "Report Title")]]/div[contains(@class, "rightside")]' )[0].text.strip() sponsors = page.xpath( '//div[div[contains( \ ., "Introducer")]]/div[contains(@class, "rightside")]' )[0].text subject = page.xpath( '//div[div[contains( \ ., "Measure Title")]]/div[contains(@class, "rightside")]' )[0].text.strip() subject = subject.replace("RELATING TO ", "") # Remove lead text params["subject"] = subject.replace(".", "") params["description"] = page.xpath( '//div[div[contains( \ ., "Description")]]/div[contains(@class, "rightside")]' )[0].text params["companion"] = page.xpath( '//div[div[contains( \ ., "Companion")]]/div[contains(@class, "rightside")]' )[0].text if params["title"] == "": params["title"] = params["subject"] actions = [] table = page.xpath('//table[tr/th[contains(., "Date")]]')[0] for row in table.xpath("tr[td]"): # Ignore table header row action_params = {} cells = row.xpath("td") if len(cells) == 3: ch = cells[1].text action_params["actor"] = house[ch] action_params["action"] = cells[2].text action_date = cells[0].text.split()[0] # Just get date, ignore any time. try: action_params["date"] = datetime.strptime(action_date, "%m/%d/%y") except ValueError: # Try a YYYY format. action_params["date"] = datetime.strptime(action_date, "%m/%d/%Y") actions.append(action_params) bill = Bill(**params) bill.add_sponsor("primary", sponsors) for action_params in actions: bill.add_action(**action_params) self.save_bill(bill) return bill
def scrape_bill(self, chamber, session, doc_type, url): doc = self.url_to_doc(url) # bill id, title, synopsis bill_num = re.findall('DocNum=(\d+)', url)[0] bill_type = DOC_TYPES[doc_type[1:]] bill_id = doc_type + bill_num title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip() synopsis = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip() bill = Bill(session, chamber, bill_id, title, type=bill_type, synopsis=synopsis) bill.add_source(url) # sponsors sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]')) # don't add just yet; we can make them better using action data # actions action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td') for date, actor, action in group(action_tds, 3): date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y") actor = actor.text_content() if actor == 'House': actor = 'lower' elif actor == 'Senate': actor = 'upper' action = action.text_content() bill.add_action(actor, action, date, **_categorize_action(action)) if action.lower().find('sponsor') != -1: self.refine_sponsor_list(actor, action, sponsor_list, bill_id) # now add sponsors for spontype,sponsor,chamber in sponsor_list: if chamber: bill.add_sponsor(spontype, sponsor, chamber=chamber) else: bill.add_sponsor(spontype, sponsor) # versions version_url = doc.xpath('//a[text()="Full Text"]/@href')[0] self.scrape_documents(bill, version_url) # if there's more than 1 votehistory link, there are votes to grab if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1: votes_url = doc.xpath('//a[text()="Votes"]/@href')[0] self.scrape_votes(session, bill, votes_url) self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_type, bill_url): with self.urlopen(bill_url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(bill_url) # split "SB1 SD2 HD2" to get SB1 bill_id = page.xpath('//a[@class="headerlink"]')[0].text.split()[0] table = page.xpath('//table[@cellspacing="4px"]')[0] title = get_table_text(table, "Measure Title") subjects = get_table_text(table, "Report Title").split('; ') description = get_table_text(table, "Description") sponsors = get_table_text(table, "Introducer(s)") bill = Bill(session, chamber, bill_id, title, subjects=subjects, type=bill_type, description=description) for sponsor in sponsors.split(', '): if sponsor.endswith(' (BR)'): sponsor = sponsor[:-5] bill.add_sponsor('primary', sponsor) # actions actions = [] table = page.xpath('//table[contains(@id, "GridView1")]')[0] for row in table.xpath('tr'): action_params = {} cells = row.xpath('td') if len(cells) == 3: ch = cells[1].xpath('font')[0].text action_params['actor'] = house[ch] action_params['action'] = cells[2].xpath('font')[0].text action_date = cells[0].xpath('font')[0].text action_params['date'] = datetime.strptime(action_date, "%m/%d/%Y") action_params['type'] = categorize_action(action_params['action']) actions.append(action_params) for action_params in actions: bill.add_action(**action_params) self.parse_vote(bill, action_params['action'], action_params['actor'], action_params['date']) # Add version document if not on a javascript link. try: bill_version = page.xpath('//a[contains(@id, "HyperLinkPDF")]')[0].attrib['href'] bill.add_version('Current version', bill_version) except IndexError: # href not found. pass bill.add_source(bill_url) self.save_bill(bill)
def scrape(self, session, chambers): url = 'http://www.legassembly.sk.ca/legislative-business/bills/' doc = lxml.html.fromstring(self.urlopen(url)) doc.make_links_absolute(url) url = doc.xpath('//a[text() = "Progress of Bills"]/@href').pop() filename, resp = self.urlretrieve(url) doc = pdf_to_lxml(filename) actions = [ 'First Reading', 'Crown recommendation', 'Committee', 'Second Reading', 'Committee', 'Amend Date', 'Third Reading', 'Royal Assent', 'In Effect' ] for a in doc.xpath('//a[contains(@href, "legdocs/Bills")]'): bill_id = a.text_content().strip() predicate = lambda el: el.tag == 'br' sibs = list(takewhile(predicate, a.itersiblings())) # If the star is missing, insert it to avoid complicated code. if not sibs[0].tail.strip() == '*': sibs.insert(0, DummyBR('br', None, '*')) title_chunks = [sibs[1].tail.strip()] sponsor = sibs[2].tail.strip() dates = sibs[3].tail.split(u'\xa0') title_chunks.extend((br.tail or '').strip() for br in sibs[4:]) title = ' '.join(title_chunks).strip() bill = Bill(session, 'lower', bill_id, title, type='bill') bill.add_sponsor(name=sponsor, type='primary') for action, date in zip(actions, dates): date = datetime.datetime.strptime(date.strip(), '%Y-%m-%d') attrs = dict(action=action, date=date, actor='lower') attrs.update(self.categorizer.categorize(action)) bill.add_action(**attrs) bill.add_source(url) bill.add_version('Introduced', a.attrib['href'], mimetype='application/pdf') self.save_bill(bill)
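# DummyBR('br', None, '*') above fills in for a missing <br> element so the column
# parsing stays uniform; a minimal sketch, assuming it only needs to expose the same
# .tag, .text and .tail attributes that the surrounding loop reads (the real class may
# carry more behavior):
from collections import namedtuple

DummyBR = namedtuple('DummyBR', ['tag', 'text', 'tail'])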
def scrape_bill(self, chamber, session, doc_type, url): html = self.urlopen(url) doc = lxml.html.fromstring(html) doc.make_links_absolute(url) # bill id, title, synopsis bill_num = re.findall('DocNum=(\d+)', url)[0] bill_type = DOC_TYPES[doc_type[1:]] bill_id = doc_type + bill_num title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip() synopsis = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip() bill = Bill(session, chamber, bill_id, title, type=bill_type, synopsis=synopsis) # sponsors for sponsor in doc.xpath('//a[@class="content"]/text()'): bill.add_sponsor('cosponsor', sponsor) # actions action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td') for date, actor, action in group(action_tds, 3): date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y") actor = actor.text_content() if actor == 'House': actor = 'lower' elif actor == 'Senate': actor = 'upper' action = action.text_content() # TODO: categorize actions bill.add_action(actor, action, date) # versions version_url = doc.xpath('//a[text()="Full Text"]/@href')[0] self.scrape_documents(bill, version_url) # if there's more than 1 votehistory link, there are votes to grab if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1: votes_url = doc.xpath('//a[text()="Votes"]/@href')[0] self.scrape_votes(bill, votes_url) bill.add_source(votes_url) bill.add_source(url) self.save_bill(bill)
def scrape1995(self, url, year, chamberName, session, number): "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm" with self.lxml_context(url) as page: # Bill name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip() bill = Bill(session, chamberName, number, name) # Versions bill.add_version('Current', url.replace('/sum/', '/fulltext/'), mimetype='text/html') # Sponsorships rows = page.cssselect('center table tr') for row in rows: if row.text_content().strip() == 'Sponsor and CoSponsors': continue if row.text_content().strip() == 'Links / Committees / Status': break for a in row.cssselect('a'): bill.add_sponsor('', a.text_content().strip()) # Actions # The actions are in a pre table that looks like: """ SENATE HOUSE ------------------------------------- 1/13/95 Read 1st time 2/6/95 1/31/95 Favorably Reported 2/1/95 Read 2nd Time 2/7/95 2/3/95 Read 3rd Time 2/3/95 Passed/Adopted """ actions = page.cssselect('pre')[0].text_content().split('\n') actions = actions[2:] for action in actions: senate_date = action[:22].strip() action_text = action[23:46].strip() house_date = action[46:].strip() if '/' not in senate_date and '/' not in house_date: continue if senate_date: bill.add_action('upper', action_text, senate_date) if house_date: bill.add_action('lower', action_text, house_date) self.save_bill(bill)
def scrape_current(self, chamber, term): chamber_name = 'Senate' if chamber == 'upper' else 'House' chamber_letter = chamber_name[0] # perhaps we should save this data so we can make one request for both? with self.urlopen(ksapi.url + 'bill_status/') as bill_request: bill_request_json = json.loads(bill_request) bills = bill_request_json['content'] for bill_data in bills: bill_id = bill_data['BILLNO'] # filter other chambers if not bill_id.startswith(chamber_letter): continue if 'CR' in bill_id: btype = 'concurrent resolution' elif 'R' in bill_id: btype = 'resolution' elif 'B' in bill_id: btype = 'bill' # main bill = Bill(term, chamber, bill_id, bill_data['SHORTTITLE'], type=btype, status=bill_data['STATUS']) bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower()) if bill_data['LONGTITLE']: bill.add_title(bill_data['LONGTITLE']) for sponsor in bill_data['SPONSOR_NAMES']: stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1 else 'cosponsor') bill.add_sponsor(stype, sponsor) # history is backwards for event in reversed(bill_data['HISTORY']): actor = ('upper' if event['chamber'] == 'Senate' else 'lower') date = datetime.datetime.strptime( event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S") # append committee names if present if 'committee_names' in event: action = (event['status'] + ' ' + ' and '.join(event['committee_names'])) else: action = event['status'] if event['action_code'] not in ksapi.action_codes: self.warning( 'unknown action code on %s: %s %s' % (bill_id, event['action_code'], event['status'])) atype = 'other' else: atype = ksapi.action_codes[event['action_code']] bill.add_action(actor, action, date, type=atype) self.scrape_html(bill) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url): try: page = lxml.html.fromstring(self.urlopen(url)) except scrapelib.HTTPError as e: self.warning('error (%s) fetching %s, skipping' % (e, url)) return title = page.xpath( "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip() if 'JR' in bill_id: bill_type = ['joint resolution'] elif 'CR' in bill_id: bill_type = ['concurrent resolution'] elif 'R' in bill_id: bill_type = ['resolution'] else: bill_type = ['bill'] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) bill['subjects'] = self.subject_map[bill_id] for link in page.xpath("//a[contains(@id, 'Auth')]"): name = link.xpath("string()").strip() if 'otherAuth' in link.attrib['id']: bill.add_sponsor('cosponsor', name) else: bill.add_sponsor('primary', name) act_table = page.xpath("//table[contains(@id, 'Actions')]")[0] for tr in act_table.xpath("tr")[2:]: action = tr.xpath("string(td[1])").strip() if not action or action == 'None': continue date = tr.xpath("string(td[3])").strip() date = datetime.datetime.strptime(date, "%m/%d/%Y").date() actor = tr.xpath("string(td[4])").strip() if actor == 'H': actor = 'lower' elif actor == 'S': actor = 'upper' attrs = dict(actor=actor, action=action, date=date) attrs.update(**self.categorizer.categorize(action)) bill.add_action(**attrs) version_table = page.xpath("//table[contains(@id, 'Versions')]")[0] for link in version_table.xpath(".//a[contains(@href, '.DOC')]"): version_url = link.attrib['href'] if 'COMMITTEE REPORTS' in version_url: continue name = link.text.strip() bill.add_version(name, version_url, mimetype='application/msword') for link in page.xpath(".//a[contains(@href, '_VOTES')]"): self.scrape_votes(bill, urlescape(link.attrib['href'])) # # If the bill has no actions and no versions, it's a bogus bill on # # their website, which appears to happen occasionally. Skip. has_no_title = (bill['title'] == "Short Title Not Found.") if has_no_title: # If there's no title, this is an empty page. Skip! return else: # Otherwise, save the bills. self.save_bill(bill)
def get_bill_info(self, session, bill_id): bill_detail_url = 'http://www.ncga.state.nc.us/gascripts/'\ 'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s' % ( session, bill_id) if bill_id[0] == 'H': chamber = 'lower' else: chamber = 'upper' # parse the bill data page, finding the latest html text with self.urlopen(bill_detail_url) as data: doc = lxml.html.fromstring(data) title_div_txt = doc.xpath('//div[@id="title"]/text()')[0] if 'Joint Resolution' in title_div_txt: bill_type = 'joint resolution' bill_id = bill_id[0] + 'JR ' + bill_id[1:] elif 'Resolution' in title_div_txt: bill_type = 'resolution' bill_id = bill_id[0] + 'R ' + bill_id[1:] elif 'Bill' in title_div_txt: bill_type = 'bill' bill_id = bill_id[0] + 'B ' + bill_id[1:] title_style_xpath = '//div[@style="text-align: center; font: bold 20px Arial; margin-top: 15px; margin-bottom: 8px;"]/text()' bill_title = doc.xpath(title_style_xpath)[0] bill = Bill(session, chamber, bill_id, bill_title, type=bill_type) bill.add_source(bill_detail_url) # skip first PDF link (duplicate link to cur version) if chamber == 'lower': link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]' else: link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]' for vlink in doc.xpath(link_xpath)[1:]: # get the name from the PDF link... version_name = vlink.text.replace(u'\xa0', ' ') # but neighboring span with anchor inside has the HTML version version_url = vlink.xpath('./following-sibling::span/a/@href') version_url = 'http://www.ncga.state.nc.us' + version_url[0] bill.add_version(version_name, version_url) # sponsors pri_td = doc.xpath('//th[text()="Primary:"]/following-sibling::td') pri_text = pri_td[0].text_content().replace(u'\xa0', ' ').split('; ') for leg in pri_text: leg = leg.strip() if leg: if leg[-1] == ';': leg = leg[:-1] bill.add_sponsor('primary', leg) # cosponsors co_td = doc.xpath('//th[text()="Co:"]/following-sibling::td') co_text = co_td[0].text_content().replace(u'\xa0', ' ').split('; ') for leg in co_text: leg = leg.strip() if leg and leg != 'N/A': if leg[-1] == ';': leg = leg[:-1] bill.add_sponsor('cosponsor', leg) # actions action_tr_xpath = '//td[starts-with(text(),"History")]/../../tr' # skip two header rows for row in doc.xpath(action_tr_xpath)[2:]: tds = row.xpath('td') act_date = tds[0].text actor = tds[1].text or '' action = tds[2].text.strip() act_date = dt.datetime.strptime(act_date, '%m/%d/%Y') if actor == 'Senate': actor = 'upper' elif actor == 'House': actor = 'lower' else: actor = 'executive' for pattern, atype in self._action_classifiers.iteritems(): if action.startswith(pattern): break else: atype = 'other' bill.add_action(actor, action, act_date, type=atype) if self.is_latest_session(session): subj_key = bill_id[0] + ' ' + bill_id.split(' ')[-1] bill['subjects'] = self.subject_map[subj_key] self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id): # try and get bill for current year url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % ( session[:4], bill_id.replace(' ', '-')) html = self.urlopen(url) # if first page isn't found, try second year if 'Page Not Found' in html: html = self.urlopen('http://legislature.mi.gov/doc.aspx?%s-%s' % (session[-4:], bill_id.replace(' ', '-'))) if 'Page Not Found' in html: return None doc = lxml.html.fromstring(html) title = doc.xpath( '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content() # get B/R/JR/CR part and look up bill type bill_type = bill_types[bill_id.split(' ')[0][1:]] bill = Bill(session=session, chamber=chamber, bill_id=bill_id, title=title, type=bill_type) bill.add_source(url) # sponsors sp_type = 'primary' for sponsor in doc.xpath( '//span[@id="frg_billstatus_SponsorList"]/a/text()'): sponsor = sponsor.replace(u'\xa0', ' ') bill.add_sponsor(sp_type, sponsor) sp_type = 'cosponsor' bill['subjects'] = doc.xpath( '//span[@id="frg_billstatus_CategoryList"]/a/text()') # actions (skip header) for row in doc.xpath( '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]: tds = row.xpath('td') # date, journal link, action date = tds[0].text_content() journal = tds[1].text_content() action = tds[2].text_content() date = datetime.datetime.strptime(date, "%m/%d/%Y") # instead of trusting upper/lower case, use journal for actor actor = 'upper' if 'SJ' in journal else 'lower' type = categorize_action(action) bill.add_action(actor, action, date, type=type) # check if action mentions a vote rcmatch = re.search('Roll Call # (\d+)', action, re.IGNORECASE) if rcmatch: rc_num = rcmatch.groups()[0] # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011 journal_link = tds[1].xpath('a/@href') if journal_link: objectname = journal_link[0].rsplit('=', 1)[-1] chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor] vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % ( session, chamber_name, objectname) vote = Vote(actor, date, action, False, 0, 0, 0) self.parse_roll_call(vote, vote_url, rc_num) # check the expected counts vs actual count = re.search('YEAS (\d+)', action, re.IGNORECASE) count = int(count.groups()[0]) if count else 0 if count != len(vote['yes_votes']): self.warning( 'vote count mismatch for %s %s, %d != %d' % (bill_id, action, count, len(vote['yes_votes']))) count = re.search('NAYS (\d+)', action, re.IGNORECASE) count = int(count.groups()[0]) if count else 0 if count != len(vote['no_votes']): self.warning( 'vote count mismatch for %s %s, %d != %d' % (bill_id, action, count, len(vote['no_votes']))) vote['yes_count'] = len(vote['yes_votes']) vote['no_count'] = len(vote['no_votes']) vote['other_count'] = len(vote['other_votes']) vote['passed'] = vote['yes_count'] > vote['no_count'] vote.add_source(vote_url) bill.add_vote(vote) else: self.warning("missing journal link for %s %s" % (bill_id, journal)) # versions for row in doc.xpath( '//table[@id="frg_billstatus_DocumentGridTable"]/tr'): version = self.parse_doc_row(row) if version: if version[1].endswith('.pdf'): mimetype = 'application/pdf' elif version[1].endswith('.htm'): mimetype = 'text/html' bill.add_version(*version, mimetype=mimetype) # documents for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'): document = self.parse_doc_row(row) if document: bill.add_document(*document) for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'): document = self.parse_doc_row(row) if document: bill.add_document(*document) self.save_bill(bill) return True
def scrape(self, chamber, session): # for the chamber of the action chamber_map = {'House': 'lower', 'Senate':'upper', 'Joint': 'joint'} session_slug = session[:-2] chamber_slug = 'House' if chamber == 'lower' else 'Senate' # keep track of how many we've had to skip skipped = 0 for n in itertools.count(1): bill_id = '%s%05d' % (chamber_slug[0], n) bill_url = 'http://www.malegislature.gov/Bills/%s/%s/%s' % ( session_slug, chamber_slug, bill_id) with self.urlopen(bill_url) as html: # sometimes the site breaks if '</html>' not in html: self.warning('truncated page on %s' % bill_url) continue # lets assume if 10 bills are missing we're done if skipped == 10: break if 'Unable to find the Bill requested' in html: skipped += 1 # no such bill continue else: skipped = 0 doc = lxml.html.fromstring(html) doc.make_links_absolute('http://www.malegislature.gov/') title = doc.xpath('//h2/text()')[0] desc = doc.xpath('//p[@class="billShortDesc"]/text()')[0] # create bill bill = Bill(session, chamber, bill_id, title, description=desc) bill.add_source(bill_url) # actions for act_row in doc.xpath('//tbody[@class="bgwht"]/tr'): date = act_row.xpath('./td[@headers="bDate"]/text()')[0] date = datetime.strptime(date, "%m/%d/%Y") actor_txt = act_row.xpath('./td[@headers="bBranch"]')[0].text_content().strip() if actor_txt: actor = chamber_map[actor_txt] action = act_row.xpath('./td[@headers="bAction"]/text()')[0].strip() atype = classify_action(action) bill.add_action(actor, action, date, type=atype) # I tried to, as I was finding the sponsors, detect whether a # sponsor was already known. One has to do this because an author # is listed in the "Sponsors:" section and then the same person # will be listed with others in the "Petitioners:" section. We are # guessing that "Sponsors" are authors and "Petitioners" are # co-authors. Does this make sense? sponsors = dict((a.get('href'), a.text) for a in doc.xpath('//p[@class="billReferral"]/a')) petitioners = dict((a.get('href'), a.text) for a in doc.xpath('//div[@id="billSummary"]/p[1]/a')) # remove sponsors from petitioners for k in sponsors: petitioners.pop(k, None) for sponsor in sponsors.values(): bill.add_sponsor('primary', sponsor) for petitioner in petitioners.values(): bill.add_sponsor('cosponsor', petitioner) # sometimes version link is just missing bill_text_url = doc.xpath('//a[@title="Show and Print Bill Text"]/@href') if bill_text_url: bill.add_version('Current Text', bill_text_url[0]) self.save_bill(bill)
def scrape(self, chamber, session): # URL building if chamber == 'upper': url_chamber_name = 'senate' norm_chamber_name = 'Senate' else: url_chamber_name = 'house' norm_chamber_name = 'House' assembly_url = '/assembly/%s' % session chamber_url = '/bill-text/%s-bill.html' % (url_chamber_name) list_url = self.site_root + assembly_url + chamber_url # Parsing with self.urlopen(list_url) as data: soup = self.parser.parse(data) if not soup: raise ScrapeError('Failed to parse legaslative list page.') table = soup.find('table', summary=norm_chamber_name + ' Bills') bill_links = table.findAll('a', href=re.compile('bill-actions')) indexed_bills = {} for link in bill_links: # Populate base attributes attributes = { 'session': session, 'chamber': chamber, } bill_number = link.contents[0] if not re.match('^[0-9]{4}$', bill_number): raise ScrapeError('Bill number not in expected format.') # ND bill prefixes are coded numerically if bill_number[0] == '1': bill_prefix = 'HB' elif bill_number[0] == '2': bill_prefix = 'SB' elif bill_number[0] == '3': bill_prefix = 'HCR' elif bill_number[0] == '4': bill_prefix = 'SCR' elif bill_number[0] == '5': bill_prefix = 'HR' elif bill_number[0] == '6': bill_prefix = 'SR' elif bill_number[0] == '7': bill_prefix = 'HMR' elif bill_number[0] == '8': bill_prefix = 'SMR' attributes['bill_id'] = bill_prefix + ' ' + bill_number # Skip duplicates (bill is listed once for each version) if attributes['bill_id'] in indexed_bills.keys(): continue self.debug(attributes['bill_id']) # Parse details page attributes.update( self.scrape_bill_details(assembly_url, bill_number)) # Create bill bill = Bill(**attributes) # Parse actions (actions, actions_url) = self.scrape_bill_actions( assembly_url, bill_number, session) bill.add_source(actions_url) for action in actions: bill.add_action(**action) # Parse versions (versions, versions_url) = self.scrape_bill_versions( assembly_url, bill_number) bill.add_source(versions_url) for version in versions: bill.add_version(**version) # Add bill to dictionary, indexed by its id indexed_bills[attributes['bill_id']] = bill # Parse sponsorship data (sponsors, sponsors_url) = self.scrape_bill_sponsors(assembly_url) for bill_id, sponsor_list in sponsors.items(): for sponsor in sponsor_list: # Its possible a bill was misnamed somewhere... but thats # not a good enough reason to error out if bill_id in indexed_bills.keys(): bill = indexed_bills[bill_id] bill.add_sponsor(**sponsor) bill.add_source(sponsors_url) # Save bill for bill in indexed_bills.values(): self.save_bill(bill)
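# The North Dakota scraper above decodes the bill type from the first digit of
# the bill number with a long if/elif chain. The same mapping can be expressed
# as a dict lookup; this is an equivalent sketch, not the scraper's own code.
ND_PREFIXES = {
    '1': 'HB', '2': 'SB', '3': 'HCR', '4': 'SCR',
    '5': 'HR', '6': 'SR', '7': 'HMR', '8': 'SMR',
}

def bill_prefix(bill_number):
    try:
        return ND_PREFIXES[bill_number[0]]
    except KeyError:
        raise ValueError('Bill number not in expected format: %r' % bill_number)

print(bill_prefix('1012'))   # -> HB
print(bill_prefix('4005'))   # -> SCR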
def _scrape_bill(self, session, bill_data): details = self._parse_bill_details(bill_data) (senate_url, assembly_url, bill_chamber, bill_type, bill_id, title, (prefix, number, active_version)) = details bill = Bill(session, bill_chamber, bill_id, title, type=bill_type, summary=bill_data['summary']) if bill_data['title'] is None: bill['title'] = bill_data['summary'] bill_active_version = bill_data['amendments']['items'][active_version] # Parse sponsors. if bill_data['sponsor']['rules'] == True: bill.add_sponsor('primary', 'Rules Committee', chamber=bill_chamber) elif not bill_data['sponsor']['budget']: primary_sponsor = bill_data['sponsor']['member'] bill.add_sponsor('primary', primary_sponsor['shortName']) # There *shouldn't* be cosponsors if there is no sponsor. cosponsors = bill_active_version['coSponsors']['items'] for cosponsor in cosponsors: bill.add_sponsor('cosponsor', cosponsor['shortName']) # List companion bill. same_as = bill_active_version.get('sameAs', {}) # Check whether "sameAs" property is populated with at least one bill. if same_as['items']: # Get companion bill ID. companion_bill_id = same_as['items'][0]['basePrintNo'] # Build companion bill session. start_year = same_as['items'][0]['session'] end_year = start_year + 1 companion_bill_session = '-'.join([str(start_year), str(end_year)]) # Determine companion bill chamber. companion_bill_prefix = self._parse_bill_number( same_as['items'][0]['basePrintNo'])[0] companion_bill_chamber = self._parse_bill_prefix( companion_bill_prefix)[0] # Attach companion bill data. bill.add_companion( companion_bill_id, companion_bill_session, companion_bill_chamber, ) # Parse actions. chamber_map = { 'senate': 'upper', 'assembly': 'lower', } for action in bill_data['actions']['items']: chamber = chamber_map[action['chamber'].lower()] action_datetime = datetime.datetime.strptime( action['date'], '%Y-%m-%d') action_date = action_datetime.date() types, attrs = NYBillScraper.categorizer.categorize(action['text']) bill.add_action(chamber, action['text'], action_date, type=types, **attrs) # Chamber-specific processing. if bill_chamber == 'upper': # Collect votes. for vote_data in bill_data['votes']['items']: vote = self._parse_senate_votes(vote_data) bill.add_vote(vote) elif bill_chamber == 'lower': assembly = AssemblyBillPage(self, session, bill, details) assembly.build() assembly_bill_data = assembly.bill # A little strange the way it works out, but the Assembly # provides the HTML version documents and the Senate provides # the PDF version documents. amendments = bill_data['amendments']['items'] for key, amendment in amendments.iteritems(): version = amendment['printNo'] html_version = version + ' HTML' html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\ '{}&term={}'.format(bill_id, self.term_start_year) bill.add_version(html_version, html_url, on_duplicate='use_new', mimetype='text/html') pdf_version = version + ' PDF' pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\ .format(self.term_start_year, bill_id) bill.add_version(pdf_version, pdf_url, on_duplicate='use_new', mimetype='application/pdf') # Handling of sources follows. Sources serving either chamber # maintain duplicate data, so we can see certain bill data # through either chamber's resources. However, we have to refer # to a specific chamber's resources if we want to grab certain # specific information such as vote data. # # As such, I'm placing all potential sources in the interest of # thoroughness. - Andy Lo # List Open Legislation API endpoint as a source. 
bill.add_source(self.api_client.root + self.api_client.resources['bill'].format(session_year=session, bill_id=bill_id, summary='', detail='')) bill.add_source(senate_url) bill.add_source(assembly_url) return bill
def scrape(self, session, chambers): HTML_TAGS_RE = r'<.*?>' year_slug = session[5:] # Load all bills and resolutions via the private API bills_url = \ 'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\ format(year_slug) bills_json = self.get(bills_url).text bills = json.loads(bills_json)['data'] or [] bills_url = \ 'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\ format(year_slug) bills_json = self.get(bills_url).text bills.extend(json.loads(bills_json)['data'] or []) resolutions_url = \ 'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\ format(year_slug) resolutions_json = self.get(resolutions_url).text bills.extend(json.loads(resolutions_json)['data'] or []) # Parse the information from each bill for info in bills: # Strip whitespace from strings info = {k: v.strip() for k, v in info.iteritems()} # Identify the bill type and chamber if info['BillNumber'].startswith('J.R.H.'): bill_type = 'joint resolution' bill_chamber = 'lower' elif info['BillNumber'].startswith('J.R.S.'): bill_type = 'joint resolution' bill_chamber = 'upper' elif info['BillNumber'].startswith('H.C.R.'): bill_type = 'concurrent resolution' bill_chamber = 'lower' elif info['BillNumber'].startswith('S.C.R.'): bill_type = 'concurrent resolution' bill_chamber = 'upper' elif info['BillNumber'].startswith('H.R.'): bill_type = 'resolution' bill_chamber = 'lower' elif info['BillNumber'].startswith('S.R.'): bill_type = 'resolution' bill_chamber = 'upper' elif info['BillNumber'].startswith('PR.'): bill_type = 'constitutional amendment' if info['Body'] == 'H': bill_chamber = 'lower' elif info['Body'] == 'S': bill_chamber = 'upper' else: raise AssertionError("Amendment not tied to chamber") elif info['BillNumber'].startswith('H.'): bill_type = 'bill' bill_chamber = 'lower' elif info['BillNumber'].startswith('S.'): bill_type = 'bill' bill_chamber = 'upper' else: raise AssertionError("Unknown bill type found: '{}'".format( info['BillNumber'])) # Create the bill using its basic information bill = Bill(session=session, bill_id=info['BillNumber'], title=info['Title'], chamber=bill_chamber, type=bill_type) if 'resolution' in bill_type: bill.add_source(resolutions_url) else: bill.add_source(bills_url) # Load the bill's information page to access its metadata bill_url = \ 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\ format(year_slug, info['BillNumber']) doc = self.lxmlize(bill_url) bill.add_source(bill_url) # Capture sponsors sponsors = doc.xpath( '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/' 'following-sibling::dd[1]/ul/li') sponsor_type = 'primary' for sponsor in sponsors: if sponsor.xpath('span/text()') == ['Additional Sponsors']: sponsor_type = 'cosponsor' continue sponsor_name = sponsor.xpath('a/text()')[0].\ replace("Rep.", "").replace("Sen.", "").strip() if sponsor_name and not \ (sponsor_name[ :5] == "Less" and len(sponsor_name) == 5): bill.add_sponsor(sponsor_type, sponsor_name) # Capture bill text versions versions = doc.xpath( '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/' 'following-sibling::dd[1]/ul/li/a') for version in versions: bill.add_version(name=version.xpath('text()')[0], url=version.xpath('@href')[0].replace( ' ', '%20'), mimetype='application/pdf') # Identify the internal bill ID, used for actions and votes # If there is no internal bill ID, then it has no extra information try: internal_bill_id = re.search( r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format( year_slug), lxml.etree.tostring(doc)).group(1) except AttributeError: 
self.warning("Bill {} appears to have no activity".\ format(info['BillNumber'])) self.save_bill(bill) continue # Capture actions actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\ format(year_slug, internal_bill_id) actions_json = self.get(actions_url).text actions = json.loads(actions_json)['data'] bill.add_source(actions_url) chambers_passed = set() for action in actions: action = {k: v.strip() for k, v in action.iteritems()} if "Signed by Governor" in action['FullStatus']: actor = 'governor' elif action['ChamberCode'] == 'H': actor = 'lower' elif action['ChamberCode'] == 'S': actor = 'upper' else: raise AssertionError("Unknown actor for bill action") # Categorize action if "Signed by Governor" in action['FullStatus']: assert chambers_passed == set("HS") action_type = 'governor:signed' elif actor == 'lower' and \ any(x.lower().startswith('aspassed') for x in action['keywords'].split(';')): action_type = 'bill:passed' chambers_passed.add("H") elif actor == 'upper' and \ any(x.lower().startswith(' aspassed') or x.lower().startswith('aspassed') for x in action['keywords'].split(';')): action_type = 'bill:passed' chambers_passed.add("S") else: action_type = 'other' bill.add_action(actor=actor, action=re.sub(HTML_TAGS_RE, "", action['FullStatus']), date=datetime.datetime.strptime( action['StatusDate'], '%m/%d/%Y'), type=action_type) # Capture votes votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\ format(year_slug, internal_bill_id) votes_json = self.get(votes_url).text votes = json.loads(votes_json)['data'] bill.add_source(votes_url) for vote in votes: roll_call_id = vote['VoteHeaderID'] roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\ format(year_slug, roll_call_id) roll_call_json = self.get(roll_call_url).text roll_call = json.loads(roll_call_json)['data'] roll_call_yea = [] roll_call_nay = [] roll_call_other = [] for member in roll_call: (member_name, _district) = member['MemberName'].split(" of ") member_name = member_name.strip() if member['MemberVote'] == "Yea": roll_call_yea.append(member_name) elif member['MemberVote'] == "Nay": roll_call_nay.append(member_name) else: roll_call_other.append(member_name) if "Passed -- " in vote['FullStatus']: did_pass = True elif "Failed -- " in vote['FullStatus']: did_pass = False else: raise AssertionError("Roll call vote result is unclear") # Check vote counts yea_count = \ int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1)) nay_count = \ int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1)) vote_to_add = Vote(chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'), date=datetime.datetime.strptime( vote['StatusDate'], '%m/%d/%Y'), motion=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(), passed=did_pass, yes_count=yea_count, no_count=nay_count, other_count=len(roll_call_other)) vote_to_add.add_source(roll_call_url) for member in roll_call_yea: vote_to_add.yes(member) for member in roll_call_nay: vote_to_add.no(member) for member in roll_call_other: vote_to_add.other(member) try: vote_to_add.validate() except ValueError as e: self.warning(e) bill.add_vote(vote_to_add) # Capture extra information # This is not in the OpenStates spec, but is available # Not yet implemented # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id} # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number} # Committee meetings: 
#   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
self.save_bill(bill)
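# The Vermont scraper above classifies each bill by its "BillNumber" prefix;
# the checks have to run longest-prefix-first so that "J.R.H." is not swallowed
# by the bare "H." test. A compact sketch of the same idea with an ordered list
# of (prefix, type, chamber) tuples. The constitutional-amendment case ("PR."),
# which also needs the separate "Body" field, is left out of this sketch.
VT_PREFIXES = [
    ('J.R.H.', 'joint resolution', 'lower'),
    ('J.R.S.', 'joint resolution', 'upper'),
    ('H.C.R.', 'concurrent resolution', 'lower'),
    ('S.C.R.', 'concurrent resolution', 'upper'),
    ('H.R.', 'resolution', 'lower'),
    ('S.R.', 'resolution', 'upper'),
    ('H.', 'bill', 'lower'),
    ('S.', 'bill', 'upper'),
]

def classify(bill_number):
    for prefix, bill_type, chamber in VT_PREFIXES:
        if bill_number.startswith(prefix):
            return bill_type, chamber
    raise ValueError("Unknown bill type found: %r" % bill_number)

print(classify('J.R.H. 12'))   # -> ('joint resolution', 'lower')
print(classify('H. 57'))       # -> ('bill', 'lower')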
def scrape_details(self, bill_detail_url, session, chamber, bill_id, page): data = page pat1 = re.compile(r'</FORM>') results = pat1.search(data) if not results: raise ScrapeError("scrape_details(1) - unable to parse |%s|" % bill_detail_url) pre_start = page.find("<pre>", results.start()) if pre_start == -1: self.warning( "scrape_details(2) - unable to parse (no <pre>) |%s|\n|%s|" % (bill_detail_url, page)) return pre_stop = page.find("</pre>", pre_start) if pre_stop == -1: raise ScrapeError( "scrape_details(3) - unable to parse (no </pre>) %s" % bill_detail_url) pre_section = page[pre_start:pre_stop] data = pre_section vurl = None action_line_re = re.compile(r'(\d\d/\d\d/\d\d)\s+(\w+)\s+(.+)') pat2 = re.compile(r' By ') results = pat2.search(data) if results != None: bystuff = data[results.start():results.end()] data = data[results.end():] pat3 = re.compile(r'</b>') results1 = pat3.search(data) newspon = [] if results != None and results1 != None: spondata = data[:results1.start()] mysponsors = sponsorsToList(spondata) for s in mysponsors: newspon.append(s) data = data[results1.end():] apat = re.compile(">(H|S) (\d*)<") billpat = re.compile("(\d+)") bill_number = billpat.search(bill_id).group(0) (similar_bills, summary, after_summary, vurl) = self.split_page_into_parts(data, session, bill_number) bill_summary = summary.strip().decode('utf8', 'ignore') bill = Bill(session, chamber, bill_id, bill_summary, type=bill_type(bill_summary)) linenum = 0 for line in after_summary.splitlines(): #get rid of the parenthesis action_line = line.partition("(")[0].strip() #r1 = action_line_re.search(action_line) r = action_line_re.search(action_line) if r: the_date = r.group(1) action_chamber = r.group(2) action = r.group(3) date = datetime.datetime.strptime(the_date, "%m/%d/%y") date = date.date() t = action_type(action) if t == ['other']: self.debug("OTHERACTION: bill %s %d Text[%s] line[%s]" % (bill_id, linenum, action, line)) else: self.debug("ACTION: %s %d dt|ch|action [%s|%s|%s] [%s]" % (bill_id, linenum, the_date, action_chamber, action, str(t))) bill.add_action(chamber, action, date, t) elif len(line) > 0: self.debug("Skipping line %d [%s] line:[%s]" % (linenum, bill_id, line)) linenum += 1 if similar_bills: bill['similar'] = similar_bills bill.add_source(bill_detail_url) for sponsor in newspon: bill.add_sponsor("sponsor", sponsor) if vurl: try: self.scrape_vote_history(vurl, chamber, bill, bill_id) bill.add_source(vurl) self.debug("Scraped votes: (chamber=%s,bill=%s,url=%s)" % (chamber, bill_id, vurl)) except Exception as error: self.warning( "Failed to scrape votes: chamber=%s bill=%s vurl=%s %s" % (chamber, bill_id, vurl, traceback.format_exc())) self.save_bill(bill)
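# The detail parser above pulls actions out of the <pre> block one line at a
# time with action_line_re, which captures (date, chamber, text). A standalone
# run of that regex on an invented action line, just to show the groups.
import re

action_line_re = re.compile(r'(\d\d/\d\d/\d\d)\s+(\w+)\s+(.+)')
m = action_line_re.search('01/13/15  Senate  Introduced and read first time')
if m:
    print(m.group(1), '|', m.group(2), '|', m.group(3))
# 01/13/15 | Senate | Introduced and read first time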
def scrape_bill(self, link, chamber, session): legislation_types = { 'House Bill': 'HB', 'House Concurrent Resolution': 'HCR', 'House Joint Resolution': 'HJR', 'House Resolution': 'HR', 'Senate Bill': 'SB', 'Senate Concurrent Resolution': 'SCR', 'Senate Joint Resolution': 'SJR', 'Senate Resolution': 'SR', } base_url = "http://legis.delaware.gov" text_base_url = "http://legis.delaware.gov/LIS/lis{session}.nsf/vwLegislation/{bill_id}/$file/legis.html?open" try: page = self.lxmlize(link) except scrapelib.HTTPError: self.logger.warning('404. Apparently the bill hasn\'t been posted') return nominee = page.xpath(".//div[@id='page_header']/text()")[0] if nominee.strip().lower() == "nominee information": self.logger.info("Nominee, skipping") return bill_id = page.xpath(".//div[@align='center']") try: bill_id = bill_id[0].text_content().strip() except IndexError: self.logger.warning("Can't find bill number, skipping") return #some bill_ids include relevant amendments #in the form "SB 10 w/SA1", so we fix it here bill_id = bill_id.split("w/")[0] bill_id = bill_id.split("(")[0] leg_type = None for long_name, short_name in legislation_types.items(): if long_name in bill_id: leg_type = short_name bill_num = bill_id.replace(long_name, "").strip() break if leg_type: bill_id = leg_type + " " + bill_num elif "for" in bill_id: bill_id = bill_id.split("for")[1] else: self.logger.warning("Unknown bill type for {}".format(bill_id)) return bill_id = bill_id.replace(' ', "") bill_id = bill_id.strip() #each row is in its own table #there are no classes/ids or anything, so we're going to loop #through the individual tables and look for keywords #in the first td to tell us what we're looking at tables = page.xpath('.//div[@id="page_content"]/table') bill_documents = {} action_list = [] vote_documents = {} sub_link = None bill_text_avail = False for table in tables: tds = table.xpath('.//td') if len(tds) == 0: #some kind of empty table for formatting reasons continue title_text = tds[0].text_content().strip().lower() if title_text.startswith('primary sponsor'): pri_sponsor_text = tds[1].text_content() primary_sponsors = self.separate_names(pri_sponsor_text) #sometimes additional sponsors are in a 3rd td #other times the 3rd td contains a blank image addl_sponsors = [] add_spons_text = tds[2].text_content().strip() if add_spons_text: add_spons_text = add_spons_text.replace( "Additional Sponsor(s):", "") if not "on behalf of all representatives" in add_spons_text.lower( ): addl_sponsors = self.separate_names(add_spons_text) elif title_text.startswith('co-sponsor'): cosponsor_text = tds[1].text_content() if "none..." 
in cosponsor_text.lower(): cosponsors = [] continue cosponsors = self.separate_names(cosponsor_text) elif title_text.startswith('long title'): bill_title = tds[1].text_content().strip() elif title_text.startswith('amendment'): amendments = tds[1].xpath('.//a') for a in amendments: amm = a.text self.logger.debug(amm) amm_text = "Amendment".format(amm.strip()) amm_slg = "+".join(amm.split()) amm_link = text_base_url.format(session=session, bill_id=amm_slg) bill_documents[amm_text] = amm_link amm_page = self.lxmlize(a.attrib["href"]) for tr in amm_page.xpath('//tr'): tds = tr.xpath("./td") if len(tds) > 1: if "voting" in tds[0].text_content().lower(): self.find_vote(tds, vote_documents, "Amendment: ") elif title_text.startswith('engrossed version'): if tds[1].text_content().strip(): engrossment_base = "http://legis.delaware.gov/LIS/lis{session}.nsf/EngrossmentsforLookup/{bill_id}/$file/Engross.html?open" engrossment_link = engrossment_base.format( session=session, bill_id="+".join(bill_id.split())) if bill_url not in bill_documents.values(): bill_documents["Engrossed Version"] = engrossment_link elif title_text.startswith('substituted'): content = tds[1].text_content().strip() if ("Substitute" in content and not "Original" in content): sub_link = tds[1].xpath(".//a/@href")[0] elif ("full text" in title_text and ("(" not in title_text or "html" in title_text)): if tds[1].text_content().strip(): #it is totally unclear which version of the bill is referred to here #so I'm just calling it "bill text" bill_url = text_base_url.format(session=session, bill_id=bill_id.replace( " ", "+")) if bill_url not in bill_documents.values(): bill_documents["Bill Text"] = bill_url elif title_text.startswith('fiscal notes'): pass #skipping fiscal notes for now, they are really ugly #but leaving in as a placeholder so we can remember to #do this someday, if we feel like it elif title_text.startswith('committee reports'): pass #the committee reports let a legislator #comment on a bill. They can comment as #"favorable","unfavorable" or "on its merits" #but these are NOT votes (per conversation w #seceretary of the DE senate 3/16/15). The bill is #considered if the majority sign it, which will #appear in the bill's action history as being #reported out of committee elif title_text.startswith('voting'): self.find_vote(tds, vote_documents) elif title_text.startswith('actions history'): action_list = tds[1].text_content().split("\n") sub_versions = [] use_sub = False if sub_link: bill = self.scrape_bill(sub_link, chamber, session) if bill: sub_versions = [v["url"] for v in bill["versions"]] bill.add_title(bill_id) use_sub = True if not use_sub: bill = Bill(session, chamber, bill_id, bill_title) for s in primary_sponsors: bill.add_sponsor("primary", s) for s in addl_sponsors: #it is not totally clear whether "additional sponsors" #are co or primary but primary is my best guess #based on the bill text, bc they're on the first #line with the primary sponsor bill.add_sponsor("primary", s) for s in cosponsors: bill.add_sponsor("cosponsor", s) for name, doc_link in bill_documents.items(): if "Engrossment" in name or "Bill Text" in name: if doc_link not in sub_versions: bill.add_version(name, doc_link, mimetype="text/html") else: pass bill.add_document(name, doc_link, mimetype="text/html") for a in action_list: if a.strip(): date, action = a.split('-', 1) try: date = datetime.strptime(date.strip(), '%b %d, %Y') except ValueError: date = datetime.strptime(date.strip(), '%B %d, %Y') # XXX: ugh. 
action = action.strip() actor = actions.get_actor(action, bill['chamber']) attrs = dict(actor=actor, action=action, date=date) attrs.update(**self.categorizer.categorize(action)) attrs["action"] = " ".join(attrs["action"].split()) bill.add_action(**attrs) for name, doc in vote_documents.items(): vote_chamber = "lower" if "house" in name.lower() else "upper" try: self.head(doc) except scrapelib.HTTPError: self.logger.warning("could not access vote document") continue vote_page = self.lxmlize(doc) vote_info = vote_page.xpath(".//div[@id='page_content']/p")[-1] yes_votes = [] no_votes = [] other_votes = [] lines = vote_info.text_content().split("\n") for line in lines: if line.strip().startswith("Date"): date_str = " ".join(line.split()[1:4]) date = datetime.strptime(date_str, "%m/%d/%Y %I:%M %p") passage_status = line.strip().split()[-1] #we've never seen a vote with anything but "passed" #so throw an error otherwise so we can figure it out passed_statuses = ["Passed"] failed_statuses = ["Defeated", "Rescinded"] if passage_status not in passed_statuses + failed_statuses: raise AssertionError( "Unknown passage state {}".format(passage_status)) passed = passage_status in passed_statuses if line.strip().startswith("Vote Type"): if "voice" in line.lower(): voice_vote = True else: voice_vote = False yes_count = int(re.findall("Yes: (\d+)", line)[0]) no_count = int(re.findall("No: (\d+)", line)[0]) other_count = int( re.findall("Not Voting: (\d+)", line)[0]) other_count += int( re.findall("Absent: (\d+)", line)[0]) vote_tds = vote_page.xpath(".//table//td") person_seen = False for td in vote_tds: if person_seen: person_vote = td.text_content().strip() if person_vote == "Y": yes_votes.append(person) elif person_vote == "N": no_votes.append(person) elif person_vote in ["NV", "A", "X", "C"]: other_votes.append(person) else: raise AssertionError( "Unknown vote '{}'".format( person_vote)) person_seen = False else: person = td.text_content().strip() if person: person_seen = True if voice_vote: vote = Vote(vote_chamber, date, "passage", passed, 0, 0, 0) else: vote = Vote(vote_chamber, date, "passage", passed, yes_count, no_count, other_count, yes_votes=[], no_votes=[], other_votes=[]) vote["yes_votes"] = yes_votes vote["no_votes"] = no_votes vote["other_votes"] = other_votes if (passed and vote["yes_count"] <= vote["no_count"] and not voice_vote): raise AssertionError("Vote passed with more N than Y votes?") if not passed and vote["yes_count"] > vote["no_count"]: self.logger.warning("Vote did not pass but had a majority \ probably worth checking") if "Amendment" in name: vote["type"] = "amendment" else: vote["type"] = "passage" vote.add_source(doc) bill.add_vote(vote) bill.add_source(link) return bill
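# The Delaware vote parser above walks the vote table cells as name/vote pairs
# using a "person_seen" toggle. The same pairing can be sketched with zip()
# over a flat list of cell texts; the names and vote codes below are invented,
# and the sketch assumes no blank padding cells between pairs.
def pair_votes(cell_texts):
    """Group flat [name, vote, name, vote, ...] cell text into three buckets."""
    yes, no, other = [], [], []
    it = iter(cell_texts)
    for name, vote in zip(it, it):   # consume the flat list two cells at a time
        if vote == 'Y':
            yes.append(name)
        elif vote == 'N':
            no.append(name)
        else:                        # NV / A / X / C on the page above
            other.append(name)
    return yes, no, other

print(pair_votes(['Smith', 'Y', 'Jones', 'N', 'Lee', 'NV']))
# (['Smith'], ['Jones'], ['Lee'])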
def parse_bill_xml(self, chamber, session, txt): root = lxml.etree.fromstring(txt) bill_id = ' '.join(root.attrib['bill'].split(' ')[1:]) bill_title = root.findtext("caption") if session[2] == 'R': session = session[0:2] if bill_id[1] == 'B': bill_type = ['bill'] elif bill_id[1] == 'R': bill_type = ['resolution'] elif bill_id[1:3] == 'CR': bill_type = ['concurrent resolution'] elif bill_id[1:3] == 'JR': bill_type = ['joint resolution'] else: raise ScrapeError("Invalid bill_id: %s" % bill_id) bill = Bill(session, chamber, bill_id, bill_title, type=bill_type) for action in root.findall('actions/action'): act_date = datetime.datetime.strptime(action.findtext('date'), "%m/%d/%Y").date() extra = {} extra['action_number'] = action.find('actionNumber').text comment = action.find('comment') if comment is not None and comment.text: extra['comment'] = comment.text.strip() actor = { 'H': 'lower', 'S': 'upper', 'E': 'executive' }[extra['action_number'][0]] desc = action.findtext('description').strip() if desc == 'Scheduled for public hearing on . . .': continue if desc == 'Amended': atype = 'amendment:passed' elif desc == 'Amendment(s) offered': atype = 'amendment:introduced' elif desc == 'Amendment amended': atype = 'amendment:amended' elif desc == 'Amendment withdrawn': atype = 'amendment:withdrawn' elif desc == 'Passed' or desc == 'Adopted': atype = 'bill:passed' elif re.match(r'^Received (by|from) the', desc): if 'Secretary of the Senate' not in desc: atype = 'bill:introduced' else: atype = 'other' elif desc.startswith('Sent to the Governor'): # But what if it gets lost in the mail? atype = 'governor:received' elif desc.startswith('Signed by the Governor'): atype = 'governor:signed' elif desc == 'Read first time': atype = ['bill:introduced', 'bill:reading:1'] introduced = True elif desc == 'Read & adopted': atype = 'bill:passed' elif desc.startswith('Referred to') or desc.startswith( "Recommended to be sent to "): atype = 'committee:referred' elif desc == "Reported favorably w/o amendment(s)": atype = 'committee:passed' elif desc == "Filed": atype = 'bill:filed' else: atype = 'other' bill.add_action(actor, action.findtext('description'), act_date, type=atype, **extra) for author in root.findtext('authors').split(' | '): if author != "": bill.add_sponsor('author', author) for coauthor in root.findtext('coauthors').split(' | '): if coauthor != "": bill.add_sponsor('coauthor', coauthor) for sponsor in root.findtext('sponsors').split(' | '): if sponsor != "": bill.add_sponsor('sponsor', sponsor) for cosponsor in root.findtext('cosponsors').split(' | '): if cosponsor != "": bill.add_sponsor('cosponsor', cosponsor) bill['subjects'] = [] for subject in root.iterfind('subjects/subject'): bill['subjects'].append(subject.text.strip()) return bill
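# The XML parser above derives the acting chamber from the first character of
# the <actionNumber> element ('H', 'S', or 'E' for executive). A minimal
# standalone version of that lookup on a made-up action number.
ACTORS = {'H': 'lower', 'S': 'upper', 'E': 'executive'}

def actor_for(action_number):
    return ACTORS[action_number[0]]

print(actor_for('H00123'))   # -> lower
print(actor_for('E00002'))   # -> executive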
def scrape_actions(self, session, href): page = self.lxmlize(href) (bid, ) = page.xpath('//h1[@id="page-title"]/text()') bid = re.sub(r"^Bill Actions for ", "", bid) subjects = self.subjects.get(bid, []) # some pages say "Measure Number Breakdown", others "Bill..." table = page.xpath("//table[contains(@summary, 'Number Breakdown')]") table = table[0] ttrows = page.xpath("//div[@id='application']/p") descr = ttrows[-2] title = re.sub("\s+", " ", descr.text_content()).strip() ttrows = ttrows[:-1] chamber = {"H": "lower", "S": "upper"}[bid[0]] type_ = bid[1:3] bill_type = "bill" if type_.startswith("B"): bill_type = "bill" if type_.startswith("R"): bill_type = "resolution" if type_ == "CR": bill_type = "concurrent resolution" bill = Bill(session, chamber, bid, title, subjects=subjects, type=bill_type) bill.add_source(href) for row in ttrows: if isinstance(row, lxml.html.HtmlComment): continue # ignore HTML comments, no text_content() sponsors = row.text_content().strip() sinf = re.match( "(?i)introduced by( (rep\.|sen\.))? (?P<sponsors>.*)", sponsors) if sinf: sponsors = sinf.groupdict() for sponsor in [ x.strip() for x in sponsors['sponsors'].split(",") ]: bill.add_sponsor('primary', sponsor) dt = None oldchamber = 'other' for row in table.xpath(".//tr"): if row.text_content().strip() == '': continue if "Meeting Description" in [ x.strip() for x in row.xpath(".//th/text()") ]: continue row = row.xpath("./*") row = [x.text_content().strip() for x in row] if len(row) > 3: row = row[:3] date, chamber, action = row try: chamber = {"House": "lower", "Senate": "upper"}[chamber] oldchamber = chamber except KeyError: chamber = oldchamber if date != '': dt = datetime.strptime("%s %s" % (date, self.year), "%m/%d %Y") kwargs = self.categorizer.categorize(action) bill.add_action(chamber, action, dt, **kwargs) version_url = page.xpath("//a[contains(text(), 'Versions')]") if len(version_url) == 1: href = version_url[0].attrib['href'] bill = self.scrape_versions(bill, href) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url): sidebar = lxml.html.fromstring(self.urlopen(url)) try: hist_url = get_popup_url( sidebar.xpath("//a[contains(., 'Bill History')]")[0]) except IndexError: # where is it? return page = lxml.html.fromstring(self.urlopen(hist_url)) page.make_links_absolute(hist_url) title = page.xpath("string(//table[2]/tr[4])").strip() if title == '': self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url) return if title.lower().startswith("in"): title = page.xpath("string(//table[2]/tr[3])").strip() if 'HR' in bill_id or 'SR' in bill_id: bill_type = ['resolution'] elif 'HJR' in bill_id or 'SJR' in bill_id: bill_type = ['joint resolution'] elif 'HCR' in bill_id or 'SCR' in bill_id: bill_type = ['concurrent resolution'] else: bill_type = ['bill'] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(hist_url) # get pieces of version_link vpieces = sidebar.xpath('//a[contains(string(.), "HTML")]/@href') if vpieces: version_base, version_type, version_end = vpieces[0].rsplit('/', 2) versions = [ o.strip() for o in sidebar.xpath("//select[@name='BVer']/option/text()") ] # if there are no options, put version_type in one if not versions: versions = [version_type] for version_name in versions: version_url = '/'.join( (version_base, version_name, version_end)) bill.add_version(version_name, version_url, mimetype='text/html') else: bill.add_version( 'Introduced', sidebar.xpath('//a[contains(string(.), "PDF")]/@href')[0], mimetype='application/pdf') sponsors = page.xpath("string(//table[2]/tr[3])").strip() sponsor_re = r'[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)' for sponsor in re.findall(sponsor_re, sponsors): sponsor = sponsor.replace(' and', '').strip(' .,') # a few sponsors get mangled by our regex sponsor = { 'Means': 'Ways & Means', 'Iowa': 'Economic Growth/Rebuild Iowa', 'Safety': 'Public Safety', 'Resources': 'Human Resources', 'Affairs': 'Veterans Affairs', 'Protection': 'Environmental Protection', 'Government': 'State Government', 'Boef': 'De Boef' }.get(sponsor, sponsor) bill.add_sponsor('primary', sponsor) for tr in page.xpath("//table[3]/tr"): date = tr.xpath("string(td[contains(text(), ', 20')])").strip() if date.startswith("***"): continue elif "No history is recorded at this time." in date: return if date == "": continue date = datetime.datetime.strptime(date, "%B %d, %Y").date() action = tr.xpath("string(td[2])").strip() action = re.sub(r'\s+', ' ', action) # Capture any amendment links. version_urls = set(version['url'] for version in bill['versions']) if 'amendment' in action.lower(): for anchor in tr.xpath('td[2]/a'): if '-' in anchor.text: url = anchor.attrib['href'] if url not in version_urls: bill.add_version(anchor.text, url, mimetype='text/html') version_urls.add(url) if 'S.J.' in action or 'SCS' in action: actor = 'upper' elif 'H.J.' 
in action or 'HCS' in action: actor = 'lower' else: actor = "other" action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip() if action.startswith('Introduced'): atype = ['bill:introduced'] if ', referred to' in action: atype.append('committee:referred') elif action.startswith('Read first time'): atype = 'bill:reading:1' elif action.startswith('Referred to'): atype = 'committee:referred' elif action.startswith('Sent to Governor'): atype = 'governor:received' elif action.startswith('Reported Signed by Governor'): atype = 'governor:signed' elif action.startswith('Signed by Governor'): atype = 'governor:signed' elif action.startswith('Vetoed by Governor'): atype = 'governor:vetoed' elif action.startswith('Item veto'): atype = 'governor:vetoed:line-item' elif re.match(r'Passed (House|Senate)', action): atype = 'bill:passed' elif re.match(r'Amendment (S|H)-\d+ filed', action): atype = ['amendment:introduced'] if ', adopted' in action: atype.append('amendment:passed') elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted', action): atype = 'amendment:passed' elif re.match('Amendment (S|N)-\d+ lost', action): atype = 'amendment:failed' elif action.startswith('Resolution filed'): atype = 'bill:introduced' elif action.startswith('Resolution adopted'): atype = 'bill:passed' elif (action.startswith('Committee report') and action.endswith('passage.')): atype = 'committee:passed' elif action.startswith('Withdrawn'): atype = 'bill:withdrawn' else: atype = 'other' if action.strip() == "": continue bill.add_action(actor, action, date, type=atype) bill['subjects'] = self._subjects[bill_id] self.save_bill(bill)
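# The Iowa scraper above rebuilds one URL per bill version by splitting a
# sample "HTML" link into (base, version-name, filename) and substituting each
# option from the "BVer" <select>. A standalone sketch of that reconstruction;
# the sample URL and version names below are invented, not real Iowa paths.
def version_urls(sample_href, version_names):
    base, _default_version, tail = sample_href.rsplit('/', 2)
    return ['/'.join((base, name, tail)) for name in version_names]

print(version_urls(
    'https://www.example.gov/legislation/BillBook/Introduced/HF123.html',
    ['Introduced', 'Enrolled']))
# ['https://www.example.gov/legislation/BillBook/Introduced/HF123.html',
#  'https://www.example.gov/legislation/BillBook/Enrolled/HF123.html']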
def scrape_bill(self, url, kw, re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'), re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'), re_digits=re.compile(r'\d{,5}'), actions_get_actor=actions.get_actor): bill = Bill(**kw) bill.add_source(url) #--------------------------------------------------------------------- # A few helpers. _url_2_lxml = self._url_2_lxml _cleanup_sponsors = self._cleanup_sponsors # Shortcut function partial to get text at a particular xpath: doc = _url_2_lxml(url) _get_text = partial(get_text, doc, 0) # Get session number--needed for fetching related documents (see below). xpath = '//font[contains(., "General Assembly") and @face="Arial"]' session_num = doc.xpath(xpath)[0].text_content() session_num = re_digits.match(session_num).group() #--------------------------------------------------------------------- # Sponsors chamber = bill['chamber'] sponsor_types = { 'Additional Sponsor(s):': 'cosponsor', 'CoSponsors:': 'cosponsor', 'Primary Sponsor:': 'primary' } xpath = '//font[contains(., "Sponsor") and @color="#008080"]' headings = doc.xpath(xpath + '/text()') sponsors = doc.xpath(xpath + '/../../following-sibling::td/font/text()') for h, s in zip(headings, sponsors): names = _cleanup_sponsors(s, chamber) type_ = sponsor_types[h.strip()] if names: for name, _chamber in names: bill.add_sponsor(type_, name, chamber=_chamber) #--------------------------------------------------------------------- # Versions tmp = '/'.join([ 'http://www.legis.delaware.gov', 'LIS/lis{session_num}.nsf/vwLegislation', '{moniker}/$file/{filename}{format_}?open' ]) documents = self.scrape_documents(source=url, docname="introduced", filename="Legis", tmp=tmp, session_num=session_num) for d in documents: bill.add_version(**d) # If bill is a substitution, add the original as a version. names = doc.xpath('//*[contains(text(), "Substituted ' 'Legislation for Bill:")]/text()') urls = doc.xpath('//*[contains(text(), "Substituted ' 'Legislation for Bill:")]' '/following-sibling::a/@href') for name, url in zip(names, urls): name = re_substitution.match(name).group(1) bill.add_version(name, url, description='original bill') #--------------------------------------------------------------------- # Actions actions = doc.xpath('//font[contains(., "Actions History")]' '/../following-sibling::table/descendant::td[2]') actions = actions[0].text_content() actions = filter(None, actions.splitlines()) for a in reversed(actions): date, action = a.split(' - ', 1) try: date = datetime.strptime(date, '%b %d, %Y') except ValueError: date = datetime.strptime(date, '%B %d, %Y') # XXX: ugh. 
actor = actions_get_actor(action, bill['chamber']) attrs = dict(actor=actor, action=action, date=date) attrs.update(**self.categorizer.categorize(action)) bill.add_action(**attrs) #--------------------------------------------------------------------- # Votes xpaths = [ '//*[contains(text(), "vote:")]/following-sibling::a/@href', '//font[contains(., "vote:")]/a/@href' ] for xpath in xpaths: vote_urls = doc.xpath(xpath) if vote_urls: break for url in vote_urls: vote = self.scrape_vote(url) if vote: bill.add_vote(vote) #--------------------------------------------------------------------- # Amendments xpath = ("//font[contains(., 'Amendments')]/" "../../../td[2]/font/a") tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/' 'vwLegislation/{id_}/$file/{filename}{format_}?open') for source, id_ in zip(doc.xpath(xpath + '/@href'), doc.xpath(xpath + '/text()')): match = re_amendment.match(id_) if match is None: match = re.search('/?([A-Z]A \\d{1,3}) to', id_) short_id = match.group(1) documents = self.scrape_documents(source=source, docname='amendment (%s)' % short_id, filename='Legis', tmp=tmp, session_num=session_num, id_=id_) for d in documents: bill.add_document(**d) #--------------------------------------------------------------------- # Add any related "Engrossments". # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for # an explanation of the engrossment process in DE. source = doc.xpath('//img[@alt="Engrossment"]/../@href') if source: tmp = '/'.join([ 'http://www.legis.delaware.gov', 'LIS/lis{session_num}.nsf/EngrossmentsforLookup', '{moniker}/$file/{filename}{format_}?open' ]) documents = self.scrape_documents(source=source[0], docname="Engrossment", filename="Engross", tmp=tmp, session_num=session_num, id_=bill['bill_id']) for d in documents: bill.add_version(**d) # -------------------------------------------------------------------- # Add any fiscal notes. source = doc.xpath("//img[@alt='Fiscal Note']/../@href") if source: tmp = '/'.join([ 'http://www.legis.delaware.gov', 'LIS/lis{session_num}.nsf/FiscalforLookup', '{docnum}/$file/{filename}{format_}?open' ]) documents = self.scrape_documents(source=source[0], docname="Fiscal Note", filename="Fiscal", tmp=tmp, session_num=session_num) for d in documents: bill.add_document(**d) #--------------------------------------------------------------------- # Extra fields # Helper to get the first td sibling of certain nodes. tmp = '//font[contains(., "%s")]/../../../td[2]' first_sibling_text = lambda heading: _get_text(tmp % heading) extra_fields = { # A long description of the legislation. "summary": "Synopsis", # Codification details for enacted legislation. "volume_chapter": "Volume Chapter", # Presumably the date of approval/veto. "date_governor_acted": "Date Governor Acted", "fiscal_notes": "Fiscal Notes", } for key, name in extra_fields.iteritems(): try: bill[key] = first_sibling_text(name) except IndexError: # xpath lookup failed. pass if bill['title'].strip() == "": if bill['bill_id'] != "HB 130" and bill['session'] != '147': raise Exception("bill title is empty") bill['title'] = bill['summary'] # This added to help hack around the page that's missing # the bill title. self.save_bill(bill)
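# The Delaware scraper above builds document URLs from templates with named
# placeholders ({session_num}, {moniker}, {filename}, {format_}) that get
# filled later inside scrape_documents(). A minimal illustration of that
# deferred str.format() fill; the values plugged in below are invented ids.
tmp = '/'.join([
    'http://www.legis.delaware.gov',
    'LIS/lis{session_num}.nsf/vwLegislation',
    '{moniker}/$file/{filename}{format_}?open',
])

print(tmp.format(session_num='146', moniker='HB+1', filename='Legis', format_='.html'))
# http://www.legis.delaware.gov/LIS/lis146.nsf/vwLegislation/HB+1/$file/Legis.html?open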
def bill_info(self, bill_link, session, main_url, bill_page): bill_page = lxml.html.fromstring(bill_page) #basic info try: long_title = bill_page.xpath( '//div[@id="content_text"]/h2')[0].text.split() except IndexError: return None bill_id = long_title[0] title = '' for x in range(2, len(long_title)): title += long_title[x] + ' ' title = title[0:-1] if not title: self.error('no title, skipping %s', bill_id) return #bill_type bill_type = 'resolution' if 'LR' in bill_id else 'bill' bill = Bill(session, 'upper', bill_id, title, type=bill_type) #sources bill.add_source(main_url) bill.add_source(bill_link) #Sponsor introduced_by = bill_page.xpath( '//div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text bill.add_sponsor('primary', introduced_by) #actions for actions in bill_page.xpath( '//div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'): date = actions[0].text if 'Date' not in date: date = datetime.strptime(date, '%b %d, %Y') action = actions[1].text_content() if 'Governor' in action: actor = 'Governor' elif 'Speaker' in action: actor = 'Speaker' else: actor = 'upper' action_type = self.action_types(action) bill.add_action(actor, action, date, action_type) # were in reverse chronological order bill['actions'].reverse() #versions for versions in bill_page.xpath( '//div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'): version_url = versions.attrib['href'] version_url = 'http://nebraskalegislature.gov/' + version_url[ 3:len(version_url)] version_name = versions.text # replace Current w/ session number version_url = version_url.replace('Current', session) bill.add_version(version_name, version_url, mimetype='application/pdf') #documents # this appear to be same as versions, dropped for now #for additional_info in bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td/a'): # document_name = additional_info.text # document_url = additional_info.attrib['href'] # document_url = 'http://nebraskalegislature.gov/' + document_url[3:len(document_url)] # if '.pdf' in document_url: # bill.add_document(document_name, document_url) #amendments for admendments in bill_page.xpath( '//div[@id="content_text"]/div[3]/table/tr[1]/td[2]/table/tr/td/a' ): admendment_name = admendments.text admendment_url = admendments.attrib['href'] admendment_url = 'http://nebraskalegislature.gov/' + admendment_url[ 3:len(admendment_url)] bill.add_document(admendment_name, admendment_url) #related transcripts for transcripts in bill_page.xpath( '//div[@id="content_text"]/div[3]/table/tr[2]/td[2]/a'): transcript_name = transcripts.text transcript_url = transcripts.attrib['href'] bill.add_document(transcript_name, transcript_url) self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, title, url, strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub): html = self.get(url).text page = lxml.html.fromstring(html) page.make_links_absolute(url) bill_type = self.bill_types[bill_id.split()[0][1:]] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) xpath = ('//strong[contains(., "SUBJECT")]/../' 'following-sibling::td/a/text()') bill['subjects'] = page.xpath(xpath) for version in self.scrape_versions(session, chamber, page, bill_id): bill.add_version(**version) # Resolution pages have different html. values = {} trs = page.xpath('//div[@id="bhistcontent"]/table/tr') for tr in trs: heading = tr.xpath('td/strong/text()') if heading: heading = heading[0] else: continue value = tr.text_content().replace(heading, '').strip() values[heading] = value # summary was always same as title #bill['summary'] = values['SUMMARY:'] # Add primary sponsor. primary = strip_sponsors('', values.get('LEAD SPONSOR:', '')) if primary: bill.add_sponsor('primary', primary) # Add cosponsors. if values.get('SPONSORS:'): sponsors = strip_sponsors('', values['SPONSORS:']) sponsors = re.split(', (?![A-Z]\.)', sponsors) for name in sponsors: name = name.strip(', \n\r') if name: # Fix name splitting bug where "Neale, D. Hall" match = re.search('(.+?), ([DM]\. Hall)', name) if match: for name in match.groups(): bill.add_sponsor('cosponsor', name) else: bill.add_sponsor('cosponsor', name) for link in page.xpath("//a[contains(@href, 'votes/house')]"): self.scrape_house_vote(bill, link.attrib['href']) for tr in reversed( page.xpath("//table[@class='tabborder']/descendant::tr")[1:]): tds = tr.xpath('td') if len(tds) < 3: continue chamber_letter = tds[0].text_content() chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter] # Index of date info no longer varies on resolutions. date = tds[2].text_content().strip() date = datetime.datetime.strptime(date, "%m/%d/%y").date() action = tds[1].text_content().strip() if action.lower().startswith('passed senate'): for href in tds[1].xpath('a/@href'): self.scrape_senate_vote(bill, href, date) attrs = dict(actor=chamber, action=action, date=date) attrs.update(self.categorizer.categorize(action)) bill.add_action(**attrs) self.save_bill(bill)
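# The scraper above splits the SPONSORS string on commas with a negative
# lookahead, re.split(', (?![A-Z]\.)', ...), so that a middle initial such as
# "Hall, D." is not broken into two sponsors. A self-contained run of that
# split on an invented sponsor string.
import re

sponsors = 'Smith, Hall, D., Jones, Neale'
print(re.split(r', (?![A-Z]\.)', sponsors))
# ['Smith', 'Hall, D.', 'Jones', 'Neale']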
def scrape(self, session, chambers): api_base_url = "https://api.iga.in.gov" proxy = {"url":"http://in.proxy.openstates.org"} #ah, indiana. it's really, really hard to find #pdfs in their web interface. Super easy with #the api, but a key needs to be passed #in the headers. To make these documents #viewable to the public and our scrapers, #sunlight's put up a proxy service at this link #using our api key for pdf document access. client = ApiClient(self) r = client.get("bills",session=session) all_pages = client.unpaginate(r) for b in all_pages: bill_id = b["billName"] for idx,char in enumerate(bill_id): try: int(char) except ValueError: continue disp_bill_id = bill_id[:idx]+" "+str(int(bill_id[idx:])) break bill_link = b["link"] api_source = api_base_url + bill_link bill_json = client.get("bill",session=session,bill_id=bill_id.lower()) title = bill_json["title"] if title == "NoneNone": title = None #sometimes title is blank #if that's the case, we can check to see if #the latest version has a short description if not title: title = bill_json["latestVersion"]["shortDescription"] #and if that doesn't work, use the bill_id but throw a warning if not title: title = bill_id self.logger.warning("Bill is missing a title, using bill id instead.") original_chamber = "lower" if bill_json["originChamber"].lower() == "house" else "upper" bill = Bill(session,original_chamber,disp_bill_id,title) bill.add_source(self.make_html_source(session,bill_id)) bill.add_source(api_source) #sponsors positions = {"Representative":"lower","Senator":"upper"} for s in bill_json["authors"]: bill.add_sponsor("primary", self.get_name(s), chamber=positions[s["position_title"]], official_type="author") for s in bill_json["coauthors"]: bill.add_sponsor("cosponsor", self.get_name(s), chamber=positions[s["position_title"]], official_type="coauthor") for s in bill_json["sponsors"]: bill.add_sponsor("primary", self.get_name(s), chamber=positions[s["position_title"]], official_type="sponsor") for s in bill_json["cosponsors"]: bill.add_sponsor("cosponsor", self.get_name(s), chamber=positions[s["position_title"]], official_type="cosponsor") #actions action_link = bill_json["actions"]["link"] api_source = api_base_url + action_link try: actions = client.get("bill_actions",session=session,bill_id=bill_id.lower()) except scrapelib.HTTPError: self.logger.warning("Could not find bill actions page") actions = {"items":[]} for a in actions["items"]: action_desc = a["description"] if "governor" in action_desc.lower(): action_chamber = "executive" elif a["chamber"]["name"].lower() == "house": action_chamber = "lower" else: action_chamber = "upper" date = a["date"] if not date: self.logger.warning("Action has no date, skipping") continue date = datetime.datetime.strptime(date,"%Y-%m-%dT%H:%M:%S") action_type = [] d = action_desc.lower() committee = None reading = False if "first reading" in d: action_type.append("bill:reading:1") reading = True if ("second reading" in d or "reread second time" in d): action_type.append("bill:reading:2") reading = True if ("third reading" in d or "reread third time" in d): action_type.append("bill:reading:3") if "passed" in d: action_type.append("bill:passed") if "failed" in d: action_type.append("bill:failed") reading = True if "adopted" in d and reading: action_type.append("bill:passed") if ("referred" in d and "committee on" in d or "reassigned" in d and "committee on" in d): committee = d.split("committee on")[-1].strip() action_type.append("committee:referred") if "committee report" in d: if "pass" in d: 
action_type.append("committee:passed") if "fail" in d: action_type.append("committee:failed") if "amendment" in d and "without amendment" not in d: if "pass" in d or "prevail" in d or "adopted" in d: action_type.append("amendment:passed") if "fail" or "out of order" in d: action_type.append("amendment:failed") if "withdraw" in d: action_type.append("amendment:withdrawn") if "signed by the governor" in d: action_type.append("governor:signed") if ("not substituted for majority report" in d or "returned to the house" in d or "referred to the senate" in d or "referred to the house" in d or "technical corrections" in d or "signed by the president" in d or "signed by the speaker" or "authored" in d or "sponsor" in d or "coauthor" in d or ("rule" in d and "suspended" in d) or "removed as author" in d or ("added as" in d and "author" in d) or "public law" in d): if len(action_type) == 0: action_type.append("other") if len(action_type) == 0: #calling it other and moving on with a warning self.logger.warning("Could not recognize an action in '{}'".format(action_desc)) action_type = ["other"] elif committee: bill.add_action(action_chamber,action_desc,date,type=action_type,committees=committee) else: bill.add_action(action_chamber,action_desc,date,type=action_type) #subjects subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]] bill["subjects"] = subjects #versions and votes for version in bill_json["versions"][::-1]: version_json = client.get("bill_version", session=session, bill_id=version["billName"], version_id=version["printVersionName"]) self.deal_with_version(version_json,bill,proxy) self.save_bill(bill)
def bill_info(self, bill_link, session, main_url, bill_page): bill_page = lxml.html.fromstring(bill_page) bill_page.make_links_absolute(bill_link) #basic info try: long_title = bill_page.xpath('//div[@id="content_text"]/h2')[0].text.split() except IndexError: return None bill_id = long_title[0] title = '' for x in range(2, len(long_title)): title += long_title[x] + ' ' title = title[0:-1] if not title: self.error('no title, skipping %s', bill_id) return #bill_type bill_type = 'resolution' if 'LR' in bill_id else 'bill' bill = Bill(session, 'upper', bill_id, title, type = bill_type) #sources bill.add_source(main_url) bill.add_source(bill_link) #Sponsor introduced_by = bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text bill.add_sponsor('primary', introduced_by) #actions for actions in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'): date = actions[0].text if 'Date' not in date: date = datetime.strptime(date, '%b %d, %Y') action = actions[1].text_content() if 'Governor' in action: actor = 'Governor' elif 'Speaker' in action: actor = 'Speaker' else: actor = 'upper' action_type = self.action_types(action) bill.add_action(actor, action, date, action_type) # were in reverse chronological order bill['actions'].reverse() #versions for version in bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'): version_url = version.attrib['href'] version_name = version.text # replace Current w/ session number version_url = version_url.replace('Current', session) bill.add_version(version_name, version_url, mimetype='application/pdf') #amendments for amendment in bill_page.xpath('//h2[text()="Amendments"]/following-sibling::table[1]//a'): amendment_name = amendment.text amendment_url = amendment.attrib['href'] bill.add_document(amendment_name, amendment_url) #related transcripts for transcripts in bill_page.xpath('//h2[text()="Related Transcripts"]/following-sibling::table[1]//a'): transcript_name = transcripts.text transcript_url = transcripts.attrib['href'] bill.add_document(transcript_name, transcript_url) self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, title, url, strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub): html = self.urlopen(url) page = lxml.html.fromstring(html) page.make_links_absolute(url) bill_type = self.bill_types[bill_id.split()[0][1:]] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) for version in self.scrape_versions(session, chamber, page, bill_id): bill.add_version(**version) # Resolution pages have different html. values = {} trs = page.xpath('//div[@id="bhistcontent"]/table/tr') for tr in trs: heading = tr.xpath('td/strong/text()') if heading: heading = heading[0] else: continue value = tr.text_content().replace(heading, '').strip() values[heading] = value # summary was always same as title #bill['summary'] = values['SUMMARY:'] # Add primary sponsor. primary = strip_sponsors('', values['LEAD SPONSOR:']) if primary: bill.add_sponsor('primary', primary) # Add cosponsors. sponsors = strip_sponsors('', values['SPONSORS:']).split('\r\n') for name in sponsors: name = name.strip(', ') if name: bill.add_sponsor('cosponsor', name) for link in page.xpath("//a[contains(@href, 'votes/house')]"): self.scrape_vote(bill, link.attrib['href']) actor = chamber for tr in reversed( page.xpath("//table[@class='tabborder']/descendant::tr")[1:]): tds = tr.xpath('td') if len(tds) < 3: continue # Index of date info no longer varies on resolutions. date = tds[2].text_content().strip() date = datetime.datetime.strptime(date, "%m/%d/%y").date() action = tds[1].text_content().strip() if (action == 'Communicated to Senate' or action.startswith('Senate received') or action.startswith('Ordered to Senate')): actor = 'upper' elif (action == 'Communicated to House' or action.startswith('House received') or action.startswith('Ordered to House')): actor = 'lower' if action == 'Read 1st time': atype = 'bill:reading:1' elif action == 'Read 2nd time': atype = 'bill:reading:2' elif action == 'Read 3rd time': atype = 'bill:reading:3' elif action == 'Filed for introduction': atype = 'bill:filed' elif action.startswith('To Governor') and 'Journal' not in action: atype = 'governor:received' elif re.match(r'To [A-Z]', action): atype = 'committee:referred' elif action.startswith('Introduced in'): atype = 'bill:introduced' elif (action.startswith('Approved by Governor') and 'Journal' not in action): atype = 'governor:signed' elif (action.startswith('Passed Senate') or action.startswith('Passed House')): atype = 'bill:passed' elif (action.startswith('Reported do pass') or action.startswith('With amendment, do pass')): atype = 'committee:passed' else: atype = 'other' bill.add_action(actor, action, date, type=atype) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, title, url): page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) if re.match(r'^(S|H)B ', bill_id): btype = ['bill'] elif re.match(r'(S|H)C ', bill_id): btype = ['commemoration'] elif re.match(r'(S|H)JR ', bill_id): btype = ['joint resolution'] elif re.match(r'(S|H)CR ', bill_id): btype = ['concurrent resolution'] else: btype = ['bill'] bill = Bill(session, chamber, bill_id, title, type=btype) bill.add_source(url) regex_ns = "http://exslt.org/regular-expressions" version_links = page.xpath( "//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]", namespaces={'re': regex_ns}) for link in version_links: bill.add_version(link.xpath('string()').strip(), link.attrib['href'], mimetype='text/html') sponsor_links = page.xpath("//td[contains(@id, 'tdSponsors')]/a") for link in sponsor_links: bill.add_sponsor("primary", link.text) actor = chamber use_row = False self.debug(bill_id) for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"): if 'Date' in row.text_content() and 'Action' in row.text_content(): use_row = True continue elif not use_row: continue action = row.xpath("string(td[2])").strip() atypes = [] if action.startswith('First read'): atypes.append('bill:introduced') atypes.append('bill:reading:1') elif action.startswith('Signed by Governor'): atypes.append('governor:signed') actor = 'executive' match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)', action) if match: if match.group(1) in ['Senate', 'House of Representatives']: first = 'bill' else: first = 'committee' atypes.append("%s:%s" % (first, match.group(3).lower())) if 'referred to' in action.lower(): atypes.append('committee:referred') if 'Motion to amend, Passed Amendment' in action: atypes.append('amendment:introduced') atypes.append('amendment:passed') if 'Veto override, Passed' in action: atypes.append('bill:veto_override:passed') elif 'Veto override, Failed' in action: atypes.append('bill:veto_override:failed') if 'Delivered to the Governor' in action: atypes.append('governor:received') match = re.match("First read in (Senate|House)", action) if match: if match.group(1) == 'Senate': actor = 'upper' else: actor = 'lower' date = row.xpath("string(td[1])").strip() match = re.match('\d{2}/\d{2}/\d{4}', date) if not match: self.warning("Bad date: %s" % date) continue date = datetime.datetime.strptime(date, "%m/%d/%Y").date() for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"): self.scrape_vote(bill, date, link.attrib['href']) bill.add_action(actor, action, date, type=atypes) subjects = [] for link in page.xpath("//a[contains(@href, 'Keyword')]"): subjects.append(link.text.strip()) bill['subjects'] = subjects self.save_bill(bill)
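# The scraper above selects version links with an EXSLT regular expression
# inside XPath (re:test), which lxml supports once the EXSLT namespace is
# passed in. A tiny self-contained demonstration against inline HTML.
import lxml.html

REGEX_NS = 'http://exslt.org/regular-expressions'
doc = lxml.html.fromstring(
    '<p><a href="Bill.aspx?File=HB1001P.htm">bill</a>'
    '<a href="Other.aspx?x=1">other</a></p>')

links = doc.xpath("//a[re:test(@href, 'Bill\\.aspx\\?File=.*\\.htm', 'i')]",
                  namespaces={'re': REGEX_NS})
print([a.get('href') for a in links])   # only the Bill.aspx link matches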
def scrape_bill(self, chamber, session, bill_id, title, sponsor, url): with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) bill = Bill(session, chamber, bill_id, title) bill.add_source(url) bill.add_sponsor('introducer', sponsor) try: hist_table = page.xpath( "//div[@id = 'tabBodyBillHistory']/table")[0] for tr in hist_table.xpath("tbody/tr"): date = tr.xpath("string(td[1])") date = datetime.datetime.strptime(date, "%m/%d/%Y").date() actor = tr.xpath("string(td[2])") actor = { 'Senate': 'upper', 'House': 'lower' }.get(actor, actor) act_text = tr.xpath("string(td[3])").strip() for action in act_text.split(u'\u2022'): action = action.strip() if not action: continue atype = [] if action.startswith('Referred to'): atype.append('committee:referred') elif action.startswith('Favorable by'): atype.append('committee:passed') elif action == "Filed": atype.append("bill:filed") elif action.startswith("Withdrawn"): atype.append("bill:failed") bill.add_action(actor, action, date, type=atype) except IndexError: self.log("No bill history for %s" % bill_id) try: version_table = page.xpath( "//div[@id = 'tabBodyBillText']/table")[0] for tr in version_table.xpath("tbody/tr"): name = tr.xpath("string(td[1])").strip() url = tr.xpath("td/a[1]")[0].attrib['href'] bill.add_version(name, url) except IndexError: self.log("No version table for %s" % bill_id) try: analysis_table = page.xpath( "//div[@id = 'tabBodyStaffAnalysis']/table")[0] for tr in analysis_table.xpath("tbody/tr"): name = tr.xpath("string(td[1])").strip() name += " -- " + tr.xpath("string(td[3])").strip() date = tr.xpath("string(td[4])").strip() if date: name += " (%s)" % date url = tr.xpath("td/a")[0].attrib['href'] bill.add_document(name, url) except IndexError: self.log("No analysis table for %s" % bill_id) self.save_bill(bill)
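Two details of the history parsing above are easy to miss: a single cell can hold several actions separated by bullet characters, and the actor lookup falls back to the raw string so committee names pass through unchanged. A sketch with invented values:

# several actions share one cell, separated by the bullet character
act_text = u'Referred to Judiciary \u2022 Favorable by Judiciary \u2022 Filed'
for action in act_text.split(u'\u2022'):
    action = action.strip()
    if action:
        print(action)

# chamber names map to upper/lower; anything else is kept as-is
for raw_actor in ('Senate', 'House', 'Appropriations Committee'):
    print({'Senate': 'upper', 'House': 'lower'}.get(raw_actor, raw_actor))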
def scrape_bill_type(self, chamber, session, bill_type, type_abbr): if chamber == 'upper': chamber_name = 'SENATE' else: chamber_name = 'ASSEMBLY' bills = self.session.query(CABill).filter_by( session_year=session).filter_by( measure_type=type_abbr) for bill in bills: bill_session = session if bill.session_num != '0': bill_session += ' Special Session %s' % bill.session_num bill_id = bill.short_bill_id fsbill = Bill(bill_session, chamber, bill_id, '') # Construct session for web query, going from '20092010' to '0910' source_session = session[2:4] + session[6:8] # Turn 'AB 10' into 'ab_10' source_num = "%s_%s" % (bill.measure_type.lower(), bill.measure_num) # Construct a fake source url source_url = ("http://www.leginfo.ca.gov/cgi-bin/postquery?" "bill_number=%s&sess=%s" % (source_num, source_session)) fsbill.add_source(source_url) scraped_versions = self.scrape_site_versions(bill, source_url) title = '' short_title = '' type = ['bill'] subject = '' all_titles = set() i = 0 for version in bill.versions: if not version.bill_xml: continue title = clean_title(version.title) all_titles.add(title) short_title = clean_title(version.short_title) type = [bill_type] if version.appropriation == 'Yes': type.append('appropriation') if version.fiscal_committee == 'Yes': type.append('fiscal committee') if version.local_program == 'Yes': type.append('local program') if version.urgency == 'Yes': type.append('urgency') if version.taxlevy == 'Yes': type.append('tax levy') if version.subject: subject = clean_title(version.subject) date = version.bill_version_action_date.date() url = '' try: scraped_version = scraped_versions[i] if scraped_version[0] == date: url = scraped_version[1] i += 1 except IndexError: pass fsbill.add_version( version.bill_version_id, url, date=date, title=title, short_title=short_title, subject=[subject], type=type) if not title: self.warning("Couldn't find title for %s, skipping" % bill_id) continue fsbill['title'] = title fsbill['short_title'] = short_title fsbill['type'] = type fsbill['subjects'] = [subject] # We don't want the current title in alternate_titles all_titles.remove(title) fsbill['alternate_titles'] = list(all_titles) for author in version.authors: if author.house == chamber_name: fsbill.add_sponsor(author.contribution, author.name) introduced = False for action in bill.actions: if not action.action: # NULL action text seems to be an error on CA's part, # unless it has some meaning I'm missing continue actor = action.actor or chamber actor = actor.strip() match = re.match(r'(Assembly|Senate)($| \(Floor)', actor) if match: actor = {'Assembly': 'lower', 'Senate': 'upper'}[match.group(1)] elif actor.startswith('Governor'): actor = 'executive' else: actor = re.sub('^Assembly', 'lower', actor) actor = re.sub('^Senate', 'upper', actor) type = [] act_str = action.action act_str = re.sub(r'\s+', ' ', act_str) if act_str.startswith('Introduced'): introduced = True type.append('bill:introduced') if 'Read first time.' in act_str: if not introduced: type.append('bill:introduced') introduced = True type.append('bill:reading:1') if 'To Com' in act_str or 'referred to' in act_str.lower(): type.append('committee:referred') if 'Read third time. Passed.' 
in act_str: type.append('bill:passed') if 'Approved by Governor' in act_str: type.append('governor:signed') if 'Item veto' in act_str: type.append('governor:vetoed:line-item') if 'Vetoed by Governor' in act_str: type.append('governor:vetoed') if 'To Governor' in act_str: type.append('governor:received') if 'Read second time' in act_str: type.append('bill:reading:2') if not type: type = ['other'] fsbill.add_action(actor, act_str, action.action_date.date(), type=type) for vote in bill.votes: if vote.vote_result == '(PASS)': result = True else: result = False full_loc = vote.location.description first_part = full_loc.split(' ')[0].lower() if first_part in ['asm', 'assembly']: vote_chamber = 'lower' vote_location = ' '.join(full_loc.split(' ')[1:]) elif first_part.startswith('sen'): vote_chamber = 'upper' vote_location = ' '.join(full_loc.split(' ')[1:]) else: raise ScrapeError("Bad location: %s" % full_loc) motion = vote.motion.motion_text or '' if "Third Reading" in motion or "3rd Reading" in motion: vtype = 'passage' elif "Do Pass" in motion: vtype = 'passage' else: vtype = 'other' motion = motion.strip() # Why did it take until 2.7 to get a flags argument on re.sub? motion = re.compile(r'(\w+)( Extraordinary)? Session$', re.IGNORECASE).sub('', motion) motion = re.compile(r'^(Senate|Assembly) ', re.IGNORECASE).sub('', motion) motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ', '', motion) motion = re.sub(r' \(\w+\)$', '', motion) motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '', motion) motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ' r'Urgency Clause$', '(Urgency Clause)', motion) motion = re.sub(r'\s+', ' ', motion) if not motion: self.warning("Got blank motion on vote for %s" % bill_id) continue fsvote = Vote(vote_chamber, self._tz.localize(vote.vote_date_time), motion, result, int(vote.ayes), int(vote.noes), int(vote.abstain), threshold=vote.threshold, type=vtype) if vote_location != 'Floor': fsvote['committee'] = vote_location for record in vote.votes: if record.vote_code == 'AYE': fsvote.yes(record.legislator_name) elif record.vote_code.startswith('NO'): fsvote.no(record.legislator_name) else: fsvote.other(record.legislator_name) # The abstain count field in CA's database includes # vacancies, which we aren't interested in. fsvote['other_count'] = len(fsvote['other_votes']) fsbill.add_vote(fsvote) self.save_bill(fsbill)
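The aside in the motion cleanup above refers to re.sub() only gaining a flags argument in Python 2.7; compiling the pattern with its flags first is the backwards-compatible spelling. A sketch with an invented motion string:

import re

motion = 'Assembly Third Reading AB 123 Smith Second Extraordinary Session'
# equivalent to re.sub(pattern, '', motion, flags=re.IGNORECASE) on 2.7+
motion = re.compile(r'(\w+)( Extraordinary)? Session$', re.IGNORECASE).sub('', motion)
motion = re.compile(r'^(Senate|Assembly) ', re.IGNORECASE).sub('', motion)
print(motion.strip())   # 'Third Reading AB 123 Smith'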
def scrape(self, chamber, session): if int(session) < 128: raise NoDataForPeriod(session) base_url = 'http://www.lsc.state.oh.us/status%s/' % session bill_types = {'lower': [('hb','bill'), ('hjr','joint resolution'), ('hcr','concurrent resolution')], 'upper': [('sb','bill'), ('sjr','joint resolution'), ('scr','concurrent resolution')]} for bill_prefix, bill_type in bill_types[chamber]: url = base_url + '%s.xlsx' % bill_prefix try: fname, resp = self.urlretrieve(url) except scrapelib.HTTPError: # if there haven't yet been any bills of a given type # then the excel url for that type will 404 continue sh = xlrd.open_workbook(fname).sheet_by_index(0) # once workbook is open, we can remove tempfile os.remove(fname) for rownum in range(1, sh.nrows): bill_id = '%s %s' % (bill_prefix.upper(), rownum) bill_title = str(sh.cell(rownum, 3).value) bill = Bill(session, chamber, bill_id, bill_title, type=bill_type) bill.add_source(url) bill.add_sponsor('primary', str(sh.cell(rownum, 1).value)) # add cosponsor if sh.cell(rownum, 2).value: bill.add_sponsor('cosponsor', str(sh.cell(rownum, 2).value)) actor = "" # Actions start column after bill title for colnum in range(4, sh.ncols - 1): action = str(sh.cell(0, colnum).value) cell = sh.cell(rownum, colnum) date = cell.value if len(action) != 0: if action.split()[0] == 'House': actor = "lower" elif action.split()[0] == 'Senate': actor = "upper" elif action.split()[-1] == 'Governor': actor = "executive" elif action.split()[0] == 'Gov.': actor = "executive" elif action.split()[-1] == 'Gov.': actor = "executive" if action in ('House Intro. Date', 'Senate Intro. Date'): atype = ['bill:introduced'] action = action.replace('Intro. Date', 'Introduced') elif action == '3rd Consideration': atype = ['bill:reading:3', 'bill:passed'] elif action == 'Sent to Gov.': atype = ['governor:received'] elif action == 'Signed By Governor': atype = ['governor:signed'] else: atype = ['other'] if type(date) == float: date = str(xlrd.xldate_as_tuple(date, 0)) date = datetime.datetime.strptime( date, "(%Y, %m, %d, %H, %M, %S)") bill.add_action(actor, action, date, type=atype) self.scrape_votes(bill, bill_prefix, rownum, session) self.scrape_versions(bill, bill_prefix, rownum, session) self.save_bill(bill)
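xlrd hands back Excel dates as floats, which is why the loop above round-trips the cell through str() and strptime(). xlrd.xldate_as_tuple() converts the serial number directly given the workbook's datemode (0 for the 1900-based system); a more direct sketch with an arbitrary serial value:

import datetime
import xlrd

cell_value = 40179.0   # arbitrary example serial date
year, month, day, hour, minute, second = xlrd.xldate_as_tuple(cell_value, 0)
print(datetime.datetime(year, month, day, hour, minute, second))   # 2010-01-01 00:00:00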
def scrape_details(self, bill_detail_url, session, chamber, bill_id): page = self.urlopen(bill_detail_url) if 'INVALID BILL NUMBER' in page: self.warning('INVALID BILL %s' % bill_detail_url) return doc = lxml.html.fromstring(page) doc.make_links_absolute(bill_detail_url) bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0] bill_type = bill_div.xpath('span/text()')[0] if 'General Bill' in bill_type: bill_type = 'bill' elif 'Concurrent Resolution' in bill_type: bill_type = 'concurrent resolution' elif 'Joint Resolution' in bill_type: bill_type = 'joint resolution' elif 'Resolution' in bill_type: bill_type = 'resolution' else: raise ValueError('unknown bill type: %s' % bill_type) # this is fragile, but less fragile than it was b = bill_div.xpath('./b[text()="Summary:"]')[0] bill_summary = b.getnext().tail.strip() bill = Bill(session, chamber, bill_id, bill_summary, type=bill_type) bill['subjects'] = list(self._subjects[bill_id]) # sponsors for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'): bill.add_sponsor('primary', sponsor) for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'): sponsor = sponsor.replace(u'\xa0', ' ').strip() bill.add_sponsor('primary', sponsor) # find versions version_url = doc.xpath('//a[text()="View full text"]/@href')[0] version_html = self.urlopen(version_url) version_doc = lxml.html.fromstring(version_html) version_doc.make_links_absolute(version_url) for version in version_doc.xpath('//a[contains(@href, "/prever/")]'): # duplicate versions with same date, use first appearance bill.add_version(version.text, version.get('href'), on_duplicate='use_old', mimetype='text/html') # actions for row in bill_div.xpath('table/tr'): date_td, chamber_td, action_td = row.xpath('td') date = datetime.datetime.strptime(date_td.text, "%m/%d/%y") action_chamber = {'Senate':'upper', 'House':'lower', None: 'other'}[chamber_td.text] action = action_td.text_content() action = action.split('(House Journal')[0] action = action.split('(Senate Journal')[0].strip() atype = action_type(action) bill.add_action(action_chamber, action, date, atype) # votes vurl = doc.xpath('//a[text()="View Vote History"]/@href') if vurl: vurl = vurl[0] self.scrape_vote_history(bill, vurl) bill.add_source(bill_detail_url) self.save_bill(bill)
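action_type() above is a helper defined elsewhere in the scraper module; a plausible sketch of such a categorizer, with illustrative rules rather than the real mapping:

def action_type(action):
    # map a raw action string to billy action types (rules are illustrative)
    lowered = action.lower()
    if lowered.startswith('introduced'):
        return ['bill:introduced']
    if 'referred to committee' in lowered:
        return ['committee:referred']
    if 'signed by governor' in lowered:
        return ['governor:signed']
    return ['other']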
def scrape_pre_2009_bill(self, chamber, session, bill_id, short_title=''): """bills from 2008 and below are in a 'pre' element and it is simpler to parse them as text""" url = 'http://legislature.idaho.gov/legislation/%s/%s.html' % ( session, bill_id.replace(' ', '')) bill_page = self.urlopen(url) html = lxml.html.fromstring(bill_page) text = html.xpath('//pre')[0].text.split('\r\n') # title title = " - ".join( [x.strip() for x in text[1].split('-') if x.isupper()]) # bill type bill_type = get_bill_type(bill_id) bill = Bill(session, chamber, bill_id, title, type=bill_type) # sponsors sponsors = text[0].split('by')[-1] for sponsor in sponsors.split(','): bill.add_sponsor('primary', sponsor) actor = chamber self.flag() # clear last bill's vote flags self.vote = None # for line in text: if re.match(r'^\d\d/\d\d', line): date = datetime.datetime.strptime( line[0:5] + '/' + session[0:4], "%m/%d/%Y") self.last_date = date action_text = line[5:].strip() # actor if action_text.lower().startswith('house') or \ action_text.lower().startswith('senate'): actor = {'H': 'lower', 'S': 'upper'}[action_text[0]] action = get_action(actor, action_text) bill.add_action(actor, action_text, date, type=action) if "bill:passed" in action or "bill:failed" in action: passed = False if 'FAILED' in action_text else True votes = re.search(r'(\d+)-(\d+)-(\d+)', action_text) if votes: yes, no, other = votes.groups() self.in_vote = True self.vote = Vote(chamber, date, action_text, passed, int(yes), int(no), int(other)) else: date = self.last_date # nothing to do if it's not a vote if "Floor Sponsor" in line: self.in_vote = False if self.vote: bill.add_vote(self.vote) self.vote = None if not self.in_vote: continue if 'AYES --' in line: self.flag(ayes=True) elif 'NAYS --' in line: self.flag(nays=True) elif 'Absent and excused' in line: self.flag(other=True) if self.ayes: for name in line.replace('AYES --', '').split(','): name = name.strip() if name: self.vote.yes(name) if self.nays: for name in line.replace('NAYS --', '').split(','): name = name.strip() if name: self.vote.no(name) if self.other: for name in line.replace('Absent and excused --', '').split(','): name = name.strip() if name: self.vote.other(name) self.save_bill(bill)
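self.flag() above is a small helper defined elsewhere on this scraper (as are get_bill_type() and get_action()); a plausible sketch of what it does, inferred from how it is called:

def flag(self, ayes=False, nays=False, other=False):
    # remember which vote roster, if any, subsequent lines of the pre text
    # belong to; calling flag() with no arguments clears all three markers
    self.ayes = ayes
    self.nays = nays
    self.other = other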
def scrape_bill(self, session, chamber, bill_id, title, url, strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub): html = self.urlopen(url) page = lxml.html.fromstring(html) page.make_links_absolute(url) bill_type = self.bill_types[bill_id.split()[0][1:]] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) for version in self.scrape_versions(session, chamber, page, bill_id): bill.add_version(**version) # Resolution pages have different html. values = {} trs = page.xpath('//div[@id="bhistcontent"]/table/tr') for tr in trs: heading = tr.xpath('td/strong/text()') if heading: heading = heading[0] else: continue value = tr.text_content().replace(heading, '').strip() values[heading] = value # summary was always same as title #bill['summary'] = values['SUMMARY:'] # Add primary sponsor. primary = strip_sponsors('', values['LEAD SPONSOR:']) if primary: bill.add_sponsor('primary', primary) # Add cosponsors. sponsors = strip_sponsors('', values['SPONSORS:']).split('\r\n') for name in sponsors: name = name.strip(', ') if name: bill.add_sponsor('cosponsor', name) for link in page.xpath("//a[contains(@href, 'votes/house')]"): self.scrape_vote(bill, link.attrib['href']) actor = chamber for tr in reversed( page.xpath("//table[@class='tabborder']/descendant::tr")[1:]): tds = tr.xpath('td') if len(tds) < 3: continue # Index of date info no longer varies on resolutions. date = tds[2].text_content().strip() date = datetime.datetime.strptime(date, "%m/%d/%y").date() action = tds[1].text_content().strip() attrs = dict(actor=actor, action=action, date=date) attrs.update(self.categorizer.categorize(action)) bill.add_action(**attrs) self.save_bill(bill)
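This variant replaces the explicit if/elif chain of the earlier copy of this scraper with self.categorizer.categorize(action), which returns a dict of extra add_action() keyword arguments (at minimum a 'type' entry). A minimal rule-table sketch; the patterns are illustrative, not the real rules:

import re

class Categorizer(object):
    rules = (
        (re.compile(r'^Read 1st time'), ['bill:reading:1']),
        (re.compile(r'^Filed for introduction'), ['bill:filed']),
        (re.compile(r'^To [A-Z]'), ['committee:referred']),
    )

    def categorize(self, action):
        # first matching rule wins; unmatched actions fall through to 'other'
        for pattern, atype in self.rules:
            if pattern.search(action):
                return {'type': atype}
        return {'type': ['other']}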
def scrape_bill(self, chamber, session, bill_id, url): sidebar = lxml.html.fromstring(self.urlopen(url)) try: hist_url = get_popup_url( sidebar.xpath("//a[contains(., 'Bill History')]")[0]) except IndexError: # where is it? return page = lxml.html.fromstring(self.urlopen(hist_url)) page.make_links_absolute(hist_url) title = page.xpath("string(//table[2]/tr[4])").strip() if 'HR' in bill_id or 'SR' in bill_id: bill_type = ['resolution'] elif 'HJR' in bill_id or 'SJR' in bill_id: bill_type = ['joint resolution'] elif 'HCR' in bill_id or 'SCR' in bill_id: bill_type = ['concurrent resolution'] else: bill_type = ['bill'] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(hist_url) for option in sidebar.xpath("//select[@name='BVer']/option"): version_name = option.text.strip() if option.get('selected'): version_url = re.sub(r'frm=2', 'frm=1', url) else: version_url = option.attrib['value'] bill.add_version(version_name, version_url) if not bill['versions']: version_url = re.sub(r'frm=2', 'frm=3', url) bill.add_version('Introduced', version_url) sponsors = page.xpath("string(//table[2]/tr[3])").strip() sponsor_re = r'[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)' for sponsor in re.findall(sponsor_re, sponsors): sponsor = sponsor.replace(' and', '').strip(' .,') # a few sponsors get mangled by our regex sponsor = { 'Means': 'Ways & Means', 'Safety': 'Public Safety', 'Resources': 'Human Resources', 'Affairs': 'Veterans Affairs', 'Protection': 'Environmental Protection', 'Government': 'State Government', 'Boef': 'De Boef' }.get(sponsor, sponsor) bill.add_sponsor('sponsor', sponsor) actor = chamber for tr in page.xpath("//table[3]/tr"): date = tr.xpath("string(td[1])").strip() if date.startswith("***"): continue elif "No history is recorded at this time." in date: return date = datetime.datetime.strptime(date, "%B %d, %Y").date() action = tr.xpath("string(td[2])").strip() action = re.sub(r'\s+', ' ', action) if 'S.J.' in action or 'SCS' in action: actor = 'upper' elif 'H.J.' in action or 'HCS' in action: actor = 'lower' action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip() if action.startswith('Introduced'): atype = ['bill:introduced'] if ', referred to' in action: atype.append('committee:referred') elif action.startswith('Read first time'): atype = 'bill:reading:1' elif action.startswith('Referred to'): atype = 'committee:referred' elif action.startswith('Sent to Governor'): atype = 'governor:received' elif action.startswith('Signed by Governor'): atype = 'governor:signed' elif action.startswith('Vetoed by Governor'): atype = 'governor:vetoed' elif action.startswith('Item veto'): atype = 'governor:vetoed:line-item' elif re.match(r'Passed (House|Senate)', action): atype = 'bill:passed' elif re.match(r'Amendment (S|H)-\d+ filed', action): atype = ['amendment:introduced'] if ', adopted' in action: atype.append('amendment:passed') elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted', action): atype = 'amendment:passed' elif re.match(r'Amendment (S|H)-\d+ lost', action): atype = 'amendment:failed' elif action.startswith('Resolution filed'): atype = 'bill:introduced' elif action.startswith('Resolution adopted'): atype = 'bill:passed' elif (action.startswith('Committee report') and action.endswith('passage.')): atype = 'committee:passed' elif action.startswith('Withdrawn'): atype = 'bill:withdrawn' else: atype = 'other' bill.add_action(actor, action, date, type=atype) bill['subjects'] = self._subjects[bill_id] self.save_bill(bill)
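get_popup_url() above is defined elsewhere in this module; a hypothetical sketch, assuming the sidebar link opens the history page through a javascript handler (the markup in the comment is invented):

import re

def get_popup_url(link):
    # e.g. onclick="popup('BillHistory.aspx?ga=83&billName=HF1')"  (invented markup)
    onclick = link.attrib['onclick']
    return re.search(r"'([^']+)'", onclick).group(1)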
def scrape_bill(self, session, history_url): history_xml = self.get(history_url).text.encode('ascii', 'ignore') root = etree.fromstring(history_xml) bill_title = root.findtext("caption") if (bill_title is None or "Bill does not exist" in history_xml): self.warning("Bill does not appear to exist") return bill_id = ' '.join(root.attrib['bill'].split(' ')[1:]) chamber = self.CHAMBERS[bill_id[0]] if bill_id[1] == 'B': bill_type = ['bill'] elif bill_id[1] == 'R': bill_type = ['resolution'] elif bill_id[1:3] == 'CR': bill_type = ['concurrent resolution'] elif bill_id[1:3] == 'JR': bill_type = ['joint resolution'] else: raise ScrapeError("Invalid bill_id: %s" % bill_id) bill = Bill(session, chamber, bill_id, bill_title, type=bill_type) bill.add_source(history_url) bill['subjects'] = [] for subject in root.iterfind('subjects/subject'): bill['subjects'].append(subject.text.strip()) versions = [x for x in self.versions if x[0] == bill_id] for version in versions: bill.add_version(name=self.NAME_SLUGS[version[1][-5]], url=version[1], mimetype='text/html') analyses = [x for x in self.analyses if x[0] == bill_id] for analysis in analyses: bill.add_document(name="Analysis ({})".format( self.NAME_SLUGS[analysis[1][-5]]), url=analysis[1], mimetype='text/html') fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id] for fiscal_note in fiscal_notes: bill.add_document(name="Fiscal Note ({})".format( self.NAME_SLUGS[fiscal_note[1][-5]]), url=fiscal_note[1], mimetype='text/html') witnesses = [x for x in self.witnesses if x[0] == bill_id] for witness in witnesses: bill.add_document(name="Witness List ({})".format( self.NAME_SLUGS[witness[1][-5]]), url=witness[1], mimetype='text/html') for action in root.findall('actions/action'): act_date = datetime.datetime.strptime(action.findtext('date'), "%m/%d/%Y").date() extra = {} extra['action_number'] = action.find('actionNumber').text comment = action.find('comment') if comment is not None and comment.text: extra['comment'] = comment.text.strip() actor = { 'H': 'lower', 'S': 'upper', 'E': 'executive' }[extra['action_number'][0]] desc = action.findtext('description').strip() if desc == 'Scheduled for public hearing on . . .': self.warning("Skipping public hearing action with no date") continue introduced = False if desc == 'Amended': atype = 'amendment:passed' elif desc == 'Amendment(s) offered': atype = 'amendment:introduced' elif desc == 'Amendment amended': atype = 'amendment:amended' elif desc == 'Amendment withdrawn': atype = 'amendment:withdrawn' elif desc == 'Passed' or desc == 'Adopted': atype = 'bill:passed' elif re.match(r'^Received (by|from) the', desc): if 'Secretary of the Senate' not in desc: atype = 'bill:introduced' else: atype = 'bill:filed' elif desc.startswith('Sent to the Governor'): # But what if it gets lost in the mail? 
atype = 'governor:received' elif desc.startswith('Signed by the Governor'): atype = 'governor:signed' elif desc == 'Vetoed by the Governor': atype = 'governor:vetoed' elif desc == 'Read first time': atype = ['bill:introduced', 'bill:reading:1'] introduced = True elif desc == 'Read & adopted': atype = ['bill:passed'] if not introduced: introduced = True atype.append('bill:introduced') elif desc == "Passed as amended": atype = 'bill:passed' elif (desc.startswith('Referred to') or desc.startswith("Recommended to be sent to ")): atype = 'committee:referred' elif desc == "Reported favorably w/o amendment(s)": atype = 'committee:passed' elif desc == "Filed": atype = 'bill:filed' elif desc == 'Read 3rd time': atype = 'bill:reading:3' elif desc == 'Read 2nd time': atype = 'bill:reading:2' elif desc.startswith('Reported favorably'): atype = 'committee:passed:favorable' else: atype = 'other' if 'committee:referred' in atype: repls = ['Referred to', "Recommended to be sent to "] ctty = desc for r in repls: ctty = ctty.replace(r, "").strip() extra['committees'] = ctty bill.add_action(actor, action.findtext('description'), act_date, type=atype, **extra) for author in root.findtext('authors').split(' | '): if author != "": bill.add_sponsor('primary', author, official_type='author') for coauthor in root.findtext('coauthors').split(' | '): if coauthor != "": bill.add_sponsor('cosponsor', coauthor, official_type='coauthor') for sponsor in root.findtext('sponsors').split(' | '): if sponsor != "": bill.add_sponsor('primary', sponsor, official_type='sponsor') for cosponsor in root.findtext('cosponsors').split(' | '): if cosponsor != "": bill.add_sponsor('cosponsor', cosponsor, official_type='cosponsor') self.save_bill(bill)
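The bill id and chamber above are carved out of the history XML's bill attribute: the first token (the legislature/session tag) is dropped and the chamber comes from the first letter of what remains. A sketch with an invented attribute value:

CHAMBERS = {'H': 'lower', 'S': 'upper'}
attrib_bill = '84(R) HB 1'                        # invented example value
bill_id = ' '.join(attrib_bill.split(' ')[1:])    # 'HB 1'
chamber = CHAMBERS[bill_id[0]]                    # 'lower'
print('%s -> %s' % (bill_id, chamber))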
def scrape(self, chamber, session): # check for abiword if os.system('which abiword') != 0: raise ScrapeError('abiword is required for KS scraping') chamber_name = 'Senate' if chamber == 'upper' else 'House' chamber_letter = chamber_name[0] # perhaps we should save this data so we can make one request for both? bill_request = self.urlopen(ksapi.url + 'bill_status/') bill_request_json = json.loads(bill_request) bills = bill_request_json['content'] for bill_data in bills: bill_id = bill_data['BILLNO'] # filter other chambers if not bill_id.startswith(chamber_letter): continue if 'CR' in bill_id: btype = 'concurrent resolution' elif 'R' in bill_id: btype = 'resolution' elif 'B' in bill_id: btype = 'bill' title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE'] # main bill = Bill(session, chamber, bill_id, title, type=btype, status=bill_data['STATUS']) bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower()) if (bill_data['LONGTITLE'] and bill_data['LONGTITLE'] != bill['title']): bill.add_title(bill_data['LONGTITLE']) for sponsor in bill_data['SPONSOR_NAMES']: stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1 else 'cosponsor') bill.add_sponsor(stype, sponsor) # history is backwards for event in reversed(bill_data['HISTORY']): actor = ('upper' if event['chamber'] == 'Senate' else 'lower') date = datetime.datetime.strptime(event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S") # append committee names if present if 'committee_names' in event: action = (event['status'] + ' ' + ' and '.join(event['committee_names'])) else: action = event['status'] if event['action_code'] not in ksapi.action_codes: self.warning('unknown action code on %s: %s %s' % (bill_id, event['action_code'], event['status'])) atype = 'other' else: atype = ksapi.action_codes[event['action_code']] bill.add_action(actor, action, date, type=atype) try: self.scrape_html(bill) except scrapelib.HTTPError as e: self.warning('unable to fetch HTML for bill {0}'.format( bill['bill_id'])) self.save_bill(bill)
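ksapi above is a small sibling module of constants imported by this scraper; a plausible stub of the two attributes it uses (the URL and the code entries are placeholders, not the real endpoint or code table):

# ksapi.py (stub)
url = 'http://example.kslegislature.org/api/'   # placeholder base endpoint

action_codes = {
    # maps legislature action codes to billy action types; entries invented
    'example_intro_code': 'bill:introduced',
    'example_passed_code': 'bill:passed',
}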
def scrape_bill_page(self, chamber, session, bill_url, bill_type): page = self.lxmlize(bill_url) author = self.get_one_xpath( page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()" ) sbp = lambda x: self.scrape_bare_page(page.xpath( "//a[contains(text(), '%s')]" % (x))[0].attrib['href']) authors = [x.text for x in sbp("Authors")] try: digests = sbp("Digests") except IndexError: digests = [] try: versions = sbp("Text") except IndexError: versions = [] title = page.xpath( "//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0] actions = page.xpath( "//div[@id='ctl00_PageBody_PanelBillInfo']/" "/table[@style='font-size:small']/tr") bill_id = page.xpath( "//span[@id='ctl00_PageBody_LabelBillID']/text()")[0] bill_type = {"B": "bill", "CR": "concurrent resolution"}[bill_type[1:]] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(bill_url) authors.remove(author) bill.add_sponsor('primary', author) for author in authors: bill.add_sponsor('cosponsor', author) for digest in digests: bill.add_document(digest.text, digest.attrib['href'], mimetype="application/pdf") for version in versions: bill.add_version(version.text, version.attrib['href'], mimetype="application/pdf") flags = { "prefiled": ["bill:filed"], "referred to the committee": ["committee:referred"], "sent to the house": ['bill:passed'], "ordered to the senate": ['bill:passed'], } try: votes_link = page.xpath("//a[text() = 'Votes']")[0] self.scrape_votes(bill, votes_link.attrib['href']) except IndexError: # Some bills don't have any votes pass for action in actions: date, chamber, page, text = [x.text for x in action.xpath(".//td")] date += "/%s" % (session) # Session is April --> June. Prefiles # look like they're in January at earliest. date = dt.datetime.strptime(date, "%m/%d/%Y") chamber = {"S": "upper", "H": "lower", "J": 'joint'}[chamber] cat = [] for flag in flags: if flag in text.lower(): cat += flags[flag] if cat == []: cat = ["other"] bill.add_action(chamber, text, date, cat) self.save_bill(bill)
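lxmlize() and get_one_xpath() above are small helpers defined on this scraper class; plausible sketches of both, inferred from how they are called (scrape_bare_page() would follow the same fetch-and-parse pattern):

import lxml.html

def lxmlize(self, url):
    # fetch a page, parse it, and absolutize its links
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)
    return page

def get_one_xpath(self, page, xpath):
    # expect exactly one match for the expression and return it
    results = page.xpath(xpath)
    if len(results) != 1:
        raise Exception('Expected exactly one result for %s' % xpath)
    return results[0]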