def scrape_bill(self, chamber, session, bill_id, url):
    try:
        page = self.urlopen(url)
    except scrapelib.HTTPError:
        self.warning("couldn't open %s, skipping bill" % url)
        return
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # normalize non-breaking spaces in the header text
    header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ')
    title, primary_sponsor = header.split(' -- ')

    if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
        bill_type = ['bill']
    elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'):
        bill_type = ['resolution']
    elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
        bill_type = ['concurrent resolution']
    elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
        bill_type = ['joint resolution']

    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_sponsor('primary', primary_sponsor)
    bill.add_source(url)

    for link in page.xpath(
            '//a[contains(@href, "bills/") and text() = "HTML"]'):
        name = link.getprevious().tail.strip()
        bill.add_version(name, link.attrib['href'], mimetype="text/html")
        next = link.getnext()
        if next.text == "PDF":
            bill.add_version(name, next.attrib['href'],
                             mimetype="application/pdf")

    for link in page.xpath(
            "//a[contains(@href, 'fnotes') and text() = 'HTML']"):
        bill.add_document("Fiscal Note", link.attrib['href'])

    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill['subjects'] = subjects

    status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
    self.parse_status(bill, status_link.attrib['href'])

    self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url,
                  version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.urlopen(bill_detail_url) as bill_html:
        doc = lxml.html.fromstring(bill_html)

        bill_id = doc.xpath('//title/text()')[0].split()[0]
        bill_title = doc.xpath('//font[@size=-1]/text()')[0]
        bill_type = {'F': 'bill', 'R': 'resolution',
                     'C': 'concurrent resolution'}[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
        bill['subjects'] = self._subject_mapping[bill_id]
        bill.add_source(bill_detail_url)

        # grab sponsors
        sponsors = doc.xpath(
            '//table[@summary="Show Authors"]/descendant::a/text()')
        if sponsors:
            primary_sponsor = sponsors[0].strip()
            bill.add_sponsor('primary', primary_sponsor)
            cosponsors = sponsors[1:]
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg.strip())

        # Add Actions performed on the bill.
        bill_actions = self.extract_bill_actions(doc, chamber)
        for action in bill_actions:
            bill.add_action(action['action_chamber'],
                            action['action_text'],
                            action['action_date'],
                            type=action['action_type'])

    # Get all versions of the bill.
    # Versions of a bill are on a separate page, linked to from the column
    # labeled, "Bill Text", on the search results page.
    with self.urlopen(version_list_url) as version_html:
        version_doc = lxml.html.fromstring(version_html)
        for v in version_doc.xpath(
                '//a[starts-with(@href, "/bin/getbill.php")]'):
            version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
            bill.add_version(v.text.strip(), version_url)

    self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_type, bill_url):
    with self.urlopen(bill_url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        # split "SB1 SD2 HD2" to get SB1
        bill_id = page.xpath(
            '//a[@id="LinkButtonMeasure"]')[0].text_content().split()[0]

        title = page.xpath(
            '//span[@id="ListView1_ctrl0_measure_titleLabel"]')[0].text
        subjects = page.xpath(
            '//span[@id="ListView1_ctrl0_report_titleLabel"]')[0].text.split('; ')
        subjects = [s.strip() for s in subjects if s.strip()]
        description = page.xpath(
            '//span[@id="ListView1_ctrl0_descriptionLabel"]')[0].text
        sponsors = page.xpath(
            '//span[@id="ListView1_ctrl0_introducerLabel"]')[0].text
        referral = page.xpath('//span[contains(@id, "referral")]/text()')[0]

        bill = Bill(session, chamber, bill_id, title, subjects=subjects,
                    type=bill_type, description=description,
                    referral=referral)
        for sponsor in sponsors.split(', '):
            if sponsor.endswith(' (BR)'):
                sponsor = sponsor[:-5]
            bill.add_sponsor('primary', sponsor)

        # actions
        actions = []
        table = page.xpath('//table[@id="GridViewStatus"]')[0]
        for row in table.xpath('tr'):
            action_params = {}
            cells = row.xpath('td')
            if len(cells) == 3:
                ch = cells[1].xpath('font')[0].text
                action_params['actor'] = house[ch]
                action_params['action'] = cells[2].xpath('font')[0].text
                action_date = cells[0].xpath('font')[0].text
                action_params['date'] = datetime.strptime(action_date,
                                                          "%m/%d/%Y")
                action_params['type'] = categorize_action(
                    action_params['action'])
                actions.append(action_params)

        for action_params in actions:
            bill.add_action(**action_params)
            self.parse_vote(bill, action_params['action'],
                            action_params['actor'], action_params['date'])

        # add versions
        try:
            for version in page.xpath('//a[contains(@id, "StatusLink")]'):
                bill.add_version(version.text.replace('_', ' '),
                                 version.get('href'))
        except IndexError:  # href not found.
            pass

        bill.add_source(bill_url)
        self.save_bill(bill)
def parse_senate_billpage(self, bill_url, year):
    bill_page = self.urlopen(bill_url)
    bill_page = lxml.html.fromstring(bill_page)
    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self.subjects:
        subs = self.subjects[bid]
        self.log("With subjects for this bill")
        self.log(bid)

    bill = Bill(year, 'upper', bill_id, bill_desc, bill_lr=bill_lr,
                type=bill_type, subjects=subs)
    bill.add_source(bill_url)

    # Get the primary sponsor
    sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
    bill_sponsor = sponsor.text_content()
    bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsor('primary', bill_sponsor,
                     sponsor_link=bill_sponsor_link)

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//*[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and 'href' in cosponsor_tag[0].attrib:
        self.parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # get the actions
    action_url = bill_page.xpath('//*[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib['href']
        self.parse_senate_actions(bill, action_url)

    # stored on a separate page
    versions_url = bill_page.xpath('//*[@id="hlFullBillText"]')
    if len(versions_url) > 0 and 'href' in versions_url[0].attrib:
        self.parse_senate_bill_versions(bill, versions_url[0].attrib['href'])

    self.save_bill(bill)
def parse_senate_billpage(self, bill_url, year):
    with self.urlopen(bill_url) as bill_page:
        bill_page = lxml.html.fromstring(bill_page)
        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = bill_page.xpath(
            '//*[@id="lblBillNum"]')[0].text_content()
        bill_title = bill_page.xpath(
            '//*[@id="lblBillTitle"]')[0].text_content()
        bill_desc = bill_page.xpath(
            '//*[@id="lblBriefDesc"]')[0].text_content()
        bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

        bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                    bill_lr=bill_lr, official_title=bill_title)
        bill.add_source(bill_url)

        # Get the primary sponsor
        sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
        bill_sponsor = sponsor.text_content()
        bill_sponsor_link = sponsor.attrib.get('href')
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        cosponsor_tag = bill_page.xpath('//*[@id="hlCoSponsors"]')
        if len(cosponsor_tag) > 0 and 'href' in cosponsor_tag[0].attrib:
            self.parse_senate_cosponsors(bill,
                                         cosponsor_tag[0].attrib['href'])

        # get the actions
        action_url = bill_page.xpath('//*[@id="hlAllActions"]')
        if len(action_url) > 0:
            action_url = action_url[0].attrib['href']
            self.parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.xpath('//*[@id="hlFullBillText"]')
        if len(versions_url) > 0 and 'href' in versions_url[0].attrib:
            self.parse_senate_bill_versions(bill,
                                            versions_url[0].attrib['href'])

    self.save_bill(bill)
def test_on_duplicate():
    b = Bill('S1', 'upper', 'SB1', 'on_duplicate')
    b.add_version('current', 'http://example.com/doc/1', mimetype='text/html')

    # error
    with assert_raises(ValueError):
        b.add_version('current', 'http://example.com/doc/1',
                      mimetype='text/html', on_duplicate='error')
    # or without it set, default to error
    with assert_raises(ValueError):
        b.add_version('current', 'http://example.com/doc/1',
                      mimetype='text/html')

    # use_old - keep the original version name
    b.add_version('updated name', 'http://example.com/doc/1',
                  mimetype='text/html', on_duplicate='use_old')
    assert_equal(b['versions'], [{'mimetype': 'text/html',
                                  'url': 'http://example.com/doc/1',
                                  'name': 'current'}])

    # use_new - replace with the new version name
    b.add_version('updated name', 'http://example.com/doc/1',
                  mimetype='text/html', on_duplicate='use_new')
    assert_equal(b['versions'], [{'mimetype': 'text/html',
                                  'url': 'http://example.com/doc/1',
                                  'name': 'updated name'}])

    # a new document w/ same name is ok though
    b.add_version('updated name', 'http://example.com/doc/2',
                  mimetype='text/html', on_duplicate='use_old')
    assert len(b['versions']) == 2

    # and now we add a duplicate
    b.add_version('current', 'http://example.com/doc/1',
                  mimetype='text/html', on_duplicate='ignore')
    assert len(b['versions']) == 3
def scrape_bill(self, bill_page_url):
    bill_page = lxml.html.fromstring(self.get(bill_page_url).text)

    title = bill_page.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_SubjectLabel"]/text()')
    if title:
        title = title[0]
    else:
        self.warning('Missing bill title {}'.format(bill_page_url))
        return False

    bill_no = bill_page.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/a/text()')
    if bill_no:
        bill_no = bill_no[0]
    else:
        bill_no = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/text()')
        if bill_no:
            bill_no = bill_no[0]
        else:
            self.error('Missing bill number {}'.format(bill_page_url))
            return False

    bill = Bill(session=self.session, chamber='upper', bill_id=bill_no,
                title=title, type='bill')
    bill.add_source(bill_page_url)

    self.parse_versions(bill, bill_page, bill_no)
    self.parse_acts(bill, bill_page)

    sponsors = bill_page.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_SponsorsLabel"]/text()')
    if sponsors:
        self.assign_sponsors(bill, sponsors[0], 'primary')

    cosponsors = bill_page.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_CoSponsorsLabel"]/text()')
    if cosponsors:
        self.assign_sponsors(bill, cosponsors[0], 'cosponsor')

    self.parse_date_actions(bill, bill_page)
    self.parse_actions(bill, bill_page)

    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    url = '%s?r=%s' % (self.base_url, bill_id)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # search for Titulo; the accent over the i messes up lxml, so
        # match on 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(','):
            bill.add_sponsor('primary', self.clean_name(aname).strip())

        co_authors = doc.xpath(
            u'//td/b[contains(text(),"Co-autor")]/../text()')
        if len(co_authors) != 0:
            for co_author in co_authors[1].split(','):
                bill.add_sponsor('cosponsor',
                                 self.clean_name(co_author).strip())

        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore rows missing a date
            if len(tds) != 2:
                continue
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
            action = tds[1].text_content().strip()
            # parse the text to see if it's a new version or an unrelated
            # document; if the action has a link, assume it's a vote document
            action_url = tds[1].xpath('a/@href')
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)
            if atype == 'bill:passed' and action_url:
                vote_chamber = None
                for pattern, vote_chamber in _voteChambers:
                    if re.match(pattern, action):
                        break
                else:
                    self.warning("couldn't find voteChamber pattern")

                if vote_chamber == 'lower' and len(action_url) > 0:
                    vote = self.scrape_votes(action_url[0], action, date,
                                             vote_chamber)
                    if vote[0] is not None:
                        vote[0].add_source(action_url[0])
                        bill.add_vote(vote[0])
                    else:
                        self.warning('Problem reading vote: %s, %s' %
                                     (vote[1], bill_id))

        bill.add_source(url)
        self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, short_title, url):
    if bill_id in ['SCR 0003', 'SB 0251', 'SB 0292']:
        return

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath("//br")[8].tail
        if not title:
            title = short_title
        title = title.strip()

        abbrev = bill_id.split()[0]
        if abbrev.endswith('B'):
            bill_type = ['bill']
        elif abbrev.endswith('JR'):
            bill_type = ['joint resolution']
        elif abbrev.endswith('CR'):
            bill_type = ['concurrent resolution']
        elif abbrev.endswith('R'):
            bill_type = ['resolution']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
        self.scrape_actions(bill, action_link.attrib['href'])

        version_path = "//a[contains(., '%s')]"
        for version_type in ('Introduced Bill', 'House Bill', 'Senate Bill',
                             'Engrossed Bill', 'Enrolled Act'):
            path = version_path % version_type
            links = page.xpath(path)
            if links:
                bill.add_version(version_type, links[0].attrib['href'])

        for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"):
            self.scrape_senate_vote(bill, vote_link.attrib['href'])

        for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"):
            self.scrape_house_vote(bill, vote_link.attrib['href'])

        for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
            num = doc_link.text.strip().split("(")[0]
            bill.add_document("Fiscal Impact Statement #%s" % num,
                              doc_link.attrib['href'])

        bill['subjects'] = self.subjects[bill_id]
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    biennium = "%s-%s" % (session[0:4], session[7:9])
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, biennium, bill_num))

    page = self.urlopen(url)
    page = lxml.etree.fromstring(page.bytes)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page, "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(session, chamber, bill_id, title, type=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4]))
    bill.add_source(fake_source)

    chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
    mimetype = 'text/html'
    version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                   "Htm/Bills/%s %ss/%s.htm" % (biennium, chamber_name,
                                                bill_type.title(), bill_num))

    # Sometimes the measure's version_url isn't guessable. When that
    # happens, we have to get the url from the source page.
    version_resp = self.get(version_url)
    if version_resp.status_code != 200:
        webpage = self.get(fake_source).text
        webdoc = lxml.html.fromstring(webpage)
        version_url = webdoc.xpath(
            '//a[contains(@href, "billdocs")]/@href')[-1]
        if version_url.lower().endswith('.pdf'):
            mimetype = 'application/pdf'

    bill.add_version(bill_id, version_url, mimetype=mimetype)

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_votes(bill)
    self.fix_prefiled_action_dates(bill)

    return bill
def __init__(self, scraper, session, chamber, url, doc, bill_type, bill_id,
             title, bill_id_parts):
    self.scraper = scraper
    self.chamber = chamber
    self.url = url
    self.doc = doc
    self.bill_id = bill_id
    self.letter, self.number, self.version = bill_id_parts
    self.data = {}
    self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
    self.succeeded = False
    self._build()
def scrape(self, chamber, session):
    try:
        for index in xrange(1, 1000):
            url = ("http://open.nysenate.gov/legislation/search/"
                   "?search=otype:bill&searchType=&format=xml"
                   "&pageIdx=%d" % index)
            with self.urlopen(url) as page:
                page = lxml.etree.fromstring(page)

                for result in page.xpath("//result[@type = 'bill']"):
                    bill_id = result.attrib['id'].split('-')[0]

                    title = result.attrib['title'].strip()
                    if title == '(no title)':
                        continue

                    primary_sponsor = result.attrib['sponsor']
                    primary_sponsor = re.sub(r'\s+\(MS\)\s*$', '',
                                             primary_sponsor).strip()

                    bill_chamber, bill_type = {
                        'S': ('upper', 'bill'),
                        'R': ('upper', 'resolution'),
                        'J': ('upper', 'legislative resolution'),
                        'B': ('upper', 'concurrent resolution'),
                        'A': ('lower', 'bill'),
                        'E': ('lower', 'resolution'),
                        'K': ('lower', 'legislative resolution'),
                        'L': ('lower', 'joint resolution')}[bill_id[0]]

                    if chamber != bill_chamber:
                        continue

                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type)
                    bill.add_source(url)
                    bill.add_sponsor('primary', primary_sponsor)

                    bill_url = ("http://open.nysenate.gov/legislation/"
                                "bill/%s" % result.attrib['id'])
                    self.scrape_bill(bill, bill_url)
                    bill.add_source(bill_url)
                    self.save_bill(bill)
    except scrapelib.HTTPError as e:
        if e.response.code != 404:
            raise
def process_bill(self, data):
    chamber = parse_psuedo_id(data['from_organization'])['classification']

    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    if data['abstracts']:
        bill['summary'] = data['abstracts'][0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']))
        # TODO: related entities

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(sponsor['classification'], sponsor['name'])

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']))

    for title in data['other_titles']:
        bill.add_title(title)

    # TODO: related bills
    # for related in data['related_bills']:

    self.save_bill(bill)
def scrape(self, chamber, session):
    self.user_agent = 'openstates +mozilla'
    # internal id for the session, store on self so all methods have access
    self.site_id = self.metadata['session_details'][session]['site_id']
    self.build_subject_map()

    # used for skipping bills from opposite chamber
    start_letter = 'H' if chamber == 'lower' else 'S'

    url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id

    while url:
        with self.urlopen(url, retry_on_404=True) as html:
            doc = lxml.html.fromstring(html)
            url = None  # no more unless we encounter 'More...'

            bills = doc.xpath('//ul[@class="linkSect"]/li')
            for bill in bills:
                link = bill.getchildren()[0]
                bill_id = str(link.text_content())

                # check if this is the 'More...' link
                if bill_id.startswith('More'):
                    url = BASE_URL + link.get('href')

                # skip bills from the other chamber
                elif not bill_id.startswith(start_letter):
                    continue

                else:
                    # create a bill
                    desc = bill.xpath('text()')[0].strip()
                    bill_type = {'B': 'bill',
                                 'J': 'joint resolution',
                                 'R': 'resolution'}[bill_id[1]]
                    bill = Bill(session, chamber, bill_id, desc,
                                type=bill_type)

                    bill_url = BASE_URL + link.get('href')
                    self.fetch_sponsors(bill)
                    self.scrape_bill_details(bill_url, bill)
                    bill['subjects'] = self.subject_map[bill_id]
                    bill.add_source(bill_url)
                    self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url,
                  version_list_url):
    """
    Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    chamber = 'lower' if chamber.lower() == 'house' else chamber
    chamber = 'upper' if chamber.lower() == 'senate' else chamber

    # Get html and parse
    bill_html = self.urlopen(bill_detail_url)
    doc = lxml.html.fromstring(bill_html)

    # Get the basic parts of the bill
    bill_id = doc.xpath('//h1/text()')[0]
    bill_title = doc.xpath('//h2/following-sibling::p/text()')[0].strip()
    bill_type = {'F': 'bill', 'R': 'resolution',
                 'C': 'concurrent resolution'}[bill_id[1]]
    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    # Add source
    bill.add_source(bill_detail_url)

    # Add subjects.  Currently we are not mapping to Open States
    # standardized subjects, so use 'scraped_subjects'
    bill['scraped_subjects'] = self._subject_mapping[bill_id]

    # Get companion bill.
    companion = doc.xpath(
        '//table[@class="status_info"]//tr[1]/td[2]'
        '/a[starts-with(@href, "?")]/text()')
    companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None
    companion_chamber = self.chamber_from_bill(companion)
    if companion is not None:
        bill.add_companion(companion, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add Actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    self.save_bill(bill)
def scrape_bill_status_page(self, url, params={}):
    """Scrapes the status page url, populating the parameter dict, and
    returns the bill.
    """
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        params['bill_id'] = page.xpath(
            '//h3[contains(@class, "center")]/a')[0].text
        params['title'] = page.xpath(
            '//div[div[contains(., "Report Title")]]'
            '/div[contains(@class, "rightside")]')[0].text.strip()

        sponsors = page.xpath(
            '//div[div[contains(., "Introducer")]]'
            '/div[contains(@class, "rightside")]')[0].text

        subject = page.xpath(
            '//div[div[contains(., "Measure Title")]]'
            '/div[contains(@class, "rightside")]')[0].text.strip()
        subject = subject.replace('RELATING TO ', '')  # Remove lead text
        params['subject'] = subject.replace('.', '')

        params['description'] = page.xpath(
            '//div[div[contains(., "Description")]]'
            '/div[contains(@class, "rightside")]')[0].text

        params['companion'] = page.xpath(
            '//div[div[contains(., "Companion")]]'
            '/div[contains(@class, "rightside")]')[0].text

        if params['title'] == '':
            params['title'] = params['subject']

        actions = []
        table = page.xpath('//table[tr/th[contains(., "Date")]]')[0]
        for row in table.xpath('tr[td]'):  # Ignore table header row
            action_params = {}
            cells = row.xpath('td')
            if len(cells) == 3:
                ch = cells[1].text
                action_params['actor'] = house[ch]
                action_params['action'] = cells[2].text
                # Just get the date, ignore any time.
                action_date = cells[0].text.split()[0]
                try:
                    action_params['date'] = datetime.strptime(action_date,
                                                              "%m/%d/%y")
                except ValueError:
                    # Try a four-digit-year format.
                    action_params['date'] = datetime.strptime(action_date,
                                                              "%m/%d/%Y")
                actions.append(action_params)

        bill = Bill(**params)
        bill.add_sponsor('primary', sponsors)
        for action_params in actions:
            bill.add_action(**action_params)

        self.save_bill(bill)
        return bill
def all_scrape(self, chamber, session):
    url = 'ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt'
    data = self.urlopen(url).decode('UTF-8', 'ignore')

    count = 0
    lines = data.split('\n')
    for item in lines:
        # skip blank lines and the header row (count == 0)
        if item and count:
            (item_chamber, type, bill_number, title, title_sub_1,
             title_sub_2, title_sub_3, title_sub_4, title_sub_5,
             title_sub_6, record_id, initial_sponsor, act_number,
             initial_date, action_date, unknown_legislator, bill_id,
             congressional_session) = item.split('|')

            congressional_session = congressional_session.strip()
            if (congressional_session == session and
                    chamber == CHAMBERS[item_chamber]):
                bill = Bill(session, chamber, bill_id, title,
                            act_number=act_number)
                if initial_sponsor:
                    bill.add_sponsor('primary', initial_sponsor)
                bill.add_source(url)
                self.save_bill(bill)

                base_url = ('http://www.arkleg.state.ar.us/assembly/' +
                            congressional_session[:4] + '/' +
                            congressional_session + '/Pages/')

                # try:
                #     html = self.urlopen(base_url +
                #         'BillInformation.aspx?measureno=' + bill_id)
                # except:
                #     pass

                try:
                    html = self.urlopen(
                        base_url + 'BillStatusHistory.aspx?measureno=' +
                        bill_id)
                except:
                    pass
                else:
                    history = self.bill_history(bill,
                                                lxml.html.fromstring(html))

                try:
                    html = self.urlopen(
                        base_url + 'CoSponsors.aspx?measureno=' + bill_id)
                except:
                    pass
                else:
                    add_sponsors = self.add_sponsors(
                        bill, lxml.html.fromstring(html))
        count += 1

    return count
def parse_bill(self, session, chamber, line):
    (type, combined_id, number, title, relating_to) = line.split("\xe4")
    if ((type[0] == 'H' and chamber == 'lower') or
            (type[0] == 'S' and chamber == 'upper')):
        # basic bill info
        bill_id = "%s %s" % (type, number)
        # lookup type without chamber prefix
        bill_type = self.bill_types[type[1:]]
        self.all_bills[bill_id] = Bill(session, chamber, bill_id, title,
                                       type=bill_type)
def scrape_bill(self, session, chamber, bill_type, url):
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)
    scraped_bill_id = bill_page.xpath(
        "//a[contains(@id, 'LinkButtonMeasure')]")[0].text_content()
    bill_id = scraped_bill_id.split(' ')[0]
    versions = bill_page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]

    tables = bill_page.xpath("//table")
    metainf_table = bill_page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    subs = [s.strip() for s in meta['Report Title'].split(";")]
    if "" in subs:
        subs.remove("")

    b = Bill(session, chamber, bill_id, title=meta['Measure Title'],
             summary=meta['Description'],
             referral=meta['Current Referral'],
             subjects=subs,
             type=bill_type)
    b.add_source(url)

    companion = meta['Companion'].strip()
    if companion:
        b['companion'] = companion

    prior = bill_page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridViewStatus']"
        "/tr/td/font/text()")[-1]
    if 'carried over' in prior.lower():
        prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))
        b.add_companion(bill_id, prior_session, chamber)

    for sponsor in meta['Introducer(s)']:
        b.add_sponsor(type='primary', name=sponsor)

    actions = self.parse_bill_actions_table(b, action_table)
    versions = self.parse_bill_versions_table(b, versions)

    self.save_bill(b)
def scrape_senate_bills(self, chamber, insert, session, year):
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}

    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = ('http://www.leg.state.nv.us/Session/%s/Reports/'
                          'HistListBills.cfm?DoctypeID=%s' % (insert, docnum))
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = ('http://www.leg.state.nv.us/Session/%s/Reports/%s'
                         % (insert, link))

            page = self.urlopen(page_path)
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)

            bill_id = root.xpath('string(/html/body/div[@id="content"]'
                                 '/table[1]/tr[1]/td[1]/font)')
            title = root.xpath('string(/html/body/div[@id="content"]'
                               '/table[1]/tr[5]/td)')

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill['subjects'] = self.subject_mapping[bill_id]

            bill_text = root.xpath("string(/html/body/div[@id='content']"
                                   "/table[6]/tr/td[2]/a/@href)")
            text_url = "http://www.leg.state.nv.us" + bill_text
            bill.add_version("Bill Text", text_url,
                             mimetype='application/pdf')

            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsor('primary', leg)
            for leg in secondary:
                bill.add_sponsor('cosponsor', leg)

            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = ("string(//table[4]/tr[%s]/td[2])"
                                     % minutes_count)
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = (minutes_date[0] + minutes_date[1] +
                                minutes_date[2] + " Agenda")
                bill.add_document(minutes_date, minutes_url)
                minutes_count = minutes_count + 1

            self.scrape_actions(root, bill, "upper")
            self.scrape_votes(page, bill, insert, year)
            bill.add_source(page_path)
            self.save_bill(bill)
def scrape1995(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Bill
        name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'),
                         mimetype='text/html')

        # Sponsorships
        rows = page.cssselect('center table tr')
        for row in rows:
            if row.text_content().strip() == 'Sponsor and CoSponsors':
                continue
            if row.text_content().strip() == 'Links / Committees / Status':
                break
            for a in row.cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

        # Actions
        # The actions are in a pre table that looks like:
        """     SENATE                        HOUSE
                -------------------------------------
              1/13/95   Read 1st time          2/6/95
              1/31/95   Favorably Reported
              2/1/95    Read 2nd Time          2/7/95
              2/3/95    Read 3rd Time
              2/3/95    Passed/Adopted                """

        actions = page.cssselect('pre')[0].text_content().split('\n')
        actions = actions[2:]
        for action in actions:
            senate_date = action[:22].strip()
            action_text = action[23:46].strip()
            house_date = action[46:].strip()

            if '/' not in senate_date and '/' not in house_date:
                continue

            if senate_date:
                bill.add_action('upper', action_text, senate_date)

            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.save_bill(bill)
def scrape_bill(self, chamber, session, doc_type, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # bill id, title, synopsis
    bill_num = re.findall(r'DocNum=(\d+)', url)[0]
    bill_type = DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num

    title = doc.xpath('//span[text()="Short Description:"]'
                      '/following-sibling::span[1]/text()')[0].strip()
    synopsis = doc.xpath('//span[text()="Synopsis As Introduced"]'
                         '/following-sibling::span[1]/text()')[0].strip()

    bill = Bill(session, chamber, bill_id, title, type=bill_type,
                synopsis=synopsis)

    # sponsors
    for sponsor in doc.xpath('//a[@class="content"]/text()'):
        bill.add_sponsor('cosponsor', sponsor)

    # actions
    action_tds = doc.xpath(
        '//a[@name="actions"]/following-sibling::table[1]/td')
    for date, actor, action in group(action_tds, 3):
        date = datetime.datetime.strptime(date.text_content().strip(),
                                          "%m/%d/%Y")
        actor = actor.text_content()
        if actor == 'House':
            actor = 'lower'
        elif actor == 'Senate':
            actor = 'upper'

        action = action.text_content()
        bill.add_action(actor, action, date,
                        type=_categorize_action(action))

    # versions
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)

    # if there's more than 1 votehistory link, there are votes to grab
    if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
        votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
        self.scrape_votes(bill, votes_url)
        bill.add_source(votes_url)

    bill.add_source(url)
    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
    # TODO: grab summary (none present at time of writing)

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(session, chamber, bill_id, title, type=_type)
    bill.add_source(url)

    # process sponsors
    sponsors = _get_td(doc, 'All Sponsors:').text_content()
    sponsors = sponsors.replace('Delegates ', '')
    sponsors = sponsors.replace('Delegate ', '')
    sponsors = sponsors.replace('Senator ', '')
    sponsors = sponsors.replace('Senators ', '')
    sponsor_type = 'primary'

    for sponsor in re.split(', (?:and )?', sponsors):
        sponsor = sponsor.strip()
        if not sponsor:
            continue
        bill.add_sponsor(sponsor_type, sponsor)
        sponsor_type = 'cosponsor'

    # subjects
    subject_list = []
    for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
        subjects = _get_td(doc, heading).xpath('a/text()')
        subject_list += [s.split(' -see also-')[0] for s in subjects if s]
    bill['subjects'] = subject_list

    # documents
    self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
    # actions
    self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

    self.save_bill(bill)
def scrape_bill_info(self, session, chambers):
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    data = self.get(info_url)
    page = open_csv(data)

    chamber_map = {'H': 'lower', 'S': 'upper'}

    for row in page:
        bill_id = row['bill_num']
        chamber = chamber_map[bill_id[0]]

        if chamber not in chambers:
            continue

        # assert that the bill data is from this session, CT is tricky
        assert row['sess_year'] == session

        if re.match(r'^(S|H)J', bill_id):
            bill_type = 'joint resolution'
        elif re.match(r'^(S|H)R', bill_id):
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id, row['bill_title'],
                    type=bill_type)
        bill.add_source(info_url)

        for introducer in self._introducers[bill_id]:
            bill.add_sponsor('primary', introducer,
                             official_type='introducer')

        try:
            self.scrape_bill_page(bill)
            bill['subjects'] = self._subjects[bill_id]
            self.bills[bill_id] = bill
        except SkipBill:
            self.warning('no such bill: ' + bill_id)
def scrape_bill(self, session, chamber, bill_type, url):
    bill_html = self.urlopen(url)
    bill_page = lxml.html.fromstring(bill_html)
    scraped_bill_id = bill_page.xpath(
        "//a[contains(@id, 'LinkButtonMeasure')]")[0].text_content()
    bill_id = scraped_bill_id.split(' ')[0]
    versions = bill_page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]

    tables = bill_page.xpath("//table")
    metainf_table = bill_page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    subs = [s.strip() for s in meta['Report Title'].split(";")]
    if "" in subs:
        subs.remove("")

    b = Bill(session, chamber, bill_id, title=meta['Measure Title'],
             summary=meta['Description'],
             referral=meta['Current Referral'],
             subjects=subs,
             type=bill_type)
    b.add_source(url)

    if not bill_id.startswith("SR"):
        return

    companion = meta['Companion'].strip()
    if companion:
        b['companion'] = companion

    for sponsor in meta['Introducer(s)']:
        b.add_sponsor(type='primary', name=sponsor)

    actions = self.parse_bill_actions_table(b, action_table)
    versions = self.parse_bill_versions_table(b, versions)

    self.save_bill(b)
def scrape(self, chamber, session):
    if int(session) < 2016:
        legacy = NHLegacyBillScraper(self.metadata, self.output_dir,
                                     self.strict_validation)
        legacy.scrape(chamber, session)
        # This throws an error because object_count isn't being properly
        # incremented, even though it saves fine. So fake the output_names
        self.output_names = ['1']
        return

    self.cursor.execute(
        "SELECT legislationnbr, documenttypecode, "
        "LegislativeBody, LSRTitle, CondensedBillNo, HouseDateIntroduced, "
        "legislationID, sessionyear, lsr, SubjectCode FROM Legislation "
        "WHERE sessionyear = {} AND LegislativeBody = '{}'".format(
            session, body_code[chamber]))

    for row in self.cursor.fetchall():
        bill_id = row['CondensedBillNo']
        bill_title = row['LSRTitle'].replace('(New Title)', '').strip()

        if row['documenttypecode'] in bill_type_map:
            bill_type = bill_type_map[row['documenttypecode']]

        bill = Bill(session, chamber, bill_id, bill_title,
                    db_id=row['legislationID'], type=bill_type)

        status_url = ('http://www.gencourt.state.nh.us/bill_status/bill_'
                      'status.aspx?lsr={}&sy={}&sortoption=&txtsessionyear={}'
                      .format(row['lsr'], session, session))
        bill.add_source(status_url)

        self.scrape_actions(bill)
        self.scrape_sponsors(bill)
        self.scrape_votes(bill)
        self.scrape_subjects(bill, row['SubjectCode'])
        self.scrape_versions(bill)

        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page, "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(session, chamber, bill_id, title, type=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4]))
    bill.add_source(fake_source)

    try:
        bill['versions'] = self.versions[bill_id]
    except KeyError:
        bill['versions'] = []
        self.warning("No versions were found for {}".format(bill_id))

    try:
        bill['documents'] = self.documents[bill_num]
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_votes(bill)
    self.fix_prefiled_action_dates(bill)

    return bill
def scrape_bill(self, chamber, session):
    url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
    page = self.get(url).text
    page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|')

    for row in page:
        bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

        if bill_chamber != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
        bill_type = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial resolution',
            'CMR': 'concurrent memorial resolution'}[type_spec]

        if row[-1] != self.slug:
            continue

        bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
        bill.add_source(url)

        primary = row[11]
        if not primary:
            primary = row[12]
        if primary:
            bill.add_sponsor('primary', primary)

        # ftp://www.arkleg.state.ar.us/Bills/
        # TODO: Keep an eye on this post 2017 to see if they apply R
        # going forward.
        session_code = '2017R' if session == '2017' else session

        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/%s.pdf" % (session_code,
                                             bill_id.replace(' ', '')))
        bill.add_version(bill_id, version_url, mimetype='application/pdf')

        self.scrape_bill_page(bill)
        self.bills[bill_id] = bill
def scrape(self, chamber, session):
    self.validate_session(session)

    if chamber == 'upper':
        bill_no = 1
        abbr = 'SB'
    else:
        bill_no = 4001
        abbr = 'HB'

    while True:
        bill_page = self.scrape_bill(session, abbr, bill_no)

        # if we can't find a page, we must be done. This is a healthy thing.
        if bill_page is None:
            return

        bill_page = BeautifulSoup(bill_page)

        title = ''.join(self.flatten(
            bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
        title = title.replace('\n', '').replace('\r', '')
        bill_id = "%s %d" % (abbr, bill_no)

        the_bill = Bill(session, chamber, bill_id, title)

        # sponsors
        first = 0
        for name in bill_page.findAll(
                id='frg_billstatus_SponsorList')[0].findAll('a'):
            the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
            first = 1

        # versions
        for doc in bill_page.findAll(
                id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
            r = self.parse_doc(the_bill, doc)
            if r:
                the_bill.add_version(*r)

        # documents
        if 'frg_billstatus_HlaTable' in str(bill_page):
            for doc in bill_page.findAll(
                    id='frg_billstatus_HlaTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)
        if 'frg_billstatus_SfaSection' in str(bill_page):
            for doc in bill_page.findAll(
                    id='frg_billstatus_SfaSection')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)

        self.parse_actions(the_bill, bill_page.findAll(
            id='frg_billstatus_HistoriesGridView')[0])
        self.save_bill(the_bill)

        bill_no = bill_no + 1
def parse_bill(self, session, chamber, line):
    (type, combined_id, number, title, relating_to) = line.split(u"\xe4")
    if ((type[0] == 'H' and chamber == 'lower') or
            (type[0] == 'S' and chamber == 'upper')):
        # basic bill info
        bill_id = "%s %s" % (type, number)
        # lookup type without chamber prefix
        bill_type = self.bill_types[type[1:]]
        # may encounter an ellipsis in the source data
        title = title.replace(u'\x85', '...')
        self.all_bills[bill_id] = Bill(session, chamber, bill_id, title,
                                       type=bill_type)