def get_bill_information(self, bill_id, chamber, session):
    with self.urlopen(BILL_INFO_URL, 'POST',
                      body="hListBills=" + bill_id) as bill_info_page:
        self.log("Got bill info")
        page = lxml.html.fromstring(bill_info_page)

        # TODO: check whether page is an error page and raise the custom
        # exception defined above
        bs = page.xpath('//div/b')
        for b in bs:
            containing_div = b.getparent()
            if b.text == "BY":
                l = containing_div.text_content().strip(u'BY\xa0').split(',')
                sponsors = map(lambda x: x.strip(' '), l)
            if b.text.strip(u',\xa0') == "ENTITLED":
                title = containing_div.text_content().lstrip(u'ENTITLED,\xa0')

        divs = page.xpath('//div')
        bill_type = ""
        for div in divs:
            text = div.text_content()
            for ind, reg in enumerate(self.type_regs):
                if reg.match(text):
                    bill_type = self.bill_types[ind]

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        for ind, sponsor in enumerate(sponsors):
            if ind == 0:
                bill.add_sponsor('primary', sponsor)
            else:
                bill.add_sponsor('cosponsor', sponsor)
        return bill
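# A minimal sketch (an assumption, not from the original scraper) of the
# parallel `type_regs` / `bill_types` attributes the loop above indexes by
# position: when type_regs[i] matches the page text, bill_types[i] is used.
import re

type_regs = [
    re.compile(r'JOINT RESOLUTION'),
    re.compile(r'RESOLUTION'),
    re.compile(r'FOR AN ACT'),
]
bill_types = ['joint resolution', 'resolution', 'bill']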
def parse_senate_billpage(self, bill_url, year):
    with self.urlopen(bill_url) as bill_page:
        bill_page = BeautifulSoup(bill_page)

        # get all the info needed to record the bill
        bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
        bill_title = bill_page.find(id="lblBillTitle").font.string
        bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
        bill_lr = bill_page.find(id="lblLRNum").font.string

        bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                    bill_lr=bill_lr, official_title=bill_title)
        bill.add_source(bill_url)

        # get the primary sponsor (attributes are read with item access,
        # not `.href`, which is not a Tag attribute)
        sponsor_tag = bill_page.find(id="hlSponsor")
        bill_sponsor = sponsor_tag.i.font.contents[0]
        bill_sponsor_link = sponsor_tag['href']
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        cosponsor_tag = bill_page.find(id="hlCoSponsors")
        if cosponsor_tag and cosponsor_tag.get('href'):
            self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

        # get the actions
        action_url = bill_page.find(id="hlAllActions")['href']
        self.parse_senate_actions(bill, action_url)

        # full bill versions are stored on a separate page
        versions_url = bill_page.find(id="hlFullBillText")
        if versions_url:
            self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
def scrape_bill_info(self, chamber, session):
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    page = self.urlopen(info_url)
    page = csv.DictReader(StringIO.StringIO(page))

    abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

    for row in page:
        bill_id = row['bill_num']
        if not bill_id[0] == abbrev:
            continue

        if re.match(r'^(S|H)J', bill_id):
            bill_type = 'joint resolution'
        elif re.match(r'^(S|H)R', bill_id):
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id,
                    row['bill_title'].decode('latin-1'),
                    type=bill_type)
        bill.add_source(info_url)

        self.scrape_bill_page(bill)

        for introducer in self._introducers[bill_id]:
            bill.add_sponsor('introducer', introducer)

        bill['subjects'] = self._subjects[bill_id]
        self.bills[bill_id] = bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrapes documents, actions, vote counts and votes for a given bill."""
    session_id = self.get_session_id(session)
    bill_json_url = (
        'https://apps.azleg.gov/api/Bill/?billNumber={}'
        '&sessionId={}&legislativeBody={}'
    ).format(bill_id, session_id, self.chamber_map[chamber])
    response = self.get(bill_json_url)
    page = json.loads(response.content)

    bill_title = page['ShortTitle']
    bill_id = page['Number']
    internal_id = page['BillId']
    bill_type = self.get_bill_type(bill_id)

    bill = Bill(
        session=session,
        chamber=chamber,
        bill_id=bill_id,
        title=bill_title,
        type=bill_type
    )

    self.scrape_actions(bill, page)
    self.scrape_versions(bill, internal_id)
    self.scrape_sponsors(bill, internal_id)
    self.scrape_subjects(bill, internal_id)

    bill_url = (
        'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'
    ).format(internal_id, session_id)
    bill.add_source(bill_url)

    bill = self.sort_bill_actions(bill)
    self.save_bill(bill)
def scrape_regular_row(self, chamber, session, row):
    """Returns bill attributes from row."""
    params = {}
    params['session'] = session
    params['chamber'] = chamber

    b = row.xpath('td/font/a[contains(@id, "HyperLink1")]')
    if b:  # ignore if no match
        bill_status_url = b[0].attrib['href']
        bill_url = row.xpath('td/font/span[contains(@id, "_Label2")]')[0].text
        params['bill_id'] = b[0].xpath('font')[0].text.split()[0]
        params['title'] = row.xpath(
            'td/font/span[contains(@id, "_Label1")]/u/font')[0].text

        subject = row.xpath('td/font/span[contains(@id, "_Label6")]')[0].text
        subject = subject.replace('RELATING TO ', '')  # remove lead text
        params['subjects'] = [subject.replace('.', '')]

        params['description'] = row.xpath(
            'td/font/span[contains(@id, "_Label2")]')[0].text
        sponsors = row.xpath('td/font/span[contains(@id, "_Label7")]')[0].text
        params['companion'] = row.xpath(
            'td/font/span[contains(@id, "_Label8")]')[0].text

        bill = Bill(**params)
        for sponsor in sponsors.split(', '):
            bill.add_sponsor('primary', sponsor)

        self.scrape_actions(bill, bill_status_url)
        bill.add_source(bill_status_url)
        self.save_bill(bill)
    return
def scrape_bill(self, chamber, session, bill_id):
    biennium = "%s-%s" % (session[0:4], session[7:9])
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, biennium, bill_num))

    with self.urlopen(url) as page:
        page = lxml.etree.fromstring(page).xpath(
            "//wa:Legislation", namespaces=self._ns)[0]

        title = page.xpath("string(wa:LongDescription)",
                           namespaces=self._ns)

        bill_type = page.xpath(
            "string(wa:ShortLegislationType/wa:LongLegislationType)",
            namespaces=self._ns).lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(session, chamber, bill_id, title, type=[bill_type])

        chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
        # use the computed biennium rather than a hardcoded "2011-12"
        version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                       "Htm/Bills/%s %ss/%s.htm" % (biennium, chamber_name,
                                                    bill_type.title(),
                                                    bill_num))
        bill.add_version(bill_id, version_url)

        self.scrape_sponsors(bill)
        self.scrape_actions(bill)

        self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_url):
    try:
        page = self.lxmlize('{}{}'.format(CO_URL_BASE, bill_url))
    except scrapelib.HTTPError as e:
        if e.response.status_code == 503:
            self.error('Skipping %s w/ 503', bill_url)
            return
        else:
            raise

    bill_number = page.xpath(
        '//div[contains(@class,"field-name-field-bill-number")]'
        '//div[contains(@class,"field-item even")][1]/text()')[0].strip()

    bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

    bill_summary = page.xpath(
        'string(//div[contains(@class,"field-name-field-bill-summary")])')
    bill_summary = bill_summary.strip()

    bill = Bill(session, chamber, bill_number, bill_title,
                summary=bill_summary)
    bill.add_source('{}{}'.format(CO_URL_BASE, bill_url))

    self.scrape_sponsors(bill, page)
    self.scrape_actions(bill, page)
    self.scrape_versions(bill, page)
    self.scrape_research_notes(bill, page)
    self.scrape_fiscal_notes(bill, page)
    self.scrape_committee_report(bill, page)
    self.scrape_votes(bill, page)
    self.scrape_amendments(bill, page)

    self.save_bill(bill)
def parse_bill(self, chamber, session, special, link):
    bill_num = link.text.strip()
    type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

    if type_abbr == 'B':
        btype = ['bill']
    elif type_abbr == 'R':
        btype = ['resolution']

    bill_id = "%s%s %s" % (bill_abbr(chamber), type_abbr, bill_num)

    url = info_url(chamber, session, special, type_abbr, bill_num)
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath(
            "//td[text() = 'Short Title:']/following-sibling::td")[0]
        title = title.text.strip()

        bill = Bill(session, chamber, bill_id, title, type=btype)
        bill.add_source(url)

        self.parse_bill_versions(bill, page)

        self.parse_history(bill,
                           history_url(chamber, session, special,
                                       type_abbr, bill_num))

        self.parse_votes(bill,
                         vote_url(chamber, session, special,
                                  type_abbr, bill_num))

        self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """
    Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    chamber = 'lower' if chamber.lower() == 'house' else chamber
    chamber = 'upper' if chamber.lower() == 'senate' else chamber

    # Get html and parse
    doc = self.lxmlize(bill_detail_url)

    # Get the basic parts of the bill
    bill_id = self.get_node(doc, '//h1/text()')
    self.logger.debug(bill_id)
    bill_title_text = self.get_node(
        doc, '//h2[text()[contains(., "Description")]]'
             '/following-sibling::p/text()')
    if bill_title_text is not None:
        bill_title = bill_title_text.strip()
    else:
        long_desc_url = self.get_node(
            doc, '//a[text()[contains(., "Long Description")]]/@href')
        long_desc_page = self.lxmlize(long_desc_url)
        long_desc_text = self.get_node(
            long_desc_page, '//h1/following-sibling::p/text()')
        if long_desc_text is not None:
            bill_title = long_desc_text.strip()
        else:
            bill_title = 'No title found.'
            self.logger.warning('No title found for {}.'.format(bill_id))
    self.logger.debug(bill_title)

    bill_type = {'F': 'bill', 'R': 'resolution',
                 'C': 'concurrent resolution'}[bill_id[1]]
    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    # Add source
    bill.add_source(bill_detail_url)

    # Add subjects. Currently we are not mapping to Open States
    # standardized subjects, so use 'scraped_subjects'
    bill['scraped_subjects'] = self._subject_mapping[bill_id]

    # Get companion bill.
    companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]'
                          '/a[starts-with(@href, "?")]/text()')
    companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None
    companion_chamber = self.chamber_from_bill(companion)
    if companion is not None:
        bill.add_companion(companion, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    biennium = "%s-%s" % (session[0:4], session[7:9])
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, biennium, bill_num))

    page = self.urlopen(url)
    page = lxml.etree.fromstring(page.bytes)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(session, chamber, bill_id, title, type=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (
                       bill_num, session[0:4]))
    bill.add_source(fake_source)

    chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
    mimetype = 'text/html'
    version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                   "Htm/Bills/%s %ss/%s.htm" % (biennium, chamber_name,
                                                bill_type.title(),
                                                bill_num))

    # Sometimes the measure's version_url isn't guessable. When that
    # happens, we have to get the url from the source page.
    try:
        version_resp = self.get(version_url)
        if version_resp.status_code != 200:
            webpage = self.get(fake_source).text
            webdoc = lxml.html.fromstring(webpage)
            version_url = webdoc.xpath(
                '//a[contains(@href, "billdocs")]/@href')[-1]
            if version_url.lower().endswith('.pdf'):
                mimetype = 'application/pdf'
    except scrapelib.HTTPError:
        pass

    bill.add_version(bill_id, version_url, mimetype=mimetype)

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_votes(bill)
    self.fix_prefiled_action_dates(bill)

    return bill
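# A minimal sketch of the namespace-aware `xpath` helper assumed above.
# The exact namespace URI bound to the `wa:` prefix is an assumption based
# on Washington's legislative web services, not taken from this snippet.
import lxml.etree

_WA_NS = {'wa': 'http://WSLWebServices.leg.wa.gov/'}

def xpath(elem, expr):
    # evaluate `expr` against `elem` with the `wa:` prefix bound
    return elem.xpath(expr, namespaces=_WA_NS)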
def parse_senate_billpage(self, bill_url, year):
    bill_page = self.urlopen(bill_url)
    bill_page = lxml.html.fromstring(bill_page)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath(
        '//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath(
        '//*[@id="lblBriefDesc"]')[0].text_content()
    bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self.subjects:
        subs = self.subjects[bid]
        self.log("With subjects for this bill")
        self.log(bid)

    bill = Bill(year, 'upper', bill_id, bill_desc, bill_lr=bill_lr,
                type=bill_type, subjects=subs)
    bill.add_source(bill_url)

    # get the primary sponsor
    sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
    bill_sponsor = sponsor.text_content()
    bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsor('primary', bill_sponsor,
                     sponsor_link=bill_sponsor_link)

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//*[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and 'href' in cosponsor_tag[0].attrib:
        self.parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # get the actions
    action_url = bill_page.xpath('//*[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib['href']
        self.parse_senate_actions(bill, action_url)

    # full bill text is stored on a separate page
    versions_url = bill_page.xpath('//*[@id="hlFullBillText"]')
    if len(versions_url) > 0 and 'href' in versions_url[0].attrib:
        self.parse_senate_bill_versions(
            bill, versions_url[0].attrib['href'])

    self.save_bill(bill)
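# A minimal sketch of the module-level `bill_types` mapping used above to
# classify a bill by the first three characters of its id. The triplets
# shown are illustrative guesses for Missouri Senate measures, not taken
# from the original scraper.
bill_types = {
    'SB ': 'bill',
    'SJR': 'joint resolution',
    'SCR': 'concurrent resolution',
}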
def scrape(self, chamber, session):
    self.log(self.metadata['session_details'])
    self.site_id = self.metadata['session_details'][session]['internal_id']

    chamber_piece = {'upper': 'Senate',
                     'lower': 'House+of+Representatives'}[chamber]

    # resolutions
    # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

    url = ('http://alisondb.legislature.state.al.us/acas/'
           'SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}'
           '&LegDay={All}&WhichBills=%s') % chamber_piece

    cookie = self.refresh_session()

    agent = FakeFirefoxURLopener()
    agent.addheader('Cookie', cookie)
    page = agent.open(url)
    doc = lxml.html.fromstring(page.read())

    # bills are all their own table with cellspacing=4 (skip first)
    bill_tables = doc.xpath('//table[@cellspacing="4"]')
    for bt in bill_tables[1:]:
        # each table has 3 rows: detail row, description, blank
        details, desc, _ = bt.xpath('tr')

        # first <tr> has img, button, sponsor, topic, current house,
        # current status, committee, committee2, last action
        _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

        # pull bill_id out of script tag (gross)
        bill_id = bill_id_re.search(button.text_content()).group()
        self.log(bill_id)
        oid = btn_re.search(button.text_content()).groups()[0]

        sponsor = sponsor.text_content()
        topic = topic.text_content()
        com1 = com1.text_content()
        com2 = com2.text_content()
        desc = desc.text_content()

        # create bill
        bill = Bill(session, chamber, bill_id, desc.strip(), topic=topic)
        bill.add_sponsor('primary', sponsor)

        self.get_sponsors(bill, oid)
        self.get_actions(bill, oid)

        # craft bill URL
        session_fragment = '2010rs'
        type_fragment = 'bills'
        bill_id_fragment = bill_id.lower()
        bill_text_url = ('http://alisondb.legislature.state.al.us/acas/'
                         'searchableinstruments/%s/%s/%s.htm' % (
                             session_fragment, type_fragment,
                             bill_id_fragment))
        bill.add_version('bill text', bill_text_url)

        self.save_bill(bill)
def scrape(self, session, chambers):
    urlified_session_id = session.replace(':', '-')
    url = ('http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/'
           'projets-loi-%s.html' % urlified_session_id)
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # scrape all the actions for this session
    actions = self.scrape_actions(urlified_session_id)

    for row in doc.xpath('//table[@id="tblListeProjetLoi"]/tbody/tr'):
        id_td, details_td = row.xpath('td')[:2]
        bill_id = clean_spaces(id_td.text_content())

        pdf_link = details_td.xpath('p[@class="lienAssocie"]//a')[0]
        bill_name = clean_spaces(pdf_link.text_content())
        pdf_url = pdf_link.xpath('@href')[0]
        detail_url = ('http://www.assnat.qc.ca/fr/travaux-parlementaires/'
                      'projets-loi/projet-loi-%s-%s.html' % (
                          bill_id, urlified_session_id))

        bill = Bill(session, 'lower', bill_id, bill_name)
        bill.add_source(url)
        bill.add_source(detail_url)
        bill.add_source(pdf_url)

        # add actions
        for action in actions[bill_id]:
            bill.add_action('lower', action['name'], action['date'])

        # get sponsors
        self.scrape_details(bill, detail_url)

        self.save_bill(bill)
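# A minimal sketch of the `clean_spaces` helper assumed above: collapse runs
# of whitespace (including non-breaking spaces in unicode text) to single
# spaces and trim the ends. The name is taken from the call sites; the body
# is an assumption.
import re

def clean_spaces(text):
    return re.sub(r'\s+', ' ', text, flags=re.UNICODE).strip()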
def scrape(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billindex/"
           "BillCrossRef.aspx?type=%s" % (session, chamber_abbrev))
    page = lxml.html.fromstring(self.urlopen(url))

    for tr in page.xpath("//tr[@valign='middle']")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id, title, type=bill_type)

        self.scrape_digest(bill)

        # versions
        for a in (tr.xpath('td[6]//a') + tr.xpath('td[9]//a') +
                  tr.xpath('td[10]//a')):
            bill.add_version(a.text, a.get('href'))

        # documents
        fnote = tr.xpath('td[7]//a')
        if fnote:
            bill.add_document('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[12]//a')
        if summary:
            bill.add_document('Summary', summary[0].get('href'))

        bill.add_source(url)
        self.save_bill(bill)
def scrape2009(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Bill
        name = page.cssselect('#legislation h1')[0].text_content().strip()
        bill_id = name.split(' - ')[0].strip()
        bill = Bill(session, chamberName, bill_id, name)

        # Sponsorships
        for a in page.cssselect("#sponsors a"):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in page.cssselect('#history tr')[1:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions
        for a in page.cssselect('#versions a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))

        self.save_bill(bill)
def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
    try:
        doc = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        assert '500' in e.args[0], \
            "Unexpected error when accessing page: {}".format(e)
        self.warning("500 error for bill page; skipping bill")
        return

    # bill id, title, summary
    bill_num = re.findall(r'DocNum=(\d+)', url)[0]
    bill_type = bill_type or DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num

    title = doc.xpath(
        '//span[text()="Short Description:"]'
        '/following-sibling::span[1]/text()')[0].strip()
    summary = doc.xpath(
        '//span[text()="Synopsis As Introduced"]'
        '/following-sibling::span[1]/text()')[0].strip()

    bill = Bill(session, chamber, bill_id, title, type=bill_type,
                summary=summary)
    bill.add_source(url)

    # sponsors
    sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))
    # don't add just yet; we can make them better using action data

    # actions
    action_tds = doc.xpath(
        '//a[@name="actions"]/following-sibling::table[1]/td')
    for date, actor, action in group(action_tds, 3):
        date = datetime.datetime.strptime(date.text_content().strip(),
                                          "%m/%d/%Y")
        actor = actor.text_content()
        if actor == 'House':
            actor = 'lower'
        elif actor == 'Senate':
            actor = 'upper'

        action = action.text_content()
        bill.add_action(actor, action, date, **_categorize_action(action))
        if action.lower().find('sponsor') != -1:
            self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

    # now add sponsors
    for spontype, sponsor, chamber, official_type in sponsor_list:
        if chamber:
            bill.add_sponsor(spontype, sponsor,
                             official_type=official_type, chamber=chamber)
        else:
            bill.add_sponsor(spontype, sponsor,
                             official_type=official_type)

    # versions
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)

    # if there's more than one votehistory link, there are votes to grab
    if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
        votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
        self.scrape_votes(session, bill, votes_url)

    self.save_bill(bill)
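# A minimal sketch of the `group` helper assumed by the action loop above:
# chunk a flat sequence of cells into fixed-size tuples (here, triplets of
# date / actor / action <td> elements). The name comes from the call site;
# the implementation is the standard zip-the-same-iterator idiom.
def group(seq, size):
    # zipping `size` references to one iterator yields consecutive chunks
    return zip(*[iter(seq)] * size)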
def _parse_bill(self, session, chamber, source_url, line):
    if line:
        (type, combined_id, number, title,
         relating_to) = line.split("\xe4")

        if ((type == 'HB' and chamber == 'lower') or
                (type == 'SB' and chamber == 'upper')):
            # basic bill info
            bill_id = "%s %s" % (type, number.zfill(4))
            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(source_url)

            # add actions
            if bill_id in self.actionsByBill:
                for a in self.actionsByBill[bill_id]:
                    bill.add_action(a['actor'], a['action'], a['date'])

            if self.load_versions_sponsors:
                # add versions and sponsors
                versionsSponsors = \
                    self.versionsSponsorsParser.fetch_and_parse(
                        self, session, bill_id)
                if versionsSponsors:
                    for ver in versionsSponsors['versions']:
                        bill.add_version(ver['name'], ver['url'])

                    sponsorType = 'primary'
                    if len(versionsSponsors['sponsors']) > 1:
                        sponsorType = 'cosponsor'
                    for name in versionsSponsors['sponsors']:
                        bill.add_sponsor(sponsorType, name)

            # save - writes out JSON
            self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    url = '%s?r=%s' % (self.base_url, bill_id)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # search for Titulo; the accent over the i messes up lxml,
        # so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]'
            u'/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        bill.add_sponsor('primary', author.strip())

        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')

            # ignore rows missing a date
            if len(tds) != 2:
                continue

            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
            action = tds[1].text_content()
            bill.add_action(chamber, action, date)

            # the action may also have an associated version
            if tds[1].xpath('a'):
                bill.add_version(action, tds[1].xpath('a/@href')[0])

        bill.add_source(url)
        self.save_bill(bill)
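# A minimal sketch of the NoSuchBill exception raised above; the real
# scraper presumably defines it at module level, so this body is assumed.
class NoSuchBill(Exception):
    """Raised when the requested bill id does not exist on the site."""
    pass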
def scrape1999(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('table')

        # Bill
        name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

        # Sponsorships
        for a in tables[2].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in tables[-1].cssselect('tr'):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.save_bill(bill)
def scrape2003(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('center table')

        # Bill
        name = tables[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in tables[1].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        center = page.cssselect('center table center')[0]
        for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions
        for a in center.cssselect('table')[-1].cssselect('a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))

        self.save_bill(bill)
def scrape2001(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect("table center table")

        # Bill
        name = tables[0].text_content().split("-", 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in tables[1].cssselect("a"):
            bill.add_sponsor("", a.text_content().strip())

        # Actions
        center = page.cssselect("table center")[-1]
        for row in center.cssselect("table table")[0].cssselect("tr")[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if "/" not in date:
                continue
            if action_text.startswith("Senate"):
                action_text = action_text.split(" ", 1)[1].strip()
                bill.add_action("upper", action_text, date)
            elif action_text.startswith("House"):
                action_text = action_text.split(" ", 1)[1].strip()
                bill.add_action("lower", action_text, date)

        # Versions
        for a in center.cssselect("table table")[1].cssselect("a"):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get("href")))

        self.save_bill(bill)
def scrape_current(self, chamber, term):
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]

    # perhaps we should save this data so we can make one request for both?
    with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']

        for bill_data in bills:
            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'

            # main
            bill = Bill(term, chamber, bill_id, bill_data['SHORTTITLE'],
                        type=btype, status=bill_data['STATUS'])
            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if bill_data['LONGTITLE']:
                bill.add_title(bill_data['LONGTITLE'])

            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                bill.add_sponsor(stype, sponsor)

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')

                date = datetime.datetime.strptime(
                    event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")

                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = 'other'
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(actor, action, date, type=atype)

            self.scrape_html(bill)
            self.save_bill(bill)
def scrape_bill_status_page(self, url, params={}):
    """Scrapes the status page url, populating the parameter dict,
    and returns the bill."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        params["bill_id"] = page.xpath(
            '//h3[contains(@class, "center")]/a')[0].text.split()[0]
        params["title"] = page.xpath(
            '//div[div[contains(., "Report Title")]]'
            '/div[contains(@class, "rightside")]')[0].text.strip()

        sponsors = page.xpath(
            '//div[div[contains(., "Introducer")]]'
            '/div[contains(@class, "rightside")]')[0].text

        subject = page.xpath(
            '//div[div[contains(., "Measure Title")]]'
            '/div[contains(@class, "rightside")]')[0].text.strip()
        subject = subject.replace("RELATING TO ", "")  # remove lead text
        params["subject"] = subject.replace(".", "")

        params["description"] = page.xpath(
            '//div[div[contains(., "Description")]]'
            '/div[contains(@class, "rightside")]')[0].text
        params["companion"] = page.xpath(
            '//div[div[contains(., "Companion")]]'
            '/div[contains(@class, "rightside")]')[0].text

        if params["title"] == "":
            params["title"] = params["subject"]

        actions = []
        table = page.xpath('//table[tr/th[contains(., "Date")]]')[0]
        for row in table.xpath("tr[td]"):  # ignore the table header row
            action_params = {}
            cells = row.xpath("td")
            if len(cells) == 3:
                ch = cells[1].text
                action_params["actor"] = house[ch]
                action_params["action"] = cells[2].text
                # just get the date, ignore any time
                action_date = cells[0].text.split()[0]
                try:
                    action_params["date"] = datetime.strptime(
                        action_date, "%m/%d/%y")
                except ValueError:
                    # try a four-digit-year format
                    action_params["date"] = datetime.strptime(
                        action_date, "%m/%d/%Y")
                actions.append(action_params)

        bill = Bill(**params)
        bill.add_sponsor("primary", sponsors)
        for action_params in actions:
            bill.add_action(**action_params)

        self.save_bill(bill)
        return bill
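# A minimal sketch of the module-level `house` mapping used above to turn
# the chamber letter from Hawaii's status table into an Open States actor.
# The keys are assumptions based on how the table labels chambers.
house = {'S': 'upper', 'H': 'lower'}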
def scrape_xml(self, chamber, session):
    start_letter = "S" if chamber == "upper" else "H"
    sponsor_type_dict = {"3": "senate cosponsor", "4": "sponsor",
                         "5": "sponsor"}
    version_url = "http://www1.legis.ga.gov/legis/%s/versions/" % session

    summary_url = ("http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml"
                   % session)
    xml = self.urlopen(summary_url)
    doc = lxml.etree.fromstring(xml)

    for bxml in doc.xpath("//Bill"):
        type = bxml.get("Type")

        # if this is from the other chamber, skip it
        if not type.startswith(start_letter):
            continue

        bill_id = type + bxml.get("Num") + bxml.get("Suffix")
        if type in ("HB", "SB"):
            type = "bill"
        elif type in ("HR", "SR"):
            type = "resolution"
        else:
            raise ValueError("unknown type: %s" % type)

        # use short_title as title and long as description
        title = bxml.xpath("Short_Title/text()")[0]
        description = bxml.xpath("Title/text()")[0]

        bill = Bill(session, chamber, bill_id, title, type=type,
                    description=description)
        bill.add_source(summary_url)

        for sponsor in bxml.xpath("Sponsor"):
            sponsor_name, code = sponsor.text.rsplit(" ", 1)
            sponsor_name = sponsor_name.replace(",", ", ")
            bill.add_sponsor(sponsor_type_dict[sponsor.get("Type")],
                             sponsor_name, _code=code)

        for version in bxml.xpath("Versions/Version"):
            # NOTE: it is possible to get PDF versions by using .get('Id')
            # ex. URL: legis.ga.gov/Legislation/20112012/108025.pdf
            # for now we just get HTML
            description, file_id = version.xpath("*/text()")
            bill.add_version(description, version_url + file_id)

        for action in bxml.xpath("StatusHistory/Status"):
            date = datetime.datetime.strptime(action.get("StatusDate"),
                                              "%Y-%m-%dT%H:%M:%S")
            code = action.get("StatusCode")
            if code in ("EFF", "Signed Gov"):
                actor = "executive"
            elif code[0] == "S":
                actor = "upper"
            elif code[0] == "H":
                actor = "lower"

            atype = self._action_codes[code]
            bill.add_action(actor, action.text, date, atype)

        self.save_bill(bill)
def scrape_bill(self, term, bill_url):
    with self.urlopen(bill_url) as page:
        page = lxml.html.fromstring(page)

        chamber1 = page.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text

        if len(page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')) > 0:
            chamber2 = page.xpath(
                '//span[@id="lblCoBillSponsor"]/a[1]')[0].text

            if '*' in chamber1:
                bill_id = chamber1.replace(' ', '')[1:]
                secondary_bill_id = chamber2.replace(' ', '')
            else:
                bill_id = chamber2.replace(' ', '')[1:]
                secondary_bill_id = chamber1.replace(' ', '')

            primary_chamber = 'lower' if 'H' in bill_id else 'upper'
        else:
            primary_chamber = 'lower' if 'H' in chamber1 else 'upper'
            bill_id = chamber1.replace(' ', '')[1:]
            secondary_bill_id = None

        title = page.xpath("//span[@id='lblAbstract']")[0].text

        bill = Bill(term, primary_chamber, bill_id, title,
                    secondary_bill_id=secondary_bill_id)
        bill.add_source(bill_url)

        # primary sponsor
        sponsor = page.xpath(
            "//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        bill.add_sponsor('primary', sponsor)

        # co-sponsors are unavailable for scraping (loaded into the page
        # via AJAX)

        # full summary doc
        summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
        bill.add_document('Full summary', summary.get('href'))

        # actions
        tables = page.xpath("//table[@id='tabHistoryAmendments_"
                            "tabHistory_gvBillActionHistory']")
        actions_table = tables[0]
        action_rows = actions_table.xpath("tr[position()>1]")
        for ar in action_rows:
            action_taken = ar.xpath("td")[0].text
            action_date = datetime.datetime.strptime(
                ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
            # TODO: need to add secondary actions
            bill.add_action(primary_chamber, action_taken, action_date)

        votes_link = page.xpath("//span[@id='lblBillVotes']/a")
        if len(votes_link) > 0:
            votes_link = votes_link[0].get('href')
            bill = self.scrape_votes(
                bill, sponsor,
                'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    url = '%s?r=%s' % (self.base_url, bill_id)
    html = self.urlopen(url)
    if "error '80020009'" in html:
        self.warning('asp error on page, skipping %s', bill_id)
        return

    doc = lxml.html.fromstring(html)

    # search for Titulo; the accent over the i messes up lxml,
    # so use 'tulo'
    title = doc.xpath(
        u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
    if not title:
        raise NoSuchBill()
    bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

    author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
    for aname in author.split(','):
        aname = self.clean_name(aname).strip()
        if aname:
            bill.add_sponsor('primary', aname)

    co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
    if len(co_authors) != 0:
        for co_author in co_authors[1].split(','):
            bill.add_sponsor('cosponsor',
                             self.clean_name(co_author).strip())

    action_table = doc.xpath('//table')[-1]
    for row in action_table[1:]:
        tds = row.xpath('td')

        # ignore rows missing a date
        if len(tds) != 2:
            continue

        if tds[0].text_content():
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
        action = tds[1].text_content().strip()

        # parse the text to see if it's a new version or an unrelated
        # document; if it has a dash, assume it's a vote document

        # get the url of the action
        action_url = tds[1].xpath('a/@href')

        atype, action = self.parse_action(chamber, bill, action,
                                          action_url, date)

        if atype == 'bill:passed' and action_url:
            vote_chamber = None
            for pattern, vote_chamber in _voteChambers:
                if re.match(pattern, action):
                    break
            else:
                self.warning("couldn't find voteChamber pattern")

            if vote_chamber == 'lower' and len(action_url) > 0:
                vote = self.scrape_votes(action_url[0], action, date,
                                         vote_chamber)
                if not vote[0] == None:
                    vote[0].add_source(action_url[0])
                    bill.add_vote(vote[0])
                else:
                    self.warning('Problem reading vote: %s, %s' %
                                 (vote[1], bill_id))

    bill.add_source(url)
    self.save_bill(bill)
def scrape(self, chamber, session):
    year = year_from_session(session)
    url = bills_url(year)

    with self.urlopen(url) as bills_page_html:
        bills_page = lxml.html.fromstring(bills_page_html)
        table_rows = bills_page.cssselect('tr')
        # eliminate empty rows
        table_rows = table_rows[0:len(table_rows):2]

        for row in table_rows:
            row_elements = row.cssselect('td')

            bill_document = row_elements[0]
            bill_document.make_links_absolute(BASE_URL)
            element, attribute, link, pos = bill_document.iterlinks().next()
            # strip a trailing '.pdf' suffix (rstrip would strip any of the
            # characters '.', 'p', 'd', 'f', not the suffix)
            bill_id = element.text_content()
            if bill_id.endswith('.pdf'):
                bill_id = bill_id[:-4]
            bill_document_link = link

            title_and_sponsors = row_elements[1]
            title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]',
                                    title_and_sponsors.text_content())
            sponsors_match = re.search('[a-z]([A-Z]+.+)',
                                       title_and_sponsors.text_content())
            title = title_match.group(1)
            sponsors = sponsors_match.group(1)
            separated_sponsors = sponsors.split('--')

            bill = Bill(session, chamber, bill_id, title)
            bill.add_version('current', bill_document_link)

            if separated_sponsors[1] == '(NONE)':
                bill.add_sponsor('primary', separated_sponsors[0])
            else:
                bill.add_sponsor('cosponsor', separated_sponsors[0])
                bill.add_sponsor('cosponsor', separated_sponsors[1])

            versions_page_element = row_elements[2]
            versions_page_element.make_links_absolute(BASE_URL)
            element, attribute, link, pos = \
                versions_page_element.iterlinks().next()
            bill.add_source(link)
            self.scrape_versions(link, bill)

            actions_page_element = row_elements[3]
            element, attribute, link, pos = \
                actions_page_element.iterlinks().next()
            frame_link = BASE_URL + link.split('?Open&target=')[1]
            self.scrape_actions(frame_link, bill)

            votes_page_element = row_elements[7]
            element, attribute, link, pos = \
                votes_page_element.iterlinks().next()
            frame_link = BASE_URL + link.split('?Open&target=')[1]
            self.scrape_votes(frame_link, chamber, bill)
def scrape_bill_info(self, chamber, session):
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    page = urllib2.urlopen(info_url)
    page = csv.DictReader(page)

    abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

    for row in page:
        bill_id = row['bill_num']
        if not bill_id[0] == abbrev:
            continue

        bill = Bill(session, chamber, bill_id, row['bill_title'])
        bill.add_source(info_url)
        self.bills[bill_id] = bill