def scrape(self, session, chambers):
    """Scrape all bills for a Quebec National Assembly session.

    Builds one Bill per row of the session's bill-list table, attaches the
    actions gathered by scrape_actions(), then fetches sponsors from the
    bill's detail page before saving.
    """
    # The site's URLs use '-' where the session id uses ':' (e.g. '39:2' -> '39-2').
    urlified_session_id = session.replace(':', '-')
    url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/projets-loi-%s.html' % urlified_session_id
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # scrape all the actions for this session
    actions = self.scrape_actions(urlified_session_id)

    for row in doc.xpath('//table[@id="tblListeProjetLoi"]/tbody/tr'):
        id_td, details_td = row.xpath('td')[:2]
        bill_id = clean_spaces(id_td.text_content())
        pdf_link = details_td.xpath('p[@class="lienAssocie"]//a')[0]
        bill_name = clean_spaces(pdf_link.text_content())
        pdf_url = pdf_link.xpath('@href')[0]
        detail_url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/projet-loi-%s-%s.html' % (bill_id, urlified_session_id)

        bill = Bill(session, 'lower', bill_id, bill_name)
        bill.add_source(url)
        bill.add_source(detail_url)
        bill.add_source(pdf_url)

        # add actions -- a bill missing from the actions report previously
        # raised KeyError and aborted the whole scrape, so default to [].
        for action in actions.get(bill_id, []):
            bill.add_action('lower', action['name'], action['date'])

        # get sponsors
        self.scrape_details(bill, detail_url)
        self.save_bill(bill)
def scrape(self, session, chambers):
    """Scrape Newfoundland & Labrador bills from the progress-of-bills table.

    NOTE(review): the URL is hard-coded to General Assembly 47, session 1
    and ignores the `session` argument -- confirm whether it should be
    derived from `session` instead.
    """
    # Get the progress table.
    url = 'http://www.assembly.nl.ca/business/bills/ga47session1.htm'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    for tr in doc.xpath('//table[@class="bills"]/tr')[1:]:
        bill_id = clean_spaces(tr[0].text_content()).strip('*')
        if not bill_id:
            break  # empty rows extend past actual list of bills
        if bill_id.endswith("."):
            bill_id = bill_id[:-1]

        title = clean_spaces(tr[1].text_content())
        chapter = tr[-1].text_content()
        bill = Bill(session, 'lower', bill_id, title, type='bill')
        if chapter:
            bill['chapter'] = chapter

        # FIXME need to do more work to figure out what
        # version the text *really* is
        td = tr[1]
        bill_url = td.xpath('a/@href')
        if bill_url:
            bill.add_version(url=bill_url.pop(), name='First Reading',
                             mimetype='text/html')

        # Actions and version urls.
        data = zip([
            'First Reading', 'Second Reading', 'Committee', 'Amendments',
            'Third Reading', 'Royal Assent', 'Act'], tr[2:-1])
        for action, td in data:
            date_text = td.text_content()
            fmt = r'%b. %d/%Y'
            try:
                date = datetime.datetime.strptime(date_text, fmt)
            except ValueError:
                # Cell has no parseable date; this stage hasn't happened yet.
                continue
            # BUG FIX: a leftover `else: break` (from a removed multi-format
            # loop) used to exit this loop after the first parsed date, so
            # bill.add_action was never reached.
            attrs = dict(action=action, date=date, actor='lower')
            attrs.update(self.categorizer.categorize(action))
            bill.add_action(**attrs)

        bill.add_source(url)
        self.save_bill(bill)
def scrape(self, session, chambers):
    """Scrape every Ontario bill listed for the given parliamentary session."""
    listing_url = ('http://www.ontla.on.ca/web/bills/bills_all.do'
                   '?locale=en&parlSessionID=%s' % session)
    page = lxml.html.fromstring(self.urlopen(listing_url))
    page.make_links_absolute(listing_url)

    for row in page.xpath('//table/tr'):
        id_cell, title_cell, sponsor_cell = row.xpath('td')
        bill_id = id_cell.text_content().strip()
        title = clean_spaces(title_cell.text_content())

        # sponsor info is pulled off the detail page, not this listing
        bill = Bill(session, 'lower', bill_id, title)

        bill_href = title_cell.xpath('a/@href')[0]
        detail_url = bill_href + "&detailPage=bills_detail_status"
        bill.add_source(listing_url)
        bill.add_source(detail_url)

        # actions & sponsors come from the status detail page
        self.scrape_details(bill, detail_url)

        if not bill['versions']:
            self.warning('no versions detected via normal method, using '
                         'top-level page')
            bill.add_version('Original (current version)', bill_href,
                             mimetype='text/html')

        self.save_bill(bill)
def scrape_details(self, bill, detail_url):
    """Add versions, sponsors and actions from an Ontario bill detail page."""
    page = lxml.html.fromstring(self.urlopen(detail_url))

    # Versions: every <option> after the placeholder maps to a printable stage.
    for option in page.xpath('//option')[1:]:
        stage_name = clean_spaces(option.text_content())
        stage_url = detail_url + '&BillStagePrintId=' + option.get('value')
        bill.add_version(stage_name, stage_url, mimetype='text/html',
                         on_duplicate='use_new')
        # can get PDF links as well by opening doc & looking for 'pdf'
        #version_doc = lxml.html.fromstring(self.urlopen(version_url))

    # Sponsors.
    for anchor in page.xpath('//span[@class="pSponsor"]/a'):
        bill.add_sponsor('primary', clean_spaces(anchor.text_content()))
    for anchor in page.xpath('//span[@class="sSponsor"]/a'):
        bill.add_sponsor('cosponsor', clean_spaces(anchor.text_content()))

    # Actions: skip the header row.
    for row in page.xpath('//table//tr')[1:]:
        date_text, stage_text, activity_text, committee_text = row.xpath('td/text()')
        when = datetime.strptime(clean_spaces(date_text), "%B %d, %Y")
        stage = clean_spaces(stage_text)
        activity = clean_spaces(activity_text)
        committee = clean_spaces(committee_text)
        # action prefixed with stage if present
        description = '%s - %s' % (stage, activity) if stage else activity
        # default to lower, use committee if present
        actor = committee if committee else 'lower'
        bill.add_action(actor, description, when)
def scrape_actions(self, session_id):
    """
    Scrapes all the actions for all the bills in a given session, and
    returns them as a dictionary keyed by bill ID.
    """
    actions_list_url = 'http://www.assnat.qc.ca/fr/travaux-parlementaires/projets-loi/rapport/projets-loi-%s.html' % session_id
    actions_doc = lxml.html.fromstring(self.urlopen(actions_list_url))

    # compile regular expressions for bill action dates
    # (raw strings: '\d' in a plain literal is an invalid escape sequence)
    long_date_pattern = re.compile(r'\d\d? \w+ \d\d\d\d')
    short_date_pattern = re.compile(r'\d\d\d\d-\d\d-\d\d')

    # Dates are in French, so strptime needs a French locale for %B.
    # Set it once here -- the original reset this process-wide state on
    # every single action, which was redundant.
    locale.setlocale(locale.LC_ALL, 'fr_CA.utf8')

    # Make a dictionary of actions for each bill number
    actions = dict()
    for td in actions_doc.xpath('//table[@id="tblListeProjetLoi"]/tbody/tr/td'):
        bill_number = td.xpath('div/div/div')[0].text_content()
        bill_number = clean_spaces(bill_number.strip(u'N\xb0'))
        actions[bill_number] = []

        for action_row in td.xpath('div/div/table//tr'):
            action_name_td, action_date_td = action_row.xpath('td')
            action_name = action_name_td.text_content().strip(': ')
            action_date = clean_spaces(action_date_td.text_content())

            # Try the long French form first ('12 juin 2013'), then the ISO
            # form.  AttributeError: the regexp didn't match (search()
            # returned None); ValueError: the matched text failed strptime.
            # The original let ValueError escape from the first attempt and
            # used a bare `except:` on the fallback.
            try:
                matched = long_date_pattern.search(action_date).group(0)
                action_date = datetime.strptime(matched, '%d %B %Y')
            except (AttributeError, ValueError):
                try:
                    matched = short_date_pattern.search(action_date).group(0)
                    action_date = datetime.strptime(matched, '%Y-%m-%d')
                except (AttributeError, ValueError):
                    # Can't parse the date, so giving up
                    continue

            actions[bill_number].append({
                'name': action_name,
                'date': action_date,
            })

    return actions
def scrape_details(self, bill, detail_url):
    """Add sponsor information from a Quebec bill's detail page."""
    doc = lxml.html.fromstring(self.urlopen(detail_url))

    # Collect all the h3s together in a dict, keyed by their cleaned text.
    headings = {}
    for h3 in doc.xpath('//h3'):
        heading_text = clean_spaces(h3.text_content())
        if heading_text:
            headings[heading_text] = h3

    # sponsors
    # TODO: is it possible for there to be more than one sponsor?
    author_heading = headings.get('Auteur')
    if author_heading is not None:
        first_link = author_heading.xpath('following-sibling::*//a')[0]
        bill.add_sponsor('primary', first_link.text_content().strip())
def scrape_legislator(self, data, url, term):
    """Build and return a Legislator from an MLA detail page.

    `data` supplies extra Legislator kwargs collected by the caller.
    """
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    # Full name.
    full_name = doc.xpath('//b[starts-with(., "MLA:")]/text()').pop()
    if ':' in full_name:
        # split(':', 1) so a stray second colon can't break the unpack
        # (the original bare split(':') would raise ValueError).
        _, full_name = full_name.split(':', 1)
    full_name = clean_spaces(full_name)
    # BUG FIX: the original called full_name.strip('Hon. ') and discarded
    # the result (str.strip returns a new string).  Remove the honorific
    # prefix explicitly; a real .strip('Hon. ') would also eat legitimate
    # trailing name characters like 'n' or 'o'.
    if full_name.startswith('Hon. '):
        full_name = full_name[len('Hon. '):]

    # Offices
    # District: the markup varies, so probe several xpaths until one hits.
    # NOTE(review): if none matches, `district` remains an empty list and
    # is passed to Legislator as-is -- confirm that's acceptable upstream.
    for xpath in [('//b[starts-with(., "MLA:")]/../'
                   'following-sibling::p/b/i/text()'),
                  ('//b[starts-with(., "MLA:")]/../'
                   'following-sibling::p/em/b/text()'),
                  ('//b[starts-with(., "MLA:")]/../'
                   'following-sibling::p/em/strong/text()'),
                  ('//b[starts-with(., "MLA:")]/../'
                   'following-sibling::p/strong/em/text()')]:
        district = doc.xpath(xpath)
        if district:
            district = clean_spaces(district.pop())
            break

    # Party: same probing approach (same caveat as district on no match).
    for xpath in [('//b[starts-with(., "MLA:")]/../'
                   'following-sibling::p/b/text()'),
                  ('//b[starts-with(., "MLA:")]/../'
                   'following-sibling::p/strong/text()')]:
        party = doc.xpath(xpath)
        if party:
            party = clean_spaces(party.pop()).title()
            break

    email = doc.xpath('//a[starts-with(@href, "mailto:")]/text()').pop()

    # Phone/fax live in the cell following the label; when only one number
    # is found, a second is pulled from the label element's tail text.
    xpath = '//p[starts-with(., "Phone:")]/../following-sibling::td[1]'
    phone = [p.text_content() for p in doc.xpath(xpath)]
    if len(phone) == 1:
        phone.append(doc.xpath('//p[starts-with(., "Phone:")]')[-1][0].tail)

    xpath = '//p[starts-with(., "Fax:")]/../following-sibling::td[1]'
    fax = [p.text_content() for p in doc.xpath(xpath)]
    if len(fax) == 1:
        fax.append(doc.xpath('//p[starts-with(., "Fax:")]')[-1][0].tail)

    xpath = '//p[starts-with(., "Toll free:")]/../following-sibling::td[1]'
    toll_free = [p.text_content() for p in doc.xpath(xpath)]

    leg = Legislator(term=term, full_name=full_name, email=email,
                     district=district, party=party, chamber='lower',
                     **data)
    leg['toll_free_phone'] = toll_free
    leg['url'] = url

    # Constituencies -- one office per "Constituency:" label; numbers are
    # popped off the phone/fax lists collected above.
    for dist_office in doc.xpath('//b[contains(., "Constituency:")]'):
        dist_office = dist_office.getparent().getparent().text_content()
        _, dist_office = dist_office.split(':')
        dist_office = dist_office.strip()
        leg.add_office('district', 'Constituency Office',
                       address=dist_office,
                       phone=phone.pop(), fax=fax.pop())

    # Capitol
    xpath = '//*[starts-with(., "Office:")]/../../text()'
    capitol_address = doc.xpath(xpath)
    capitol_address = '\n'.join(s.strip() for s in capitol_address)
    capitol_address = capitol_address.strip()
    leg.add_office('capitol', 'Office',
                   address=capitol_address,
                   phone=phone.pop(), fax=fax.pop())

    leg.add_source(url, page="legislator detail page")
    return leg
def scrape(self, session, chambers):
    """Scrape BC bills from the session's progress-of-bills table.

    Each table row yields one Bill; the per-stage columns supply both the
    action date and, when linked, a version URL named after the stage.
    """
    # Get the progress table.
    url = 'http://www.leg.bc.ca/%s/votes/progress-of-bills.htm' % session
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    # Session bounds, used below to guess the year of the yearless dates.
    session_start = self.metadata['session_details'][session]['start_date']
    session_end = self.metadata['session_details'][session]['end_date']

    for tr in doc.xpath('//table[@class="votestable"]/tr')[1:]:
        bill_id = clean_spaces(tr[0].text_content()).strip('*')
        if 'Ruled out of order' in bill_id:
            continue
        title = clean_spaces(tr[1].text_content())
        if title == 'Title':
            # This is a header row.
            continue
        sponsor = clean_spaces(tr[2].text_content())
        chapter = tr[-1].text_content()

        bill = Bill(session, 'lower', bill_id, title, type='bill')
        bill.add_sponsor(name=sponsor, type='primary')
        if chapter:
            bill['chapter'] = chapter

        # Actions and version urls.
        data = zip([
            'Reading', 'Second Reading', 'Committee', 'Report',
            'Amended', 'Third Reading', 'Royal Assent',
            'S.B.C. Chap. No.'], tr[3:-1])
        for action, td in data:
            version_url = td.xpath('a/@href')
            if version_url:
                bill.add_version(url=version_url.pop(), name=action,
                                 mimetype='text/html')

            # Cells show a month/day with no year, with or without a
            # period after the month abbreviation.
            date_text = td.text_content()
            date = None
            for fmt in [r'%b %d', r'%b. %d']:
                try:
                    date = datetime.datetime.strptime(date_text, fmt)
                except ValueError:
                    continue
                else:
                    break
            if date is None:
                # No date means this stage hasn't occurred yet.
                continue

            # guess the year of the action: try the session's start year
            # first, fall back to its end year if out of range.
            date = datetime.datetime(month=date.month, day=date.day,
                                     year=session_start.year)
            if date < session_start or date > session_end:
                date = datetime.datetime(month=date.month, day=date.day,
                                         year=session_end.year)
                if date < session_start or date > session_end:
                    self.error('action %s appears to have occured on %s, '
                               'which is outside of session', action, date)
            # XXX: it should be noted that this isn't perfect
            # if a session is longer than a year there's a chance we get
            # the action date wrong (with a preference for the earliest
            # year)
            # in practice this doesn't seem to happen, and hopefully
            # if/when it does they will add years to these action dates

            attrs = dict(action=action, date=date, actor='lower')
            attrs.update(self.categorizer.categorize(action))
            bill.add_action(**attrs)

        bill.add_source(url)
        self.save_bill(bill)