def scrape(self, session, chambers):
    """Scrape bills from the legassembly.sk.ca "Progress of Bills" PDF.

    Fetches the bills landing page, follows its "Progress of Bills" link,
    converts the linked PDF with ``pdf_to_lxml`` and saves one ``Bill`` for
    every bill link found in the converted document.

    ``session`` is passed through to each ``Bill``.  ``chambers`` is
    accepted but not used here; every bill and action is recorded against
    ``'lower'``.
    """
    index_url = 'http://www.legassembly.sk.ca/legislative-business/bills/'
    index = lxml.html.fromstring(self.urlopen(index_url))
    index.make_links_absolute(index_url)

    # pop() takes the last matching link and raises IndexError if none.
    url = index.xpath('//a[text() = "Progress of Bills"]/@href').pop()
    filename, resp = self.urlretrieve(url)
    doc = pdf_to_lxml(filename)

    # Labels are paired positionally with each bill's date fields.
    action_names = [
        'First Reading', 'Crown recommendation', 'Committee',
        'Second Reading', 'Committee', 'Amend Date', 'Third Reading',
        'Royal Assent', 'In Effect',
    ]

    def is_br(el):
        return el.tag == 'br'

    for link in doc.xpath('//a[contains(@href, "legdocs/Bills")]'):
        bill_id = link.text_content().strip()

        # The run of <br> siblings right after the link carries the bill's
        # fields in their tail text.
        breaks = list(takewhile(is_br, link.itersiblings()))

        # If the star is missing, insert it to avoid complicated code.
        if breaks[0].tail.strip() != '*':
            breaks.insert(0, DummyBR('br', None, '*'))

        # Read the fields in this order so a malformed entry fails the
        # same way it always has.
        title_parts = [breaks[1].tail.strip()]
        sponsor = breaks[2].tail.strip()
        date_fields = breaks[3].tail.split(u'\xa0')
        # Anything after the dates is treated as more title text.
        title_parts.extend((br.tail or '').strip() for br in breaks[4:])
        title = ' '.join(title_parts).strip()

        bill = Bill(session, 'lower', bill_id, title, type='bill')
        bill.add_sponsor(name=sponsor, type='primary')

        for action, raw_date in zip(action_names, date_fields):
            when = datetime.datetime.strptime(raw_date.strip(), '%Y-%m-%d')
            attrs = {'action': action, 'date': when, 'actor': 'lower'}
            attrs.update(self.categorizer.categorize(action))
            bill.add_action(**attrs)

        bill.add_source(url)
        bill.add_version(
            'Introduced', link.attrib['href'], mimetype='application/pdf')
        self.save_bill(bill)
def scrape_roll_call(self, chamber, session, idx):
    """Fetch one roll-call PDF and print its converted XML.

    The URL is built by interpolating this method's local names
    (``self``, ``chamber``, ``session`` and ``idx``) into
    ``self.roll_call_url_format``.

    Returns ``False`` when the download raises ``scrapelib.HTTPError``
    and ``True`` once the document has been converted and printed.
    """
    # locals() is the interpolation mapping, so the format string may
    # refer to any of the arguments by name.
    url = self.roll_call_url_format % locals()

    try:
        filename, _ = self.urlretrieve(url)
    except scrapelib.HTTPError:
        return False

    try:
        tree = pdf_to_lxml(filename)
    finally:
        # The downloaded file is removed whether or not conversion works.
        os.remove(filename)

    print(lxml.etree.tostring(tree, pretty_print=True))
    return True
def scrape_vote(self, bill, name, url):
    """Parse one roll-call vote PDF and attach the vote to ``bill``.

    ``name`` is the label of the vote link and must look like
    ``"<Senate|House> Vote on <...>,<motion>"``; for any other label the
    method returns immediately without touching ``bill``.  ``url`` is the
    PDF to download.

    The PDF is written to a temporary file, converted with
    ``pdf_to_lxml`` and the temporary file is always deleted afterwards,
    even when the download or conversion raises.

    If the document text has no ``Date: M/D/YYYY`` stamp a warning is
    logged and no vote is added.  Returns ``None`` in every case.
    """
    match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)
    if not match:
        return

    chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
    motion = match.group(2).strip()

    if motion.startswith('FINAL PASSAGE'):
        motion_type = 'passage'
    elif motion.startswith('AMENDMENT'):
        motion_type = 'amendment'
    elif 'ON 3RD READING' in motion:
        motion_type = 'reading:3'
    else:
        motion_type = 'other'

    vote = Vote(chamber, None, motion, None, None, None, None)
    vote['type'] = motion_type
    vote.add_source(url)

    fd, temp_path = tempfile.mkstemp()
    # urlretrieve writes to the path itself, so the descriptor handed back
    # by mkstemp is not needed; close it right away.
    os.close(fd)
    try:
        self.urlretrieve(url, temp_path)
        html = pdf_to_lxml(temp_path)
    finally:
        os.remove(temp_path)

    body = html.xpath('string(/html/body)')

    date_match = re.search(r'Date: (\d{1,2}/\d{1,2}/\d{4})', body)
    if date_match is None:
        self.warning("BAD VOTE: date error")
        return
    vote['date'] = datetime.datetime.strptime(
        date_match.group(1), '%m/%d/%Y')

    headings = {'YEAS': 'yes', 'NAYS': 'no', 'ABSENT': 'other'}
    total_re = re.compile(r'^Total--(\d+)$')
    section = None

    for line in body.replace(u'\xa0', '\n').split('\n'):
        # All spaces are removed before the line is compared or recorded.
        line = line.replace(' ', '').strip()
        if not line:
            continue

        if line in headings:
            section = headings[line]
        elif line in ('Total', '--'):
            # A bare total marker ends the current list of names.
            section = None
        elif section:
            total = total_re.match(line)
            if total:
                vote['%s_count' % section] = int(total.group(1))
            elif section == 'yes':
                vote.yes(line)
            elif section == 'no':
                vote.no(line)
            elif section == 'other':
                vote.other(line)

    # tally counts (these replace any "Total--N" figures read above)
    vote['yes_count'] = len(vote['yes_votes'])
    vote['no_count'] = len(vote['no_votes'])
    vote['other_count'] = len(vote['other_votes'])

    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    vote['passed'] = (
        vote['yes_count'] > (vote['no_count'] + vote['other_count']))

    bill.add_vote(vote)
def scrape_vote(self, bill, name, url):
    """Parse one roll-call vote PDF and attach the vote to ``bill``.

    ``name`` is the label of the vote link and must look like
    ``"<Senate|House> Vote on <...>,<motion>"``; for any other label the
    method returns immediately without touching ``bill``.  ``url`` is the
    PDF to fetch.

    The fetched content is written to a temporary file so that
    ``pdf_to_lxml`` can read it; the file is deleted afterwards even when
    the conversion raises.

    If the document text has no ``Date: M/D/YYYY`` stamp a warning is
    logged and no vote is added.  Returns ``None`` in every case.
    """
    match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)
    if not match:
        return

    chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
    motion = match.group(2).strip()

    if motion.startswith('FINAL PASSAGE'):
        motion_type = 'passage'
    elif motion.startswith('AMENDMENT'):
        motion_type = 'amendment'
    elif 'ON 3RD READING' in motion:
        motion_type = 'reading:3'
    else:
        motion_type = 'other'

    vote = Vote(chamber, None, motion, None, None, None, None)
    vote['type'] = motion_type
    vote.add_source(url)

    with self.urlopen(url) as text:
        fd, temp_path = tempfile.mkstemp()
        try:
            # Close the file before converting so everything is flushed.
            with os.fdopen(fd, 'wb') as w:
                w.write(text)
            html = pdf_to_lxml(temp_path)
        finally:
            os.remove(temp_path)

    body = html.xpath('string(/html/body)')

    date_match = re.search(r'Date: (\d{1,2}/\d{1,2}/\d{4})', body)
    if date_match is None:
        self.warning("BAD VOTE: date error")
        return
    vote['date'] = datetime.datetime.strptime(
        date_match.group(1), '%m/%d/%Y')

    headings = {'YEAS': 'yes', 'NAYS': 'no', 'ABSENT': 'other'}
    total_re = re.compile(r'^Total--(\d+)$')
    section = None

    for line in body.replace(u'\xa0', '\n').split('\n'):
        # All spaces are removed before the line is compared or recorded.
        line = line.replace(' ', '').strip()
        if not line:
            continue

        if line in headings:
            section = headings[line]
        elif line in ('Total', '--'):
            # A bare total marker ends the current list of names.
            section = None
        elif section:
            total = total_re.match(line)
            if total:
                vote['%s_count' % section] = int(total.group(1))
            elif section == 'yes':
                vote.yes(line)
            elif section == 'no':
                vote.no(line)
            elif section == 'other':
                vote.other(line)

    # tally counts (these replace any "Total--N" figures read above)
    vote['yes_count'] = len(vote['yes_votes'])
    vote['no_count'] = len(vote['no_votes'])
    vote['other_count'] = len(vote['other_votes'])

    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    vote['passed'] = (
        vote['yes_count'] > (vote['no_count'] + vote['other_count']))

    bill.add_vote(vote)
def scrape_vote(self, bill, name, url):
    """Parse one roll-call vote PDF and attach the vote to ``bill``.

    ``name`` is the label of the vote link and must look like
    ``"<Senate|House> Vote on <...>,<motion>"``; for any other label the
    method returns immediately without touching ``bill``.  ``url`` is the
    PDF to fetch.

    The vote date is assembled from the four digits (``MMDD``) that
    follow the bill id in the document text, with ``bill['session']`` as
    the year.  When that stamp cannot be found, "BAD VOTE" is printed and
    no vote is added.

    The yes/no/other counts come only from ``Total--N`` lines in the
    document.  Returns ``None`` in every case.
    """
    match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)
    if not match:
        return

    chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
    motion = match.group(2).strip()

    if motion.startswith('FINAL PASSAGE'):
        motion_type = 'passage'
    elif motion.startswith('AMENDMENT'):
        motion_type = 'amendment'
    elif 'ON 3RD READING' in motion:
        # Previously misspelled as 'ON 3RD READINT', so it never matched.
        motion_type = 'reading:3'
    else:
        motion_type = 'other'

    vote = Vote(chamber, None, motion, None, None, None, None)
    vote['type'] = motion_type
    vote.add_source(url)

    with self.urlopen(url) as text:
        fd, temp_path = tempfile.mkstemp()
        try:
            # Close the file before converting so everything is flushed.
            with os.fdopen(fd, 'wb') as w:
                w.write(text)
            html = pdf_to_lxml(temp_path)
        finally:
            # Delete the temp file even when the conversion fails.
            os.remove(temp_path)

    body = html.xpath('string(/html/body)')

    # Escape the bill id so it is matched literally, not as a pattern.
    date_match = re.search(
        r'%s (\d{4,4})' % re.escape(bill['bill_id']), body)
    if date_match is None:
        print("BAD VOTE")
        return

    stamp = date_match.group(1)
    month = int(stamp[0:2])
    day = int(stamp[2:4])
    vote['date'] = datetime.date(int(bill['session']), month, day)

    headings = {'YEAS': 'yes', 'NAYS': 'no', 'ABSENT': 'other'}
    total_re = re.compile(r'^Total--(\d+)$')
    section = None

    for line in body.replace(u'\xa0', '\n').split('\n'):
        # All spaces are removed before the line is compared or recorded.
        line = line.replace(' ', '').strip()
        if not line:
            continue

        if line in headings:
            section = headings[line]
        elif section:
            total = total_re.match(line)
            if total:
                vote['%s_count' % section] = int(total.group(1))
            elif section == 'yes':
                vote.yes(line)
            elif section == 'no':
                vote.no(line)
            elif section == 'other':
                vote.other(line)

    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    vote['passed'] = (
        vote['yes_count'] > (vote['no_count'] + vote['other_count']))

    bill.add_vote(vote)
def scrape_vote(self, bill, name, url):
    """Parse one roll-call vote PDF and attach the vote to ``bill``.

    ``name`` is the label of the vote link and must look like
    ``"<Senate|House> Vote on <...>,<motion>"``; for any other label the
    method returns immediately without touching ``bill``.  ``url`` is the
    PDF to download.

    The PDF is written to a temporary file, converted with
    ``pdf_to_lxml`` and the temporary file is always deleted afterwards,
    even when the download or conversion raises.

    If the document text has no ``Date: M/D/YYYY`` stamp a warning is
    logged and no vote is added.  Returns ``None`` in every case.
    """
    match = re.match(r"^(Senate|House) Vote on [^,]*,(.*)$", name)
    if not match:
        return

    chamber = {"Senate": "upper", "House": "lower"}[match.group(1)]
    motion = match.group(2).strip()

    if motion.startswith("FINAL PASSAGE"):
        motion_type = "passage"
    elif motion.startswith("AMENDMENT"):
        motion_type = "amendment"
    elif "ON 3RD READING" in motion:
        motion_type = "reading:3"
    else:
        motion_type = "other"

    vote = Vote(chamber, None, motion, None, None, None, None)
    vote["type"] = motion_type
    vote.add_source(url)

    fd, temp_path = tempfile.mkstemp()
    # urlretrieve writes to the path itself, so the descriptor handed back
    # by mkstemp is not needed; close it right away.
    os.close(fd)
    try:
        self.urlretrieve(url, temp_path)
        html = pdf_to_lxml(temp_path)
    finally:
        os.remove(temp_path)

    body = html.xpath("string(/html/body)")

    date_match = re.search(r"Date: (\d{1,2}/\d{1,2}/\d{4})", body)
    if date_match is None:
        self.warning("BAD VOTE: date error")
        return
    vote["date"] = dt.datetime.strptime(date_match.group(1), "%m/%d/%Y")

    headings = {"YEAS": "yes", "NAYS": "no", "ABSENT": "other"}
    total_re = re.compile(r"^Total--(\d+)$")
    section = None

    for line in body.replace(u"\xa0", "\n").split("\n"):
        # All spaces are removed before the line is compared or recorded.
        line = line.replace(" ", "").strip()
        if not line:
            continue

        if line in headings:
            section = headings[line]
        elif line in ("Total", "--"):
            # A bare total marker ends the current list of names.
            section = None
        elif section:
            total = total_re.match(line)
            if total:
                vote["%s_count" % section] = int(total.group(1))
            elif section == "yes":
                vote.yes(line)
            elif section == "no":
                vote.no(line)
            elif section == "other":
                vote.other(line)

    # tally counts (these replace any "Total--N" figures read above)
    vote["yes_count"] = len(vote["yes_votes"])
    vote["no_count"] = len(vote["no_votes"])
    vote["other_count"] = len(vote["other_votes"])

    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    vote["passed"] = (
        vote["yes_count"] > (vote["no_count"] + vote["other_count"]))

    bill.add_vote(vote)
def scrape(self, term, chambers):
    """Download the gov.mb.ca committee membership PDF and stop in pdb.

    The PDF is fetched with ``self.urlretrieve`` and converted with
    ``pdf_to_lxml(..., type='xml')``; nothing is saved or returned.
    ``term`` and ``chambers`` are accepted but not used.
    """
    url = 'http://www.gov.mb.ca/legislature/committees/membership.pdf'
    filename, resp = self.urlretrieve(url)
    doc = pdf_to_lxml(filename, type='xml')

    # NOTE(review): this drops into an interactive debugger, so the method
    # blocks until someone answers the prompt. It looks like unfinished
    # exploration of `doc`; confirm before running it unattended.
    import pdb
    pdb.set_trace()