def process_vote(self, data):
    """Convert one imported (pupa-style) vote record into a Vote and save it.

    Reads counts, individual voter rolls, sources and extras from *data*,
    resolves the bill via self.get_bill_details, and persists the result
    with self.save_vote.
    """
    chamber = parse_psuedo_id(data['organization'])['classification']
    bill_chamber, bill_id = self.get_bill_details(data['bill'])
    # Unicameral "legislature" votes are filed under the upper chamber.
    if chamber == 'legislature':
        chamber = 'upper'
    if bill_chamber == 'legislature':
        bill_chamber = 'upper'

    # Tally counts; yes/no come from explicit entries, everything else
    # accumulates into "other".
    yes_count, no_count, other_count = None, None, 0
    for count in data['counts']:
        option = count['option']
        if option == 'yes':
            yes_count = count['value']
        elif option == 'no':
            no_count = count['value']
        else:
            other_count += count['value']

    vote = Vote(
        chamber=chamber,
        date=parse_date(data['start_date']),
        motion=data['motion_text'],
        passed=data['result'] == 'pass',
        yes_count=yes_count,
        no_count=no_count,
        other_count=other_count,
        action=data['bill_action'],
        # TODO: was data['motion_classification'],
        type='other',
        session=data['legislative_session'],
        bill_chamber=bill_chamber,
        bill_id=bill_id,
    )

    # Record individual voters, dispatching on their option.
    recorders = {'yes': vote.yes, 'no': vote.no}
    for roll in data['votes']:
        recorders.get(roll['option'], vote.other)(roll['voter_name'])

    for source in data['sources']:
        vote.add_source(source['url'])
    vote.update(**data['extras'])
    self.save_vote(vote)
def process_vote(self, data):
    """Translate an imported vote dict into a legacy Vote object and save it."""

    def _normalize(chamber_name):
        # A unicameral "legislature" maps onto the upper chamber.
        return 'upper' if chamber_name == 'legislature' else chamber_name

    chamber = _normalize(
        parse_psuedo_id(data['organization'])['classification'])
    bill_chamber, bill_id = self.get_bill_details(data['bill'])
    bill_chamber = _normalize(bill_chamber)

    # Aggregate the vote counts: explicit yes/no values, everything else
    # summed into "other".
    yes_count = None
    no_count = None
    other_count = 0
    for entry in data['counts']:
        if entry['option'] == 'yes':
            yes_count = entry['value']
        elif entry['option'] == 'no':
            no_count = entry['value']
        else:
            other_count += entry['value']

    vote = Vote(chamber=chamber,
                date=parse_date(data['start_date']),
                motion=data['motion_text'],
                passed=data['result'] == 'pass',
                yes_count=yes_count,
                no_count=no_count,
                other_count=other_count,
                action=data['bill_action'],
                # TODO: was data['motion_classification'],
                type='other',
                session=data['legislative_session'],
                bill_chamber=bill_chamber,
                bill_id=bill_id)

    # Record each individual voter under the appropriate tally.
    for ballot in data['votes']:
        option = ballot['option']
        name = ballot['voter_name']
        if option == 'yes':
            vote.yes(name)
        elif option == 'no':
            vote.no(name)
        else:
            vote.other(name)

    for source in data['sources']:
        vote.add_source(source['url'])
    vote.update(**data['extras'])
    self.save_vote(vote)
def scrape_journal(self, url, chamber, session, date):
    """Scrape roll-call votes from a chamber journal PDF (text extraction).

    Downloads the journal, normalizes its text, finds "On the question ..."
    motions, extracts the bill id and motion text, and saves one Vote per
    motion. Vote tallies are read by self.parse_votes from the same line
    iterator.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)

    all_text = convert_pdf(filename, type="text")
    lines = all_text.split("\n")
    # Normalize dashes and curly quotes so the regexes below can match.
    lines = [line.strip()
                 .replace("–", "-")
                 .replace("―", '"')
                 .replace("‖", '"')
                 .replace('“', '"')
                 .replace('”', '"')
             for line in lines]

    # Do not process headers or completely empty lines
    header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
    header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
    lines = iter([line for line in lines
                  if not (line == ""
                          or re.match(header_date_re, line)
                          or re.match(header_journal_re, line))])

    for line in lines:
        # Go through with vote parse only when both conditions match.
        if not line.startswith("On the question") or \
                "shall" not in line.lower():
            continue

        # Get the bill_id
        bill_id = None
        bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

        # The Senate ends its motion text with a vote announcement
        if chamber == "upper":
            end_of_motion_re = r'.* the vote was:\s*'
        # The House may or may not end motion text with a bill name
        elif chamber == "lower":
            end_of_motion_re = r'.*Shall.*\?"?(\s{})?\s*'.format(bill_re)

        while not re.match(end_of_motion_re, line, re.IGNORECASE):
            # BUG FIX: `lines.next()` is Python-2-only syntax; use the
            # built-in next() as the sibling scrapers in this file do.
            line += " " + next(lines)

        try:
            bill_id = re.search(bill_re, line).group(1)
        except AttributeError:
            self.warning("This motion did not pertain to legislation: {}".
                         format(line))
            continue

        # Get the motion text
        motion_re = r'''
            ^On\sthe\squestion\s  # Precedes any motion
            "                     # Motion is preceded by a quote mark
            (Shall\s.+?\??)       # The motion text begins with "Shall"
            \s*"\s+               # Motion is followed by a quote mark
            (?:{})?               # If the vote regards a bill, its number is listed
            {}                    # Senate has trailing text
            \s*$
            '''.format(bill_re,
                       r',?.*?the\svote\swas:' if chamber == 'upper' else '')
        motion = re.search(motion_re, line,
                           re.VERBOSE | re.IGNORECASE).group(1)

        for word, letter in (('Senate', 'S'), ('House', 'H'), ('File', 'F')):
            if bill_id is None:
                return
            bill_id = bill_id.replace(word, letter)

        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        self.current_id = bill_id
        votes, passed = self.parse_votes(lines)

        # At the very least there should be a majority for the bill to have
        # passed; a failed bill may still have a majority (e.g.
        # constitutional amendments needing a supermajority).
        assert (passed == (votes['yes_count'] > votes['no_count'])) \
            or (not passed)
        # Warn when a bill failed despite a majority -- worth confirming.
        if not passed and votes['yes_count'] > votes['no_count']:
            self.logger.warning("The bill got a majority but did not pass. "
                                "Could be worth confirming.")

        vote = Vote(motion=re.sub('\xad', '-', motion),
                    passed=passed,
                    chamber=chamber,
                    date=date,
                    session=session,
                    bill_id=bill_id,
                    bill_chamber=bill_chamber,
                    **votes)
        vote.update(votes)
        vote.add_source(url)
        self.save_vote(vote)
def scrape_journal(self, url, chamber, session, date):
    """Scrape roll-call votes from a chamber journal PDF (XML extraction).

    Converts the PDF to XML, scans for "Shall ..." question lines,
    assembles the motion text and bill id, and saves one Vote per motion.
    Tallies are read by self.parse_votes from the same line iterator.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)

    xml = convert_pdf(filename)
    try:
        et = lxml.etree.fromstring(xml)
    except lxml.etree.XMLSyntaxError:
        self.logger.warning('Skipping invalid pdf: %r' % filename)
        return

    lines = self._journal_lines(et)
    while True:
        try:
            line = next(lines)
        except StopIteration:
            break

        text = gettext(line)

        # Go through with vote parse if any of these conditions match.
        if 'Shall' in text:
            if 'bill pass?' in text:
                pass
            elif 'resolution' in text:
                pass
            elif 'amendment' in text:
                pass
            else:
                continue
        else:
            continue

        # Get the bill_id.
        # BUG FIX: iterate with `for` instead of a bare `next()` loop so
        # exhausting the journal doesn't raise StopIteration out of the
        # method; initialize bill_id so it is never unbound.
        bill_id = None
        for line in lines:
            text += gettext(line)
            m = re.search(r'\(\s*([A-Z\.]+\s+\d+)\s*\)', text)
            if m:
                bill_id = m.group(1)
                break

        motion = text.strip()
        motion = re.sub(r'\s+', ' ', motion)
        # BUG FIX: rsplit('(') with no maxsplit crashed the 2-tuple unpack
        # whenever the text had zero or several '(' characters.
        if "(" in motion:
            motion, _ = motion.rsplit('(', 1)
        motion = motion.replace('"', '')
        motion = motion.replace(u'“', '')
        motion = motion.replace(u'\u201d', '')
        motion = motion.replace(u' ,', ',')
        motion = motion.strip()
        motion = re.sub(r'[SH].\d+', lambda m: ' %s ' % m.group(), motion)
        motion = re.sub(r'On the question\s*', '', motion, flags=re.I)

        for word, letter in (('Senate', 'S'), ('House', 'H'), ('File', 'F')):
            # BUG FIX: bail out when no bill reference was ever found
            # instead of crashing on None.replace().
            if bill_id is None:
                return
            bill_id = bill_id.replace(word, letter)

        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        self.current_id = bill_id
        votes = self.parse_votes(lines)
        totals = filter(lambda x: isinstance(x, int), votes.values())
        passed = (1.0 * votes['yes_count'] / sum(totals)) >= 0.5
        vote = Vote(motion=motion,
                    passed=passed,
                    chamber=chamber,
                    date=date,
                    session=session,
                    bill_id=bill_id,
                    bill_chamber=bill_chamber,
                    **votes)
        vote.update(votes)
        vote.add_source(url)
        self.save_vote(vote)
def scrape(self, chamber, session):
    """Scrape one chamber's bills for a session from Oregon's OLIS site
    (olis.leg.state.or.us): bill metadata, sponsors, versions, actions,
    votes, and proposed amendments; each bill is persisted via
    self.save_bill.
    """
    self.all_bills = {}
    self.slug = self.metadata['session_details'][session]['slug']

    page = self.lxmlize(self.bill_directory_url.format(self.slug.upper()))
    page.make_links_absolute(self.base_url)
    ulid = 'senateBills' if chamber == 'upper' else 'houseBills'  # id of <ul>
    header = page.xpath("//ul[@id='{0}_search']".format(ulid))[0]
    # Every ul with a data-load-action and an id points to a lazily
    # loaded page of bill links.
    bill_list_pages = header.xpath(".//ul[boolean(@data-load-action)"
                                   " and boolean(@id)]/@data-load-action")
    bill_anchors = []
    for bill_list_url in bill_list_pages:
        bill_list_page = self.lxmlize('{}{}'.format(self.base_url,
                                                    bill_list_url))
        bill_list_page.make_links_absolute(self.base_url)
        bill_anchors.extend(bill_list_page.xpath('//a') or [])

    ws = re.compile(r"\s+")

    def _clean_ws(txt):
        """Remove extra whitespace from text."""
        return ws.sub(' ', txt).strip()

    for a in bill_anchors:
        bid = ws.sub('', a.text_content())  # bill id
        bill_summary = _clean_ws(a.get('title'))
        # bill title is added below
        bill = Bill(session, chamber, bid, title='', summary=bill_summary)
        page = self.lxmlize(a.get('href'))
        versions = page.xpath('//ul[@class="dropdown-menu"]/li/span/' +
                              'a[contains(@title, "Get the Pdf")]/@href')
        # Map "label: value" rows of the overview table to their elements.
        # NOTE(review): key.text would raise if a label cell had no text --
        # presumably the table always has plain-text labels; confirm.
        measure_info = {}
        info = page.xpath("//table[@id='measureOverviewTable']/tr")
        for row in info:
            key, value = row.xpath("./*")
            key = key.text.replace(':','').strip()
            measure_info[key] = value

        for sponsor in measure_info['Chief Sponsors'].xpath("./a"):
            if sponsor.text_content().strip():
                bill.add_sponsor(type='primary',
                                 name=sponsor.text_content())

        for sponsor in measure_info['Regular Sponsors'].xpath("./a"):
            if sponsor.text_content().strip():
                bill.add_sponsor(type='cosponsor',
                                 name=sponsor.text_content())

        title = _clean_ws(measure_info['Bill Title'].text_content())
        # some bill titles need to be added manually
        # NOTE(review): the adjacent literals concatenate with no space
        # ("contestedcase proceedings.") -- looks like a typo; confirm
        # before changing the stored title.
        if self.slug == "2013R1" and bid == "HB2010":
            title = ("Relating to Water Resources Department contested"
                     "case proceedings.")
        bill['title'] = title

        for version in versions:
            name = version.split("/")[-1]
            bill.add_version(name=name, url=version,
                             mimetype='application/pdf')

        history_url = self.create_url('Measures/Overview/GetHistory/{bill}',
                                      bid)
        history = self.lxmlize(history_url).xpath("//table/tr")
        for entry in history:
            wwhere, action = [_clean_ws(x.text_content())
                              for x in entry.xpath("*")]
            # Strip any trailing "Ayes ..."/"Nays, ..." tally text from
            # the action description.
            vote_cleaning_re = r'(.*?)((Ayes)|(Nays),\s.*)'
            if re.match(vote_cleaning_re, action):
                action = re.search(vote_cleaning_re, action).groups()[0]
            # "MM-DD (S)" -> {'when': 'MM-DD', 'where': 'S'}
            wwhere = re.match(r"(?P<when>.*) \((?P<where>.*)\)",
                              wwhere).groupdict()
            action_chamber = {"S": "upper", "H": "lower"}[wwhere['where']]
            # The table omits the year; take it from the session slug.
            when = "%s-%s" % (self.slug[:4], wwhere['when'])
            when = dt.datetime.strptime(when, "%Y-%m-%d")

            # Classify the action; fall back to 'other' when nothing hits.
            types = []
            for expr, types_ in self.action_classifiers:
                m = re.match(expr, action)
                if m:
                    types += types_
            if types == []:
                types = ['other']

            # actor, action, date, type, committees, legislators
            bill.add_action(action_chamber, action, when, type=types)

            # Parse and store Vote information
            vote_id = entry.xpath('./td/a[contains(@href, "otes-")]/@href')
            if not vote_id:
                continue
            elif "#measureVotes-" in vote_id[0]:
                vote_id = vote_id[0].split("-")[-1]
                vote_url = "https://olis.leg.state.or.us/liz/" + \
                    "{0}/Measures/MeasureVotes?id={1}".format(
                        self.slug, vote_id)
            else:
                # Committee votes live under MajorityReport instead.
                vote_id = vote_id[0].split("-")[-1]
                vote_url = "https://olis.leg.state.or.us/liz/" + \
                    "{0}/CommitteeReports/MajorityReport/{1}".format(
                        self.slug, vote_id)
            votes = self._get_votes(vote_url)

            if not any(len(x) for x in votes.values()):
                self.warning("The votes webpage was empty for " +
                             "action {0} on bill {1}.".format(action, bid))
                continue

            # NOTE(review): if only "other" votes exist, the denominator
            # is zero and this raises ZeroDivisionError -- worth guarding.
            passed = (float(len(votes["yes_votes"])) /
                      (len(votes["yes_votes"]) +
                       len(votes["no_votes"])) > 0.5)

            vote = Vote(chamber=chamber,
                        date=when,
                        motion=action,
                        passed=passed,
                        yes_count=len(votes["yes_votes"]),
                        no_count=len(votes["no_votes"]),
                        other_count=len(votes["other_votes"]),
                        session=session,
                        bill_id=bid,
                        bill_chamber=action_chamber)
            vote.update(votes)
            bill_url = "https://olis.leg.state.or.us/liz/" + \
                "{0}/Measures/Overview/{1}".format(self.slug, bid)
            vote.add_source(bill_url)
            bill.add_vote(vote)

        amendments_url = self.create_url(
            'Measures/ProposedAmendments/{bill}', bid)
        amendments = self.lxmlize(amendments_url).xpath(
            "//div[@id='amendments']/table//tr")
        for amendment in amendments:
            nodes = amendment.xpath("./td")
            if nodes == []:
                continue
            # Note: `date` and `when` are rebound here, shadowing the
            # action-history variables above.
            pdf_href, date, committee, adopted, when = nodes
            pdf_href, = pdf_href.xpath("./a")
            pdf_link = pdf_href.attrib['href']
            # NOTE(review): "Ammendment" is misspelled in the stored
            # document name; confirm nothing keys off it before fixing.
            name = "Ammendment %s" % (pdf_href.text_content())
            adopted = adopted.text
            bill.add_document(name=name, url=pdf_link, adopted=adopted,
                              mimetype='application/pdf')

        bill.add_source(a.get('href'))
        self.save_bill(bill)
def scrape_journal(self, url, chamber, session, date):
    """Scrape roll-call votes from a chamber journal PDF (text extraction).

    Downloads the journal, normalizes its text, finds "On the question ..."
    motions, extracts the bill id and motion text, and saves one Vote per
    motion. Vote tallies are read by self.parse_votes from the same line
    iterator.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)

    all_text = convert_pdf(filename, type="text")
    lines = all_text.split("\n")
    # Normalize dashes and curly quotes so the regexes below can match.
    lines = [
        line.strip().replace("–", "-").replace("―", '"').replace(
            "‖", '"').replace('“', '"').replace('”', '"') for line in lines
    ]

    # Do not process headers or completely empty lines
    header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
    header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
    lines = iter([
        line for line in lines
        if not (line == "" or re.match(header_date_re, line)
                or re.match(header_journal_re, line))
    ])

    for line in lines:
        # Go through with vote parse only when both conditions match.
        if not line.startswith("On the question") or \
                "shall" not in line.lower():
            continue

        # Get the bill_id
        bill_id = None
        bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

        # The Senate ends its motion text with a vote announcement
        if chamber == "upper":
            end_of_motion_re = r'.* the vote was:\s*'
        # The House may or may not end motion text with a bill name
        elif chamber == "lower":
            end_of_motion_re = r'.*Shall.*\?"?(\s{})?\s*'.format(bill_re)

        while not re.match(end_of_motion_re, line, re.IGNORECASE):
            # BUG FIX: `lines.next()` is Python-2-only syntax; use the
            # built-in next() as the sibling scrapers in this file do.
            line += " " + next(lines)

        try:
            bill_id = re.search(bill_re, line).group(1)
        except AttributeError:
            self.warning(
                "This motion did not pertain to legislation: {}".format(
                    line))
            continue

        # Get the motion text
        motion_re = r'''
            ^On\sthe\squestion\s  # Precedes any motion
            "                     # Motion is preceded by a quote mark
            (Shall\s.+?\??)       # The motion text begins with "Shall"
            \s*"\s+               # Motion is followed by a quote mark
            (?:{})?               # If the vote regards a bill, its number is listed
            {}                    # Senate has trailing text
            \s*$
            '''.format(bill_re,
                       r',?.*?the\svote\swas:' if chamber == 'upper' else '')
        motion = re.search(motion_re, line,
                           re.VERBOSE | re.IGNORECASE).group(1)

        for word, letter in (('Senate', 'S'), ('House', 'H'), ('File', 'F')):
            if bill_id is None:
                return
            bill_id = bill_id.replace(word, letter)

        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        self.current_id = bill_id
        votes, passed = self.parse_votes(lines)

        # At the very least there should be a majority for the bill to have
        # passed; a failed bill may still have a majority (e.g.
        # constitutional amendments needing a supermajority).
        assert (passed == (votes['yes_count'] > votes['no_count'])) \
            or (not passed)
        # Warn when a bill failed despite a majority -- worth confirming.
        if not passed and votes['yes_count'] > votes['no_count']:
            self.logger.warning(
                "The bill got a majority but did not pass. Could be worth confirming."
            )

        # BUG FIX: PDF extraction leaves soft hyphens (U+00AD) in motion
        # text; normalize them to ASCII dashes before storing.
        vote = Vote(motion=re.sub('\xad', '-', motion),
                    passed=passed,
                    chamber=chamber,
                    date=date,
                    session=session,
                    bill_id=bill_id,
                    bill_chamber=bill_chamber,
                    **votes)
        vote.update(votes)
        vote.add_source(url)
        self.save_vote(vote)
def scrape(self, chamber, session):
    """Scrape one chamber's bills for a session from Oregon's OLIS site
    (olis.leg.state.or.us): bill metadata, sponsors, versions, actions,
    votes, and proposed amendments; each bill is persisted via
    self.save_bill.
    """
    self.all_bills = {}
    self.slug = self.metadata['session_details'][session]['slug']

    page = self.lxmlize(self.bill_directory_url.format(self.slug.upper()))
    page.make_links_absolute(self.base_url)
    ulid = 'senateBills' if chamber == 'upper' else 'houseBills'  # id of <ul>
    header = page.xpath("//ul[@id='{0}_search']".format(ulid))[0]
    # Every ul with a data-load-action and an id points to a lazily
    # loaded page of bill links.
    bill_list_pages = header.xpath(".//ul[boolean(@data-load-action)"
                                   " and boolean(@id)]/@data-load-action")
    bill_anchors = []
    for bill_list_url in bill_list_pages:
        bill_list_page = self.lxmlize('{}{}'.format(
            self.base_url, bill_list_url))
        bill_list_page.make_links_absolute(self.base_url)
        bill_anchors.extend(bill_list_page.xpath('//a') or [])

    ws = re.compile(r"\s+")

    def _clean_ws(txt):
        """Remove extra whitespace from text."""
        return ws.sub(' ', txt).strip()

    for a in bill_anchors:
        bid = ws.sub('', a.text_content())  # bill id
        bill_summary = _clean_ws(a.get('title'))
        # bill title is added below
        bill = Bill(session, chamber, bid, title='', summary=bill_summary)
        page = self.lxmlize(a.get('href'))
        versions = page.xpath('//ul[@class="dropdown-menu"]/li/span/'
                              'a[contains(@title, "Get the Pdf")]/@href')
        # Map "label: value" rows of the overview table to their elements.
        measure_info = {}
        info = page.xpath("//table[@id='measureOverviewTable']/tr")
        for row in info:
            key, value = row.xpath("./*")
            key = key.text.replace(':', '').strip()
            measure_info[key] = value

        for sponsor in measure_info['Chief Sponsors'].xpath("./a"):
            if sponsor.text_content().strip():
                bill.add_sponsor(type='primary',
                                 name=sponsor.text_content())

        for sponsor in measure_info['Regular Sponsors'].xpath("./a"):
            if sponsor.text_content().strip():
                bill.add_sponsor(type='cosponsor',
                                 name=sponsor.text_content())

        title = _clean_ws(measure_info['Bill Title'].text_content())
        # some bill titles need to be added manually
        if self.slug == "2013R1" and bid == "HB2010":
            # BUG FIX: the two literals previously concatenated without a
            # space, producing "contestedcase proceedings."
            title = ("Relating to Water Resources Department contested "
                     "case proceedings.")
        bill['title'] = title

        for version in versions:
            name = version.split("/")[-1]
            bill.add_version(name=name, url=version,
                             mimetype='application/pdf')

        history_url = self.create_url('Measures/Overview/GetHistory/{bill}',
                                      bid)
        history = self.lxmlize(history_url).xpath("//table/tr")
        for entry in history:
            wwhere, action = [
                _clean_ws(x.text_content()) for x in entry.xpath("*")
            ]
            # Strip any trailing "Ayes ..."/"Nays, ..." tally text from
            # the action description.
            vote_cleaning_re = r'(.*?)((Ayes)|(Nays),\s.*)'
            if re.match(vote_cleaning_re, action):
                action = re.search(vote_cleaning_re, action).groups()[0]
            # "MM-DD (S)" -> {'when': 'MM-DD', 'where': 'S'}
            wwhere = re.match(r"(?P<when>.*) \((?P<where>.*)\)",
                              wwhere).groupdict()
            action_chamber = {"S": "upper", "H": "lower"}[wwhere['where']]
            # The table omits the year; take it from the session slug.
            when = "%s-%s" % (self.slug[:4], wwhere['when'])
            when = dt.datetime.strptime(when, "%Y-%m-%d")

            # Classify the action; fall back to 'other' when nothing hits.
            types = []
            for expr, types_ in self.action_classifiers:
                m = re.match(expr, action)
                if m:
                    types += types_
            if types == []:
                types = ['other']

            # actor, action, date, type, committees, legislators
            bill.add_action(action_chamber, action, when, type=types)

            # Parse and store Vote information
            vote_id = entry.xpath('./td/a[contains(@href, "otes-")]/@href')
            if not vote_id:
                continue
            elif "#measureVotes-" in vote_id[0]:
                vote_id = vote_id[0].split("-")[-1]
                vote_url = "https://olis.leg.state.or.us/liz/" + \
                    "{0}/Measures/MeasureVotes?id={1}".format(
                        self.slug, vote_id)
            else:
                # Committee votes live under MajorityReport instead.
                vote_id = vote_id[0].split("-")[-1]
                vote_url = "https://olis.leg.state.or.us/liz/" + \
                    "{0}/CommitteeReports/MajorityReport/{1}".format(
                        self.slug, vote_id)
            votes = self._get_votes(vote_url)

            if not any(len(x) for x in votes.values()):
                self.warning("The votes webpage was empty for " +
                             "action {0} on bill {1}.".format(action, bid))
                continue

            yes_len = len(votes["yes_votes"])
            no_len = len(votes["no_votes"])
            # BUG FIX: a page listing only "other" votes used to raise
            # ZeroDivisionError computing `passed`; skip it with a
            # warning instead.
            if yes_len + no_len == 0:
                self.warning("No yes/no votes recorded for " +
                             "action {0} on bill {1}.".format(action, bid))
                continue
            passed = float(yes_len) / (yes_len + no_len) > 0.5

            vote = Vote(chamber=chamber,
                        date=when,
                        motion=action,
                        passed=passed,
                        yes_count=yes_len,
                        no_count=no_len,
                        other_count=len(votes["other_votes"]),
                        session=session,
                        bill_id=bid,
                        bill_chamber=action_chamber)
            vote.update(votes)
            bill_url = "https://olis.leg.state.or.us/liz/" + \
                "{0}/Measures/Overview/{1}".format(self.slug, bid)
            vote.add_source(bill_url)
            bill.add_vote(vote)

        amendments_url = self.create_url(
            'Measures/ProposedAmendments/{bill}', bid)
        amendments = self.lxmlize(amendments_url).xpath(
            "//div[@id='amendments']/table//tr")
        for amendment in amendments:
            nodes = amendment.xpath("./td")
            if nodes == []:
                continue
            # Note: `date` and `when` are rebound here, shadowing the
            # action-history variables above.
            pdf_href, date, committee, adopted, when = nodes
            pdf_href, = pdf_href.xpath("./a")
            pdf_link = pdf_href.attrib['href']
            # BUG FIX: "Ammendment" -> "Amendment" in the document name.
            name = "Amendment %s" % (pdf_href.text_content())
            adopted = adopted.text
            bill.add_document(name=name, url=pdf_link, adopted=adopted,
                              mimetype='application/pdf')

        bill.add_source(a.get('href'))
        self.save_bill(bill)
def scrape_journal(self, url, chamber, session, date):
    """Extract roll-call votes from a chamber journal PDF and save them.

    The PDF is converted to XML; "Shall ..." question lines are located,
    the bill id and motion text are parsed out of the following lines,
    and the tallies are read by self.parse_votes from the same iterator.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)

    xml = convert_pdf(filename)
    try:
        doc = lxml.etree.fromstring(xml)
    except lxml.etree.XMLSyntaxError:
        self.logger.warning('Skipping invalid pdf: %r' % filename)
        return

    lines = self._journal_lines(doc)
    for line in lines:
        text = gettext(line)

        # Only parse questions that look like votable motions.
        if 'Shall' not in text:
            continue
        if not ('bill pass?' in text or 'resolution' in text
                or 'amendment' in text):
            continue

        # Accumulate following lines until the "(XX 123)" bill reference
        # appears.
        bill_id = None
        for extra in lines:
            text += gettext(extra)
            match = re.search(r'\(\s*([A-Z\.]+\s+\d+)\s*\)', text)
            if match:
                bill_id = match.group(1)
                break

        # Clean up the motion text: collapse whitespace, drop the trailing
        # parenthesized bill reference, strip quote marks, tidy commas,
        # pad bill numbers, and remove the "On the question" prefix.
        motion = re.sub(r'\s+', ' ', text.strip())
        if "(" in motion:
            motion = motion.rsplit('(', 1)[0]
        for quote in ('"', u'“', u'\u201d'):
            motion = motion.replace(quote, '')
        motion = motion.replace(u' ,', ',').strip()
        motion = re.sub(r'[SH].\d+', lambda m: ' %s ' % m.group(), motion)
        motion = re.sub(r'On the question\s*', '', motion, flags=re.I)

        if bill_id is None:
            # No bill reference was ever found; nothing left to scrape.
            return
        for word, letter in (('Senate', 'S'), ('House', 'H'), ('File', 'F')):
            bill_id = bill_id.replace(word, letter)

        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        self.current_id = bill_id

        votes = self.parse_votes(lines)
        total = sum(v for v in votes.values() if isinstance(v, int))
        passed = (1.0 * votes['yes_count'] / total) >= 0.5
        vote = Vote(motion=motion, passed=passed, chamber=chamber,
                    date=date, session=session, bill_id=bill_id,
                    bill_chamber=bill_chamber, **votes)
        vote.update(votes)
        vote.add_source(url)
        self.save_vote(vote)