def _scrape_solo_bills(options, metadata):
    _clear_scraped_data(options.output_dir, 'bills')
    scraper = _get_configured_scraper('bills', options, metadata)

    if len(options.chambers) == 1:
        chamber = options.chambers[0]
    else:
        raise ScrapeError('must specify --chamber when providing a --bill')
    if len(options.sessions):
        session = list(options.sessions)[0]
    else:
        raise ScrapeError('must specify --session when providing a --bill')

    for bill_id in options.solo_bills:
        scraper.scrape_bill(chamber, session, bill_id)
def parse_exec_date(date_str):
    """
    Parse dates for executive actions.
    """
    match = re.search(r'((\w+) (\d{1,2}),\s?(\d{4}))', date_str)
    if match:
        date_str = "%s %s, %s" % (match.group(2), match.group(3),
                                  match.group(4))
        return datetime.datetime.strptime(date_str, "%B %d, %Y")

    match = re.search(r'((\w+), (\d{1,2}),\s?(\d{4}))', date_str)
    if match:
        date_str = "%s, %s, %s" % (match.group(2), match.group(3),
                                   match.group(4))
        return datetime.datetime.strptime(date_str, "%B, %d, %Y")

    match = re.search(r'(\d{1,2}/\d{1,2}/\d{4})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%m/%d/%Y")

    match = re.search(r'(\d{1,2}/\d{1,2}/\d{2})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%m/%d/%y")

    raise ScrapeError("Invalid executive action date: %s" % date_str)
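# A minimal usage sketch for parse_exec_date; the sample strings are
# hypothetical, but each one exercises one of the four accepted formats,
# tried in order ("Month D, YYYY", "Month, D, YYYY", "M/D/YYYY", "M/D/YY").
# Anything else raises ScrapeError. Assumes datetime is imported at module
# level, as in the scrapers above.
def _demo_parse_exec_date():
    expected = datetime.datetime(2010, 6, 4)
    for sample in ("approved June 4, 2010", "June, 4, 2010",
                   "6/4/2010", "6/4/10"):
        assert parse_exec_date(sample) == expected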
def _run_scraper(scraper_type, options, metadata):
    """
    scraper_type: bills, legislators, committees, votes
    """
    _clear_scraped_data(options.output_dir, scraper_type)
    if scraper_type == 'speeches':
        _clear_scraped_data(options.output_dir, 'events')

    scraper = _get_configured_scraper(scraper_type, options, metadata)

    ua_email = os.environ.get('BILLY_UA_EMAIL')
    if ua_email and scraper:
        scraper.user_agent += ' ({})'.format(ua_email)

    if not scraper:
        return [{
            "type": scraper_type,
            "start_time": dt.datetime.utcnow(),
            "noscraper": True,
            "end_time": dt.datetime.utcnow()
        }]

    runs = []

    # Removed from the inner loop due to non-bicameral scrapers
    scrape = {"type": scraper_type}
    scrape['start_time'] = dt.datetime.utcnow()

    if scraper_type in ('bills', 'votes', 'events', 'speeches'):
        times = options.sessions
        for time in times:
            scraper.validate_session(time, scraper.latest_only)
    elif scraper_type in ('committees', 'legislators'):
        times = options.terms
        for time in times:
            scraper.validate_term(time, scraper.latest_only)

    # run scraper against year/session/term
    for time in times:
        # old style
        chambers = options.chambers
        if scraper_type == 'events' and len(options.chambers) == 2:
            chambers.append('other')
        if _is_old_scrape(scraper.scrape):
            for chamber in chambers:
                scraper.scrape(chamber, time)
        else:
            scraper.scrape(time, chambers=chambers)

        # error out if the scraper saved nothing (events and votes are
        # allowed to come back empty)
        if not scraper.object_count and scraper_type not in ('events',
                                                             'votes'):
            raise ScrapeError("%s scraper didn't save any objects" %
                              scraper_type)

    scrape['end_time'] = dt.datetime.utcnow()
    runs.append(scrape)

    return runs
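# _is_old_scrape is called above but not defined in this section. A minimal
# sketch of what it needs to do, assuming the old-style signature is
# scrape(chamber, time) and the new style is scrape(time, chambers=...):
def _is_old_scrape(f):
    import inspect
    # old-style scrape methods don't accept a 'chambers' argument
    return 'chambers' not in inspect.getargspec(f).args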
def scrape_bill(self, bill, url):
    with self.urlopen(url) as page:
        # normalize non-breaking spaces before parsing (the original
        # replacement literal was garbled in transit; '&nbsp;' is assumed)
        page = page.replace('&nbsp;', ' ')
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        bill.add_source(url)

        for link in page.xpath("//b[text()='Bill Text:']/"
                               "following-sibling::blockquote/a"):
            bill.add_version(link.text, link.attrib['href'])

        more_sponsor_link = page.xpath("//a[text()='More Sponsors']")
        if more_sponsor_link:
            sponsor_url = more_sponsor_link[0].attrib['href']
            self.scrape_sponsors(bill, sponsor_url)
        else:
            for b in page.xpath("//td[text()='Sponsor(s):']/../td[2]/b"):
                bill.add_sponsor("sponsor", b.text)

        for tr in page.xpath("""
        //b[text()='Detailed Status:']/
        following-sibling::blockquote[1]/table/tr""")[1:]:

            action = tr.xpath("string(td[3])").strip()

            match = re.search('(to|by) Governor on (.*)', action)
            if match:
                date = parse_exec_date(match.group(2).strip()).date()
                actor = 'executive'
            else:
                if tr.attrib['bgcolor'] == 'Salmon':
                    actor = 'lower'
                elif tr.attrib['bgcolor'] == 'LightGreen':
                    actor = 'upper'
                else:
                    raise ScrapeError("Invalid row color: %s" %
                                      tr.attrib['bgcolor'])

                date = tr.xpath("string(td[1])")
                try:
                    date = re.search(r"\d\d?/\d\d?/\d{4}", date).group(0)
                except AttributeError:
                    # No date, skip
                    self.warning("skipping action '%s -- %s'" % (
                        date, action))
                    continue

                date = datetime.datetime.strptime(date, "%m/%d/%Y")
                date = date.date()

            bill.add_action(actor, action, date,
                            type=action_type(action))

            for vote_link in tr.xpath("td[4]/a"):
                self.scrape_vote(bill, actor, vote_link.attrib['href'])

    self.save_bill(bill)
def scrape_votes_old(self, bill, billname, session):
    vote_url = ('http://archives.legislature.state.oh.us/bills.cfm?ID=' +
                session + '_' + billname)

    page = self.get(vote_url).text
    page = lxml.html.fromstring(page)

    for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
        date = datetime.datetime.strptime(jlink.text, "%m/%d/%Y").date()

        details = jlink.xpath("string(../../../td[2])")

        chamber = details.split(" - ")[0]
        if chamber == 'House':
            chamber = 'lower'
        elif chamber == 'Senate':
            chamber = 'upper'
        else:
            raise ScrapeError("Bad chamber: %s" % chamber)

        motion = details.split(" - ")[1].split("\n")[0].strip()

        vote_row = jlink.xpath("../../..")[0].getnext()

        yea_div = vote_row.xpath(
            "td/font/div[contains(@id, 'Yea')]")[0]
        yeas = []
        for td in yea_div.xpath("table/tr/td"):
            name = td.xpath("string()")
            if name:
                yeas.append(name)

        no_div = vote_row.xpath(
            "td/font/div[contains(@id, 'Nay')]")[0]
        nays = []
        for td in no_div.xpath("table/tr/td"):
            name = td.xpath("string()")
            if name:
                nays.append(name)

        yes_count = len(yeas)
        no_count = len(nays)

        vote = Vote(chamber, date, motion, yes_count > no_count,
                    yes_count, no_count, 0)

        for yes in yeas:
            vote.yes(yes)
        for no in nays:
            vote.no(no)

        vote.add_source(vote_url)

        bill.add_vote(vote)
def scrape_events(self, chamber, session, event_id):
    url = '%s%s' % (self.upper_url, event_id)
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    rows = doc.xpath("//div[@id='WebPartWPQ2']")
    # some ids are empty
    if len(rows):
        table_data = rows[0].find('table')[1]

        for link in table_data.iterchildren('td'):
            td = link.xpath('//td[@class="ms-formbody"]')

            description = td[18].text
            when = td[19].text
            where = td[25].text
            type = td[27].text
            meeting_lead = td[28].text

            # the page renders 12-hour times, so %I (not %H) is assumed here
            when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = self._tz.localize(when)
            event_type = 'committee:meeting'
            kwargs = {"location": "State House"}
            if where is not None and where != "":
                kwargs['location'] = where
            event = Event(session, when, event_type, description, **kwargs)

            if td[20].text is None:
                participants = meeting_lead
            else:
                participants = td[20].text.split(';')
            if participants:
                for participant in participants:
                    name = participant.strip().replace('HON.', '', 1)
                    if name != "":
                        event.add_participant('committee', name,
                                              'committee', chamber=chamber)

            event.add_source(url)
            self.save_event(event)
    else:
        # hack: ids are not contiguous, so don't fail on gaps among the
        # early ids -- only stop once we're past the range in use
        if event_id > 1700:
            raise ScrapeError(
                "Parsing is done we are on future ids that are not used yet."
            )
def scrape(self, chamber, session):
    # check for abiword
    if os.system('which abiword') != 0:
        raise ScrapeError('abiword is required for PR scraping')

    year = session[0:4]
    self.base_url = ('http://www.oslpr.org/legislatura/tl%s/'
                     'tl_medida_print2.asp' % year)
    chamber_letter = {'lower': 'C', 'upper': 'S'}[chamber]
    for code, type in self.bill_types.iteritems():
        counter = itertools.count(1)
        for n in counter:
            bill_id = '%s%s%s' % (code, chamber_letter, n)
            try:
                self.scrape_bill(chamber, session, bill_id, type)
            except NoSuchBill:
                break
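# NoSuchBill is used above as a loop sentinel but isn't defined in this
# section; a minimal definition, assuming scrape_bill raises it when an
# incrementally-probed bill id has no page:
class NoSuchBill(Exception):
    """Raised when a probed bill id does not exist on the site."""
    pass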
def parse_exec_date(date_str):
    """
    Parse dates for executive actions.
    """
    match = re.search(r'(\w+ \d{1,2}, \d{4})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%B %d, %Y")

    match = re.search(r'(\d{1,2}/\d{1,2}/\d{4})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%m/%d/%Y")

    match = re.search(r'(\d{1,2}/\d{1,2}/\d{2})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%m/%d/%y")

    raise ScrapeError("Invalid executive action date: %s" % date_str)
def recap(self, data):
    """Extract bill ids from daily recap page.

    Splits page into sections, and returns list containing bill ids
    """
    # throw away everything before <body>
    start = data.index("<body>")
    stop = data.index("</body>", start)

    bill_id_exp = re.compile(r'>(?P<id>\w\. \d{1,4}?)</a> \(<a href=')
    billids = set()
    if stop >= 0 and stop > start:
        parts = re.compile("/cgi-bin/web_bh10.exe").split(data[start:stop])
        for part in parts[1:]:
            result = bill_id_exp.search(part)
            if result:
                bill_id = result.group('id')
                billids.add(bill_id)
        return billids
    raise ScrapeError("recap: bad format %s" % data)
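# A usage sketch for recap with a hypothetical fragment of a recap page
# (the real pages link each bill through /cgi-bin/web_bh10.exe, which is
# what the split above keys on):
def _demo_recap(scraper):
    sample = ("<html><body>"
              "<a href=/cgi-bin/web_bh10.exe?b1>H. 123</a> (<a href=x>)"
              "<a href=/cgi-bin/web_bh10.exe?b2>S. 45</a> (<a href=y>)"
              "</body></html>")
    assert scraper.recap(sample) == set(['H. 123', 'S. 45'])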
def parse_bill_xml(self, chamber, session, txt):
    root = lxml.etree.fromstring(txt)
    bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
    bill_title = root.findtext("caption")

    if session[2] == 'R':
        session = session[0:2]

    if bill_id[1] == 'B':
        bill_type = ['bill']
    elif bill_id[1] == 'R':
        bill_type = ['resolution']
    elif bill_id[1:3] == 'CR':
        bill_type = ['concurrent resolution']
    elif bill_id[1:3] == 'JR':
        bill_type = ['joint resolution']
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    for action in root.findall('actions/action'):
        act_date = datetime.datetime.strptime(action.findtext('date'),
                                              "%m/%d/%Y").date()

        extra = {}
        extra['action_number'] = action.find('actionNumber').text
        comment = action.find('comment')
        if comment is not None and comment.text:
            extra['comment'] = comment.text.strip()

        actor = {'H': 'lower',
                 'S': 'upper',
                 'E': 'executive'}[extra['action_number'][0]]

        desc = action.findtext('description').strip()

        if desc == 'Scheduled for public hearing on . . .':
            continue

        if desc == 'Amended':
            atype = 'amendment:passed'
        elif desc == 'Amendment(s) offered':
            atype = 'amendment:introduced'
        elif desc == 'Amendment amended':
            atype = 'amendment:amended'
        elif desc == 'Amendment withdrawn':
            atype = 'amendment:withdrawn'
        elif desc == 'Passed' or desc == 'Adopted':
            atype = 'bill:passed'
        elif re.match(r'^Received (by|from) the', desc):
            if 'Secretary of the Senate' not in desc:
                atype = 'bill:introduced'
            else:
                atype = 'other'
        elif desc.startswith('Sent to the Governor'):
            # But what if it gets lost in the mail?
            atype = 'governor:received'
        elif desc.startswith('Signed by the Governor'):
            atype = 'governor:signed'
        elif desc == 'Read first time':
            atype = ['bill:introduced', 'bill:reading:1']
            introduced = True
        elif desc == 'Read & adopted':
            atype = 'bill:passed'
        elif desc.startswith('Referred to'):
            atype = 'committee:referred'
        elif desc == "Filed":
            atype = 'bill:filed'
        else:
            atype = 'other'

        bill.add_action(actor, action.findtext('description'), act_date,
                        type=atype, **extra)

    for author in root.findtext('authors').split(' | '):
        if author != "":
            bill.add_sponsor('author', author)
    for coauthor in root.findtext('coauthors').split(' | '):
        if coauthor != "":
            bill.add_sponsor('coauthor', coauthor)
    for sponsor in root.findtext('sponsors').split(' | '):
        if sponsor != "":
            bill.add_sponsor('sponsor', sponsor)
    for cosponsor in root.findtext('cosponsors').split(' | '):
        if cosponsor != "":
            bill.add_sponsor('cosponsor', cosponsor)

    bill['subjects'] = []
    for subject in root.iterfind('subjects/subject'):
        bill['subjects'].append(subject.text.strip())

    return bill
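# The bill-type dispatch above recurs verbatim in the other TX parsers in
# this section; a sketch of how it could be factored out (the helper name
# is hypothetical, the codes are taken from the code above):
def classify_bill_id(bill_id):
    # bill_id looks like 'HB 1' / 'SJR 5'; the character(s) after the
    # chamber letter encode the type
    if bill_id[1] == 'B':
        return ['bill']
    elif bill_id[1] == 'R':
        return ['resolution']
    elif bill_id[1:3] == 'CR':
        return ['concurrent resolution']
    elif bill_id[1:3] == 'JR':
        return ['joint resolution']
    raise ScrapeError("Invalid bill_id: %s" % bill_id)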
def main():
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )
    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true',
                        dest='committees', default=False,
                        help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False,
                        help="fail immediately when encountering a"
                             " validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int,
                        dest='rpm', default=60)
    parser.add_argument('--timeout', action='store', type=int,
                        dest='timeout', default=10)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    configure_logging(args.verbose, args.state)

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != 17:  # EEXIST
            raise

    # write metadata
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: ' +
                                           str(e))

    with open(os.path.join(args.output_dir, 'state_metadata.json'),
              'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    if args.terms:
        for term in metadata['terms']:
            if term['name'] in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions or [])

    # determine chambers
    args.chambers = []
    if args.upper:
        args.chambers.append('upper')
    if args.lower:
        args.chambers.append('lower')
    if not args.chambers:
        args.chambers = ['upper', 'lower']

    if not (args.bills or args.legislators or args.votes or
            args.committees or args.events or args.alldata):
        raise ScrapeError("Must specify at least one of --bills, "
                          "--legislators, --committees, --votes, --events, "
                          "--alldata")

    if args.alldata:
        args.bills = True
        args.legislators = True
        args.votes = True
        args.committees = True

    if args.bills:
        _run_scraper(args.state, state, 'bills', args, metadata)
    if args.legislators:
        _run_scraper(args.state, state, 'legislators', args, metadata)
    if args.committees:
        _run_scraper(args.state, state, 'committees', args, metadata)
    if args.votes:
        _run_scraper(args.state, state, 'votes', args, metadata)
    if args.events:
        _run_scraper(args.state, state, 'events', args, metadata)
def scrape_bill(self, session, history_url):
    history_xml = self.get(history_url).content
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if (bill_title is None or
            "Bill does not exist" in history_xml):
        self.warning("Bill does not appear to exist")
        return
    bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    if bill_id[1] == 'B':
        bill_type = ['bill']
    elif bill_id[1] == 'R':
        bill_type = ['resolution']
    elif bill_id[1:3] == 'CR':
        bill_type = ['concurrent resolution']
    elif bill_id[1:3] == 'JR':
        bill_type = ['joint resolution']
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    bill.add_source(history_url)

    bill['subjects'] = []
    for subject in root.iterfind('subjects/subject'):
        bill['subjects'].append(subject.text.strip())

    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version(name=self.NAME_SLUGS[version[1][-5]],
                         url=version[1],
                         mimetype='text/html')

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document(name="Analysis ({})".format(
            self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            mimetype='text/html')

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document(name="Fiscal Note ({})".format(
            self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            mimetype='text/html')

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document(name="Witness List ({})".format(
            self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            mimetype='text/html')

    for action in root.findall('actions/action'):
        act_date = datetime.datetime.strptime(action.findtext('date'),
                                              "%m/%d/%Y").date()

        extra = {}
        extra['action_number'] = action.find('actionNumber').text
        comment = action.find('comment')
        if comment is not None and comment.text:
            extra['comment'] = comment.text.strip()

        actor = {'H': 'lower',
                 'S': 'upper',
                 'E': 'executive'}[extra['action_number'][0]]

        desc = action.findtext('description').strip()

        if desc == 'Scheduled for public hearing on . . .':
            self.warning("Skipping public hearing action with no date")
            continue

        introduced = False

        if desc == 'Amended':
            atype = 'amendment:passed'
        elif desc == 'Amendment(s) offered':
            atype = 'amendment:introduced'
        elif desc == 'Amendment amended':
            atype = 'amendment:amended'
        elif desc == 'Amendment withdrawn':
            atype = 'amendment:withdrawn'
        elif desc == 'Passed' or desc == 'Adopted':
            atype = 'bill:passed'
        elif re.match(r'^Received (by|from) the', desc):
            if 'Secretary of the Senate' not in desc:
                atype = 'bill:introduced'
            else:
                atype = 'bill:filed'
        elif desc.startswith('Sent to the Governor'):
            # But what if it gets lost in the mail?
            atype = 'governor:received'
    elif desc.startswith('Signed by the Governor'):
            atype = 'governor:signed'
        elif desc == 'Vetoed by the Governor':
            atype = 'governor:vetoed'
        elif desc == 'Read first time':
            atype = ['bill:introduced', 'bill:reading:1']
            introduced = True
        elif desc == 'Read & adopted':
            atype = ['bill:passed']
            if not introduced:
                introduced = True
                atype.append('bill:introduced')
        elif desc == "Passed as amended":
            atype = 'bill:passed'
        elif (desc.startswith('Referred to') or
              desc.startswith("Recommended to be sent to ")):
            atype = 'committee:referred'
        elif desc == "Reported favorably w/o amendment(s)":
            atype = 'committee:passed'
        elif desc == "Filed":
            atype = 'bill:filed'
        elif desc == 'Read 3rd time':
            atype = 'bill:reading:3'
        elif desc == 'Read 2nd time':
            atype = 'bill:reading:2'
        elif desc.startswith('Reported favorably'):
            atype = 'committee:passed:favorable'
        else:
            atype = 'other'

        if 'committee:referred' in atype:
            repls = ['Referred to', "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            extra['committees'] = ctty

        bill.add_action(actor, action.findtext('description'), act_date,
                        type=atype, **extra)

    for author in root.findtext('authors').split(' | '):
        if author != "":
            bill.add_sponsor('primary', author, official_type='author')
    for coauthor in root.findtext('coauthors').split(' | '):
        if coauthor != "":
            bill.add_sponsor('cosponsor', coauthor,
                             official_type='coauthor')
    for sponsor in root.findtext('sponsors').split(' | '):
        if sponsor != "":
            bill.add_sponsor('primary', sponsor, official_type='sponsor')
    for cosponsor in root.findtext('cosponsors').split(' | '):
        if cosponsor != "":
            bill.add_sponsor('cosponsor', cosponsor,
                             official_type='cosponsor')

    self.save_bill(bill)
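# The NAME_SLUGS lookups above key on url[-5], the character just before
# '.htm'. A sketch of that indexing, assuming a URL shaped like
# '.../billtext/html/HB00001I.htm' and a hypothetical slug table (the real
# NAME_SLUGS lives on the scraper class):
def _demo_name_slug(url='http://example.com/billtext/html/HB00001I.htm'):
    NAME_SLUGS = {'I': 'Introduced', 'E': 'Engrossed', 'F': 'Enrolled'}
    # url[-5] is 'I' here, so this returns 'Introduced'
    return NAME_SLUGS[url[-5]]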
def scrape_bill(self, bill, url):
    page = self.urlopen(url)
    # normalize non-breaking spaces before parsing (the original
    # replacement literal was garbled in transit; '&nbsp;' is assumed)
    page = page.replace('&nbsp;', ' ')
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    bill.add_source(url)

    for link in page.xpath("//b[text()='Bill Text:']/"
                           "following-sibling::blockquote[1]//a"):
        if link.attrib['href'].endswith('pdf'):
            mimetype = 'application/pdf'
        elif link.attrib['href'].endswith('htm'):
            mimetype = 'text/html'
        bill.add_version(link.text, link.attrib['href'],
                         mimetype=mimetype)

    more_sponsor_link = page.xpath("//a[text()='More Sponsors']")
    if more_sponsor_link:
        sponsor_url = more_sponsor_link[0].attrib['href']
        self.scrape_sponsors(bill, sponsor_url)
    else:
        for b in page.xpath("//td[text()='Sponsor(s):']/../td[2]/b"):
            bill.add_sponsor("primary", b.text)

    for tr in page.xpath("""
    //b[text()='Detailed Status:']/
    following-sibling::blockquote[1]/table/tr""")[1:]:

        action = tr.xpath("string(td[3])").strip()

        match = re.search('(to|by) Governor on (.*)', action)
        if match:
            date = parse_exec_date(match.group(2).strip()).date()
            actor = 'executive'
        else:
            if tr.attrib['bgcolor'] == 'Salmon':
                actor = 'lower'
            elif tr.attrib['bgcolor'] == 'LightGreen':
                actor = 'upper'
            else:
                raise ScrapeError("Invalid row color: %s" %
                                  tr.attrib['bgcolor'])

            date = tr.xpath("string(td[1])")
            try:
                date = re.search(r"\d\d?/\d\d?/\d{4}", date).group(0)
            except AttributeError:
                # No date, skip
                self.warning("skipping action '%s -- %s'" % (date, action))
                continue

            date = datetime.datetime.strptime(date, "%m/%d/%Y")
            date = date.date()

        types, attrs = self.categorizer.categorize(action)
        action = dict(actor=actor, action=action, date=date, type=types)
        action.update(**attrs)
        bill.add_action(**action)

        for vote_link in tr.xpath("td[4]/a"):
            self.scrape_vote(bill, actor, vote_link.attrib['href'])

    # If nearly all of the bill attributes but the title are blank,
    # this is a bad bill. See Issue #166.
    if all(len(bill[x]) == 0 for x in
           ('votes', 'alternate_titles', 'sponsors', 'actions',
            'versions', 'documents')):
        return False

    self.save_bill(bill)
    return True
def main():
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str,
                            help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke IPDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees', 'votes',
                    'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const',
                              dest='types', const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false',
                            dest='strict', default=True,
                            help="don't fail immediately when encountering"
                                 " a validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(type, value, tb):
                traceback.print_exception(type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions),
                   ','.join(args.types), ','.join(args.sessions),
                   ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is
                        # the same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since
            # import already writes to the DB, we might as well throw
            # this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
def scrape(self, chamber, session):
    # URL building
    if chamber == 'upper':
        url_chamber_name = 'senate'
        norm_chamber_name = 'Senate'
    else:
        url_chamber_name = 'house'
        norm_chamber_name = 'House'

    assembly_url = '/assembly/%s' % session

    chamber_url = '/bill-text/%s-bill.html' % (url_chamber_name)

    list_url = self.site_root + assembly_url + chamber_url

    # Parsing
    with self.urlopen(list_url) as data:
        soup = self.parser.parse(data)

        if not soup:
            raise ScrapeError('Failed to parse legislative list page.')

        table = soup.find('table', summary=norm_chamber_name + ' Bills')

        bill_links = table.findAll('a', href=re.compile('bill-actions'))
        indexed_bills = {}

        for link in bill_links:
            # Populate base attributes
            attributes = {
                'session': session,
                'chamber': chamber,
            }

            bill_number = link.contents[0]

            if not re.match('^[0-9]{4}$', bill_number):
                raise ScrapeError('Bill number not in expected format.')

            # ND bill prefixes are coded numerically
            if bill_number[0] == '1':
                bill_prefix = 'HB'
            elif bill_number[0] == '2':
                bill_prefix = 'SB'
            elif bill_number[0] == '3':
                bill_prefix = 'HCR'
            elif bill_number[0] == '4':
                bill_prefix = 'SCR'
            elif bill_number[0] == '5':
                bill_prefix = 'HR'
            elif bill_number[0] == '6':
                bill_prefix = 'SR'
            elif bill_number[0] == '7':
                bill_prefix = 'HMR'
            elif bill_number[0] == '8':
                bill_prefix = 'SMR'
            else:
                # guard against an unmapped leading digit leaving
                # bill_prefix unbound
                raise ScrapeError('Unknown bill prefix code: %s' %
                                  bill_number)

            attributes['bill_id'] = bill_prefix + ' ' + bill_number

            # Skip duplicates (bill is listed once for each version)
            if attributes['bill_id'] in indexed_bills.keys():
                continue

            self.debug(attributes['bill_id'])

            # Parse details page
            attributes.update(
                self.scrape_bill_details(assembly_url, bill_number))

            # Create bill
            bill = Bill(**attributes)

            # Parse actions
            (actions, actions_url) = self.scrape_bill_actions(
                assembly_url, bill_number, session)
            bill.add_source(actions_url)
            for action in actions:
                bill.add_action(**action)

            # Parse versions
            (versions, versions_url) = self.scrape_bill_versions(
                assembly_url, bill_number)
            bill.add_source(versions_url)
            for version in versions:
                bill.add_version(**version)

            # Add bill to dictionary, indexed by its id
            indexed_bills[attributes['bill_id']] = bill

        # Parse sponsorship data
        (sponsors, sponsors_url) = self.scrape_bill_sponsors(assembly_url)
        for bill_id, sponsor_list in sponsors.items():
            for sponsor in sponsor_list:
                # It's possible a bill was misnamed somewhere... but that's
                # not a good enough reason to error out
                if bill_id in indexed_bills.keys():
                    bill = indexed_bills[bill_id]
                    bill.add_sponsor(**sponsor)
                    bill.add_source(sponsors_url)

        # Save bill
        for bill in indexed_bills.values():
            self.save_bill(bill)
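# The numeric-prefix dispatch above is a natural fit for a lookup table; a
# sketch of the equivalent mapping (same codes as the code above, purely a
# style alternative; the helper name is hypothetical):
ND_BILL_PREFIXES = {
    '1': 'HB', '2': 'SB', '3': 'HCR', '4': 'SCR',
    '5': 'HR', '6': 'SR', '7': 'HMR', '8': 'SMR',
}

def bill_prefix_for(bill_number):
    try:
        return ND_BILL_PREFIXES[bill_number[0]]
    except KeyError:
        raise ScrapeError('Unknown bill prefix code: %s' % bill_number)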
def get_page(self, url):
    with self.urlopen(url) as html:
        page = lxml.html.fromstring(html)
        return (page, html)
def scrape(self, chamber, year):
    """
    Scrape the ND legislators seated in a given chamber during a given
    year.
    """
    # Error checking
    if year not in metadata['session_details']:
        raise NoDataForPeriod(year)

    # No legislator data for 1997 (though other data is available)
    if year == '1997':
        raise NoDataForPeriod(year)

    # URL building
    if chamber == 'upper':
        url_chamber_name = 'senate'
        norm_chamber_name = 'Senate'
        url_member_name = 'senators'
    else:
        url_chamber_name = 'house'
        norm_chamber_name = 'House'
        url_member_name = 'representatives'

    assembly_url = '/assembly/%i-%s/%s' % (
        metadata['session_details'][str(year)]['number'],
        year,
        url_chamber_name)

    list_url = \
        self.site_root + \
        assembly_url + \
        '/members/last-name.html'

    # Parsing
    with self.urlopen(list_url) as data:
        soup = self.parser.parse(data)

        if not soup:
            raise ScrapeError('Failed to parse legislative list page.')

        header = soup.find('h2')

        if not header:
            raise ScrapeError('Legislative list header element not found.')

        party_images = {'/images/donkey.gif': 'Democrat',
                        '/images/elephant.gif': 'Republican'}
        for row in header.findNextSibling('table').findAll('tr'):
            cells = row.findAll('td')
            party = party_images[cells[0].img['src']]
            name = map(lambda x: x.strip(),
                       cells[1].a.contents[0].split(', '))
            name.reverse()
            name = ' '.join(name)
            district = re.findall(r'District (\d+)',
                                  cells[2].contents[0])[0]

            attributes = {
                'session': year,
                'chamber': chamber,
                'district': district,
                'party': party,
                'full_name': name,
            }

            split_name = name.split(' ')
            if len(split_name) > 2:
                attributes['first_name'] = split_name[0]
                attributes['middle_name'] = split_name[1].strip(' .')
                attributes['last_name'] = split_name[2]
            else:
                attributes['first_name'] = split_name[0]
                attributes['middle_name'] = u''
                attributes['last_name'] = split_name[1]

            # we can get some more data..
            bio_url = self.site_root + cells[1].a['href']
            try:
                attributes.update(self.scrape_legislator_bio(bio_url))
            except urllib2.HTTPError:
                self.log("failed to fetch %s" % bio_url)

            self.debug("attributes: %d", len(attributes))
            self.debug(attributes)

            # Save
            legislator = Legislator(**attributes)
            legislator.add_source(bio_url)
            self.save_legislator(legislator)
def parse_bill_xml(self, chamber, session, txt):
    root = lxml.etree.fromstring(txt.bytes)
    bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
    bill_title = root.findtext("caption")

    if session[2] == 'R':
        session = session[0:2]

    if bill_id[1] == 'B':
        bill_type = ['bill']
    elif bill_id[1] == 'R':
        bill_type = ['resolution']
    elif bill_id[1:3] == 'CR':
        bill_type = ['concurrent resolution']
    elif bill_id[1:3] == 'JR':
        bill_type = ['joint resolution']
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    versions = root.xpath("//versions")
    for version in versions:
        versionz = version.xpath(".//version")
        for v in versionz:
            description = v.xpath(".//versionDescription")[0].text
            html_url = v.xpath(".//WebHTMLURL")[0].text
            bill.add_version(description, html_url, 'text/html')

    for action in root.findall('actions/action'):
        act_date = datetime.datetime.strptime(action.findtext('date'),
                                              "%m/%d/%Y").date()

        extra = {}
        extra['action_number'] = action.find('actionNumber').text
        comment = action.find('comment')
        if comment is not None and comment.text:
            extra['comment'] = comment.text.strip()

        actor = {'H': 'lower',
                 'S': 'upper',
                 'E': 'executive'}[extra['action_number'][0]]

        desc = action.findtext('description').strip()

        if desc == 'Scheduled for public hearing on . . .':
            continue

        introduced = False

        if desc == 'Amended':
            atype = 'amendment:passed'
        elif desc == 'Amendment(s) offered':
            atype = 'amendment:introduced'
        elif desc == 'Amendment amended':
            atype = 'amendment:amended'
        elif desc == 'Amendment withdrawn':
            atype = 'amendment:withdrawn'
        elif desc == 'Passed' or desc == 'Adopted':
            atype = 'bill:passed'
        elif re.match(r'^Received (by|from) the', desc):
            if 'Secretary of the Senate' not in desc:
                atype = 'bill:introduced'
            else:
                atype = 'bill:filed'
        elif desc.startswith('Sent to the Governor'):
            # But what if it gets lost in the mail?
            atype = 'governor:received'
        elif desc.startswith('Signed by the Governor'):
            atype = 'governor:signed'
        elif desc == 'Vetoed by the Governor':
            atype = 'governor:vetoed'
        elif desc == 'Read first time':
            atype = ['bill:introduced', 'bill:reading:1']
            introduced = True
        elif desc == 'Read & adopted':
            atype = ['bill:passed']
            if not introduced:
                introduced = True
                atype.append('bill:introduced')
        elif desc == "Passed as amended":
            atype = 'bill:passed'
        elif (desc.startswith('Referred to') or
              desc.startswith("Recommended to be sent to ")):
            atype = 'committee:referred'
        elif desc == "Reported favorably w/o amendment(s)":
            atype = 'committee:passed'
        elif desc == "Filed":
            atype = 'bill:filed'
        elif desc == 'Read 3rd time':
            atype = 'bill:reading:3'
        elif desc == 'Read 2nd time':
            atype = 'bill:reading:2'
        elif desc.startswith('Reported favorably'):
            atype = 'committee:passed:favorable'
        else:
            atype = 'other'

        if 'committee:referred' in atype:
            repls = ['Referred to', "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            extra['committee'] = ctty

        bill.add_action(actor, action.findtext('description'), act_date,
                        type=atype, **extra)

    for author in root.findtext('authors').split(' | '):
        if author != "":
            bill.add_sponsor('author', author)
    for coauthor in root.findtext('coauthors').split(' | '):
        if coauthor != "":
            bill.add_sponsor('coauthor', coauthor)
    for sponsor in root.findtext('sponsors').split(' | '):
        if sponsor != "":
            bill.add_sponsor('sponsor', sponsor)
    for cosponsor in root.findtext('cosponsors').split(' | '):
        if cosponsor != "":
            bill.add_sponsor('cosponsor', cosponsor)

    bill['subjects'] = []
    for subject in root.iterfind('subjects/subject'):
        bill['subjects'].append(subject.text.strip())

    return bill
def scrape_details(self, bill_detail_url, session, chamber, bill_id, page):
    data = page

    pat1 = re.compile(r'</FORM>')
    results = pat1.search(data)
    if not results:
        raise ScrapeError("scrape_details(1) - unable to parse |%s|" %
                          bill_detail_url)

    pre_start = page.find("<pre>", results.start())
    if pre_start == -1:
        self.warning(
            "scrape_details(2) - unable to parse (no <pre>) |%s|\n|%s|" %
            (bill_detail_url, page))
        return

    pre_stop = page.find("</pre>", pre_start)
    if pre_stop == -1:
        raise ScrapeError(
            "scrape_details(3) - unable to parse (no </pre>) %s" %
            bill_detail_url)

    pre_section = page[pre_start:pre_stop]
    data = pre_section

    vurl = None

    action_line_re = re.compile(r'(\d\d/\d\d/\d\d)\s+(\w+)\s+(.+)')

    pat2 = re.compile(r' By ')
    results = pat2.search(data)
    if results is not None:
        bystuff = data[results.start():results.end()]
        data = data[results.end():]

    pat3 = re.compile(r'</b>')
    results1 = pat3.search(data)

    newspon = []
    if results is not None and results1 is not None:
        spondata = data[:results1.start()]
        mysponsors = sponsorsToList(spondata)
        for s in mysponsors:
            newspon.append(s)
        data = data[results1.end():]

    apat = re.compile(r">(H|S) (\d*)<")
    billpat = re.compile(r"(\d+)")
    bill_number = billpat.search(bill_id).group(0)

    (similar_bills, summary, after_summary,
     vurl) = self.split_page_into_parts(data, session, bill_number)

    bill_summary = summary.strip().decode('utf8', 'ignore')

    bill = Bill(session, chamber, bill_id, bill_summary,
                type=bill_type(bill_summary))

    linenum = 0
    for line in after_summary.splitlines():
        # strip the trailing parenthetical from the action line
        action_line = line.partition("(")[0].strip()
        r = action_line_re.search(action_line)

        if r:
            the_date = r.group(1)
            action_chamber = r.group(2)
            action = r.group(3)

            date = datetime.datetime.strptime(the_date, "%m/%d/%y")
            date = date.date()

            t = action_type(action)
            if t == ['other']:
                self.debug("OTHERACTION: bill %s %d Text[%s] line[%s]" %
                           (bill_id, linenum, action, line))
            else:
                self.debug("ACTION: %s %d dt|ch|action [%s|%s|%s] [%s]" %
                           (bill_id, linenum, the_date, action_chamber,
                            action, str(t)))

            bill.add_action(chamber, action, date, t)
        elif len(line) > 0:
            self.debug("Skipping line %d [%s] line:[%s]" %
                       (linenum, bill_id, line))
        linenum += 1

    if similar_bills:
        bill['similar'] = similar_bills

    bill.add_source(bill_detail_url)

    for sponsor in newspon:
        bill.add_sponsor("sponsor", sponsor)

    if vurl:
        try:
            self.scrape_vote_history(vurl, chamber, bill, bill_id)
            bill.add_source(vurl)
            self.debug("Scraped votes: (chamber=%s,bill=%s,url=%s)" %
                       (chamber, bill_id, vurl))
        except Exception as error:
            self.warning(
                "Failed to scrape votes: chamber=%s bill=%s vurl=%s %s" %
                (chamber, bill_id, vurl, traceback.format_exc()))

    self.save_bill(bill)
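# A quick sketch of what action_line_re matches, using a hypothetical line
# from the <pre> block; the partition("(") step above strips any trailing
# parenthetical before the regex runs:
def _demo_action_line():
    line = "05/14/09   Senate  Introduced and read first time (...)"
    action_line = line.partition("(")[0].strip()
    r = re.compile(r'(\d\d/\d\d/\d\d)\s+(\w+)\s+(.+)').search(action_line)
    assert r.group(1) == "05/14/09"
    assert r.group(2) == "Senate"
    assert r.group(3) == "Introduced and read first time"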
def scrape_bill(self, bill_url, chamber, session):
    with self.urlopen(bill_url) as text:
        if "Specified Bill could not be found" in text:
            return False
        page = lxml.html.fromstring(text)
        page.make_links_absolute(bill_url)

        bill_id = page.xpath("string(//h2)").split()[0]

        summary = page.xpath(
            "string(//*[starts-with(text(), 'Summary: ')])")
        summary = summary.replace('Summary: ', '')

        match = re.match(
            r"^([^:]+): "
            r"((\(Constitutional [aA]mendment\) )?[^(]+)",
            summary)

        if match:
            subjects = [match.group(1).strip()]
            title = match.group(2).strip()
        else:
            raise ScrapeError("Bad title")

        if bill_id.startswith('SB') or bill_id.startswith('HB'):
            bill_type = ['bill']
        elif bill_id.startswith('SR') or bill_id.startswith('HR'):
            bill_type = ['resolution']
        elif bill_id.startswith('SCR') or bill_id.startswith('HCR'):
            bill_type = ['concurrent resolution']
        else:
            raise ScrapeError("Invalid bill ID format: %s" % bill_id)

        if title.startswith("(Constitutional Amendment)"):
            bill_type.append('constitutional amendment')
            title = title.replace('(Constitutional Amendment) ', '')

        bill = Bill(session, chamber, bill_id, title,
                    subjects=subjects, type=bill_type)
        bill.add_source(bill_url)

        history_link = page.xpath("//a[text() = 'History']")[0]
        history_url = history_link.attrib['href']
        self.scrape_history(bill, history_url)

        authors_link = page.xpath("//a[text() = 'Authors']")[0]
        authors_url = authors_link.attrib['href']
        self.scrape_authors(bill, authors_url)

        try:
            versions_link = page.xpath(
                "//a[text() = 'Text - All Versions']")[0]
            versions_url = versions_link.attrib['href']
            self.scrape_versions(bill, versions_url)
            for doc in ["Notes", "Digest", "Amendments", "Misc"]:
                doc_link = page.xpath("//a[text() = '%s']" % doc)[0]
                doc_url = doc_link.attrib['href']
                self.scrape_documents(bill, doc_url)
        except IndexError:
            # Only current version
            try:
                version_link = page.xpath(
                    "//a[text() = 'Text - Current']")[0]
                version_url = version_link.attrib['href']
                bill.add_version("%s Current" % bill_id, version_url,
                                 on_duplicate="use_old")
            except IndexError:
                # Some bills don't have any versions :(
                pass

        try:
            votes_link = page.xpath("//a[text() = 'Votes']")[0]
            self.scrape_votes(bill, votes_link.attrib['href'])
        except IndexError:
            # Some bills don't have any votes
            pass

        self.save_bill(bill)

    return True
def scrape_vote(self, bill, date, url):
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)

    header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    location = header.split(', ')[1]

    if location.startswith('House'):
        chamber = 'lower'
    elif location.startswith('Senate'):
        chamber = 'upper'
    else:
        # report the unparsed location ('chamber' is not yet bound here)
        raise ScrapeError("Bad chamber: %s" % location)

    committee = ' '.join(location.split(' ')[1:]).strip()
    if not committee or committee.startswith('of Representatives'):
        committee = None

    motion = ', '.join(header.split(', ')[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote
        return

    yes_count = int(page.xpath("string(//td[contains(@id, 'tdAyes')])"))
    no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
    excused_count = int(
        page.xpath("string(//td[contains(@id, 'tdExcused')])"))
    absent_count = int(
        page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
    other_count = excused_count + absent_count

    passed = yes_count > no_count

    if motion.startswith('Do Pass'):
        type = 'passage'
    elif motion == 'Concurred in amendments':
        type = 'amendment'
    elif motion == 'Veto override':
        type = 'veto_override'
    else:
        type = 'other'

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote['type'] = type

    if committee:
        vote['committee'] = committee

    vote.add_source(url)

    for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
        if td.text in ('Aye', 'Yea'):
            vote.yes(td.getprevious().text.strip())
        elif td.text == 'Nay':
            vote.no(td.getprevious().text.strip())
        elif td.text in ('Excused', 'Absent'):
            vote.other(td.getprevious().text.strip())

    bill.add_vote(vote)