def scan_votes(abbr): sessions = defaultdict(_vote_report_dict) # load exception data into sets of ids indexed by exception type quality_exceptions = get_quality_exceptions(abbr) for vote in db.votes.find({settings.LEVEL_FIELD: abbr}): session_d = sessions[vote['session']] session_d['vote_count'] += 1 if vote['passed']: session_d['_passed_vote_count'] += 1 session_d['votes_per_chamber'][vote['chamber']] += 1 if not vote.get('type'): logger.warning('vote %s missing type' % vote['_id']) continue session_d['votes_per_type'][vote.get('type')] += 1 if not vote.get('date'): logger.warning('vote %s missing date' % vote['_id']) continue session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1 # roll calls has_rollcalls = False for rc in (vote['yes_votes'] + vote['no_votes'] + vote['other_votes']): has_rollcalls = True session_d['_rollcall_count'] += 1 if rc.get('leg_id'): session_d['_rollcalls_with_leg_id_count'] += 1 else: # keep missing leg_ids session_d['unmatched_voters'].add( (term_for_session(abbr, vote['session']), vote['chamber'], rc['name']) ) # check counts if any rollcalls are present if has_rollcalls: if (len(vote['yes_votes']) != vote['yes_count'] and vote['vote_id'] not in quality_exceptions['votes:bad_yes_count']): session_d['bad_vote_counts'].add(vote['vote_id']) if (len(vote['no_votes']) != vote['no_count'] and vote['vote_id'] not in quality_exceptions['votes:bad_no_count']): session_d['bad_vote_counts'].add(vote['vote_id']) if (len(vote['other_votes']) != vote['other_count'] and vote['vote_id'] not in quality_exceptions['votes:bad_other_count']): session_d['bad_vote_counts'].add(vote['vote_id']) # do logging of unnecessary exceptions for qe_type, qes in quality_exceptions.iteritems(): if qes: logger.warning('unnecessary {0} exceptions for {1} votes: \n {2}' .format(qe_type, len(qes), '\n '.join(qes))) return {'sessions': sessions}
def pagerank(state, session, chamber, d_factor=0.85):
    """Score a chamber's legislators with the PageRank algorithm.

    Builds the legislator adjacency matrix for the session, column-normalizes
    it, runs 100 power-iteration steps with damping factor *d_factor*, and
    returns the leg_indexes mapping with each legislator's index replaced by
    their final score.
    """
    term = utils.term_for_session(state, session)
    leg_indexes = generate_leg_indexes(state, term, chamber)
    matrix = generate_adjacency_matrix(state, session, chamber, leg_indexes)
    n = len(leg_indexes)

    # Normalize each column to sum to 1; all-zero columns are left as-is.
    for col in xrange(0, n):
        total = matrix[:, col].sum()
        if total:
            matrix[:, col] = matrix[:, col] / total

    # Uniform teleportation term plus damped transition matrix (invariant
    # across iterations, so build it once).
    teleport = ((1.0 - d_factor) / n) * numpy.ones((n, n))
    transition = d_factor * matrix + teleport

    # Power iteration, renormalizing the score vector each pass.
    scores = numpy.ones(n) / n
    for _ in xrange(0, 100):
        scores = numpy.dot(transition, scores)
        scores /= scores.sum()

    # Swap each legislator's matrix index for their computed score.
    for leg_id in leg_indexes.keys():
        leg_indexes[leg_id] = scores[leg_indexes[leg_id]]
    return leg_indexes
def __init__(self, scraper, session, chamber, details):
    """Unpack scraped NY bill details and set up bill state.

    *details* is a 7-tuple: (senate_url, assembly_url, bill_chamber,
    bill_type, bill_id, title, bill_id_parts).
    """
    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, bill_id_parts) = details
    self.scraper, self.session, self.chamber = scraper, session, chamber
    self.data = {}
    self.bill = Bill(session, bill_chamber, bill_id, title, type=bill_type)
    self.term = term_for_session('ny', session)
    # Record the metadata term containing this session (last match wins).
    for term_info in self.metadata['terms']:
        if session in term_info['sessions']:
            self.termdata = term_info
            self.term_start_year = term_info['start_year']
    self.assembly_url, self.senate_url = assembly_url, senate_url
    self.bill_chamber, self.bill_type = bill_chamber, bill_type
    self.bill_id, self.title = bill_id, title
    self.letter, self.number, self.version = bill_id_parts
    self.urls = Urls(scraper=self.scraper,
                     urls={'assembly': assembly_url,
                           'senate': senate_url})
def __init__(self, scraper, session, chamber, details):
    """Initialize a NY bill wrapper from a scraped details 7-tuple."""
    senate_url = details[0]
    assembly_url = details[1]
    bill_chamber = details[2]
    bill_type = details[3]
    bill_id = details[4]
    title = details[5]
    bill_id_parts = details[6]
    self.scraper = scraper
    self.session = session
    self.chamber = chamber
    self.data = {}
    self.bill = Bill(session, bill_chamber, bill_id, title, type=bill_type)
    self.term = term_for_session('ny', session)
    # Pick up the term metadata for this session; if several terms list
    # the session, the last one encountered wins.
    for entry in self.metadata['terms']:
        if session in entry['sessions']:
            self.termdata = entry
            self.term_start_year = entry['start_year']
    self.assembly_url = assembly_url
    self.senate_url = senate_url
    self.bill_chamber = bill_chamber
    self.bill_type = bill_type
    self.bill_id = bill_id
    self.title = title
    self.letter, self.number, self.version = bill_id_parts
    self.urls = Urls(scraper=self.scraper,
                     urls={'senate': senate_url,
                           'assembly': assembly_url})
def scrape(self, session, chambers):
    """Scrape every bill for the given NY session.

    Resolves the metadata term for the session, then hands each grouped
    set of bill versions to scrape_bill.
    """
    wanted = term_for_session('ny', session)
    # Locate the matching term entry; loop variable intentionally leaks.
    for term_entry in self.metadata['terms']:
        if term_entry['name'] == wanted:
            break
    self.term = term_entry
    for grouped_versions in self.yield_grouped_versions():
        self.scrape_bill(session, grouped_versions)
def scrape(self, session, chambers):
    """Scrape all bills for one NY session, version-group by version-group."""
    target_name = term_for_session("ny", session)
    # Scan the metadata terms for the one matching this session's term id.
    for candidate in self.metadata["terms"]:
        if candidate["name"] == target_name:
            break
    self.term = candidate
    for billset in self.yield_grouped_versions():
        self.scrape_bill(session, billset)
def scan_votes(abbr): sessions = defaultdict(_vote_report_dict) # load exception data into sets of ids indexed by exception type quality_exceptions = get_quality_exceptions(abbr) for vote in db.votes.find({settings.LEVEL_FIELD: abbr}): session_d = sessions[vote['session']] session_d['vote_count'] += 1 if vote['passed']: session_d['_passed_vote_count'] += 1 session_d['votes_per_chamber'][vote['chamber']] += 1 if not vote.get('type'): logger.warning('vote %s missing type' % vote['_id']) continue session_d['votes_per_type'][vote.get('type')] += 1 if not vote.get('date'): logger.warning('vote %s missing date' % vote['_id']) continue session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1 # roll calls has_rollcalls = False for rc in (vote['yes_votes'] + vote['no_votes'] + vote['other_votes']): has_rollcalls = True session_d['_rollcall_count'] += 1 if rc.get('leg_id'): session_d['_rollcalls_with_leg_id_count'] += 1 else: # keep missing leg_ids session_d['unmatched_voters'].add( (term_for_session(abbr, vote['session']), vote['chamber'], rc['name'])) # check counts if any rollcalls are present if has_rollcalls: if (len(vote['yes_votes']) != vote['yes_count'] and vote['vote_id'] not in quality_exceptions['votes:bad_yes_count']): session_d['bad_vote_counts'].add(vote['vote_id']) if (len(vote['no_votes']) != vote['no_count'] and vote['vote_id'] not in quality_exceptions['votes:bad_no_count']): session_d['bad_vote_counts'].add(vote['vote_id']) if (len(vote['other_votes']) != vote['other_count'] and vote['vote_id'] not in quality_exceptions['votes:bad_other_count']): session_d['bad_vote_counts'].add(vote['vote_id']) # do logging of unnecessary exceptions for qe_type, qes in quality_exceptions.iteritems(): if qes: logger.warning( 'unnecessary {0} exceptions for {1} votes: \n {2}'.format( qe_type, len(qes), '\n '.join(qes))) return {'sessions': sessions}
def legislator_pagerank(abbr, session, chamber, d_factor=0.85):
    """Run PageRank over a chamber's legislator adjacency matrix.

    Returns the leg_indexes mapping with each legislator's matrix index
    replaced by their PageRank score.
    """
    term = utils.term_for_session(abbr, session)
    indexes = generate_leg_indexes(abbr, term, chamber)
    adjacency = generate_adjacency_matrix(abbr, session, chamber, indexes)
    scores = pagerank(adjacency, d_factor)
    # Substitute each index with the corresponding score in-place.
    for key in indexes.keys():
        indexes[key] = scores[indexes[key]]
    return indexes
def legislator_pagerank(state, session, chamber, d_factor=0.85):
    """Map each legislator id in a chamber to its PageRank score."""
    session_term = utils.term_for_session(state, session)
    leg_indexes = generate_leg_indexes(state, session_term, chamber)
    matrix = generate_adjacency_matrix(state, session, chamber, leg_indexes)
    ranked = pagerank(matrix, d_factor)
    # Overwrite the stored matrix index of every legislator with the
    # score found at that index.
    for leg_id in leg_indexes.keys():
        leg_indexes[leg_id] = ranked[leg_indexes[leg_id]]
    return leg_indexes
def scrape(self, session, chambers):
    """Scrape and save every bill in a NY session via the Open Legislation API."""
    self.api_client = OpenLegislationAPIClient(self)
    target = term_for_session('ny', session)
    # Search newest-first for the term matching this session and keep
    # its start year for later URL construction.
    for entry in reversed(self.metadata['terms']):
        if entry['name'] == target:
            self.term_start_year = entry['start_year']
            break
    for raw_bill in self._generate_bills(session):
        self.save_bill(self._scrape_bill(session, raw_bill))
def __init__(self, scraper, session, bill, details):
    """Initialize shared bill-page state from a scraped details 7-tuple."""
    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, bill_id_parts) = details
    self.bill = bill
    self.bill_id = bill_id
    # This works on the assumption that the metadata term ID is
    # only the start year.
    self.term_start_year = term_for_session('ny', session)
    self.letter, self.number, self.version = bill_id_parts
    base = 'http://assembly.state.ny.us/leg/?default_fld='
    self.shared_url = base + '&bn={}&term={}'.format(
        self.bill_id, self.term_start_year)
    self.urls = Urls(scraper=scraper,
                     urls={'senate': senate_url,
                           'assembly': assembly_url})
def __init__(self, scraper, session, bill, details):
    """Store bill identifiers and URL helpers for one NY bill page."""
    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, bill_id_parts) = details
    self.bill = bill
    self.bill_id = bill_id
    # This works on the assumption that the metadata term ID is
    # only the start year.
    self.term_start_year = term_for_session('ny', session)
    self.letter, self.number, self.version = bill_id_parts
    self.shared_url = (
        'http://assembly.state.ny.us/leg/?default_fld=&bn={}&term={}'
        .format(self.bill_id, self.term_start_year))
    self.urls = Urls(scraper=scraper, urls={'assembly': assembly_url,
                                            'senate': senate_url})
def __init__(self, scraper, session, chamber, url, doc, bill_type,
             bill_id, title, bill_id_parts):
    """Set up scraped-page state for one NY bill, then parse it.

    Ends by calling self._build(), so all attributes it may need are
    assigned first.
    """
    self.scraper, self.session = scraper, session
    self.term = term_for_session("ny", session)
    # Find the metadata term containing this session (last match wins).
    for entry in self.metadata["terms"]:
        if session in entry["sessions"]:
            self.termdata = entry
            self.term_start_year = entry["start_year"]
    self.chamber, self.url, self.doc = chamber, url, doc
    self.bill_id = bill_id
    self.letter, self.number, self.version = bill_id_parts
    self.data = {}
    self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
    self.succeeded = False
    # Parse the page immediately.
    self._build()
def __init__(self, scraper, session, chamber, url, doc, bill_type,
             bill_id, title, bill_id_parts):
    """Capture page context for a NY bill and run the parse step."""
    self.scraper = scraper
    self.session = session
    self.term = term_for_session('ny', session)
    # Remember the term metadata (and its start year) covering this
    # session; if multiple terms match, the last wins.
    for term_info in self.metadata['terms']:
        if session in term_info['sessions']:
            self.termdata = term_info
            self.term_start_year = term_info['start_year']
    self.chamber = chamber
    self.url = url
    self.doc = doc
    self.bill_id = bill_id
    self.letter, self.number, self.version = bill_id_parts
    self.data = {}
    self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
    self.succeeded = False
    # All state is ready; parse the document now.
    self._build()
def import_bill(data, votes):
    """Insert or update one scraped bill document in the database.

    data  - raw bill JSON (mutated in place: ids fixed, votes merged,
            leg_ids matched, derived fields added)
    votes - dict of standalone votes keyed by (chamber, session, bill_id);
            matching entries are popped off and merged into the bill
    """
    level = data['level']
    abbr = data[level]

    # clean up bill_id
    data['bill_id'] = fix_bill_id(data['bill_id'])

    # move subjects to scraped_subjects
    subjects = data.pop('subjects', None)
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    if subjects:
        data['scraped_subjects'] = subjects

    # add loaded votes to data
    bill_votes = votes.pop((data['chamber'], data['session'],
                            data['bill_id']), [])
    data['votes'].extend(bill_votes)

    # look for an existing copy of this bill
    bill = db.bills.find_one({'level': level, level: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep vote ids stable across re-imports
    vote_matcher = VoteMatcher(abbr)
    if bill:
        vote_matcher.learn_vote_ids(bill['votes'])
    vote_matcher.set_vote_ids(data['votes'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        # NOTE(review): 'id' shadows the builtin
        id = get_legislator_id(abbr, data['session'], None,
                               sponsor['name'])
        sponsor['leg_id'] = id

    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids: convert each bare voter name into
        # {'name': ..., 'leg_id': ...}
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data['session'],
                                       vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})
            vote[vtype] = svlist

    data['_term'] = term_for_session(abbr, data['session'])

    # Merge any version titles into the alternate_titles list
    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    # update keywords
    data['_keywords'] = list(bill_keywords(data))

    if not bill:
        insert_with_id(data)
    else:
        update(bill, data, db.bills)
def scan_bills(abbr):
    """Build per-session quality statistics for every bill stored for *abbr*.

    Tallies bills per chamber/type, actions, sponsors, votes, subjects,
    sources and versions; records bills with out-of-order actions, bad
    roll-call counts, unmatched legislator ids, and duplicate
    source/version URLs.

    Returns a dict with keys: duplicate_versions, duplicate_sources,
    other_actions, uncategorized_subjects, sessions.
    """
    metadata = db.metadata.find_one({'_id': abbr})
    level = metadata['level']
    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)

    sessions = defaultdict(_bill_report_dict)

    for bill in db.bills.find({'level': level, level: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for type in bill['type']:
            session_d['bill_types'][type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']
            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            # BUGFIX: last_date was never advanced, so out-of-order
            # actions could never be detected.
            last_date = date
            session_d['action_count'] += 1
            for type in action['type']:
                session_d['actions_per_type'][type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1
        if not bill['actions']:
            session_d['actionless_count'] += 1

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id'):
                session_d['_sponsors_with_leg_id_count'] += 1
            else:
                # keep missing leg_ids
                session_d['unmatched_leg_ids'].add(
                    (term_for_session(abbr, bill['session']),
                     bill['chamber'], sponsor['name']))
            session_d['sponsors_per_type'][sponsor['type']] += 1
        if not bill['sponsors']:
            session_d['sponsorless_count'] += 1

        # votes
        for vote in bill['votes']:
            session_d['vote_count'] += 1
            if vote['passed']:
                session_d['_passed_vote_count'] += 1
            session_d['votes_per_chamber'][vote['chamber']] += 1
            if not vote.get('type'):
                logger.warning('vote is missing type on %s' % bill['_id'])
                continue
            session_d['votes_per_type'][vote.get('type')] += 1
            if not vote.get('date'):
                logger.warning('vote is missing date on %s' % bill['_id'])
                continue
            session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1

            # roll calls
            has_rollcalls = False
            for rc in (vote['yes_votes'] + vote['no_votes'] +
                       vote['other_votes']):
                has_rollcalls = True
                session_d['_rollcall_count'] += 1
                if rc.get('leg_id'):
                    session_d['_rollcalls_with_leg_id_count'] += 1
                else:
                    # keep missing leg_ids
                    session_d['unmatched_leg_ids'].add(
                        (term_for_session(abbr, bill['session']),
                         vote['chamber'], rc['name']))

            # check counts if any rollcalls are present
            if (has_rollcalls and
                (len(vote['yes_votes']) != vote['yes_count'] or
                 len(vote['no_votes']) != vote['no_count'] or
                 len(vote['other_votes']) != vote['other_count'])):
                session_d['bad_vote_counts'].add(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            session_d['versionless_count'] += 1
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
        for doc in bill['versions']:
            duplicate_versions[doc['url']] += 1

        # TODO: add a duplicate documents back in?

    # keep only URLs seen more than once
    dup_version_urls = []
    dup_source_urls = []
    for url, n in duplicate_versions.iteritems():
        if n > 1:
            dup_version_urls.append(url)
    for url, n in duplicate_sources.iteritems():
        if n > 1:
            dup_source_urls.append(url)

    return {'duplicate_versions': dup_version_urls,
            'duplicate_sources': dup_source_urls,
            'other_actions': other_actions.items(),
            'uncategorized_subjects': uncategorized_subjects.items(),
            'sessions': sessions,
            }
def dump_missing_leg_ids(abbr, detailed=False):
    """
    For a given abbr, find all of the sponsorships, votes and committee
    memberships which are missing legislator IDs and output them to
    CSV files.
    """
    # NOTE(review): the csv.writer(open(...)) handles below are never
    # explicitly closed/flushed — relies on interpreter cleanup; confirm
    # acceptable (rows could be lost on non-refcounting runtimes).
    missing_csv = csv.writer(open('%s_missing_leg_ids.csv' % abbr, 'w'))
    missing_csv.writerow(('term', 'chamber', 'name'))

    missing = set()

    level = metadata(abbr)['level']

    if detailed:
        sponsor_csv = csv.writer(
            open('%s_missing_sponsor_leg_ids.csv' % abbr, 'w'))
        sponsor_csv.writerow(("Abbreviation", "Session", "Chamber",
                              "Bill ID", "Sponsor Type",
                              "Legislator Name"))

        vote_csv = csv.writer(open("%s_missing_vote_leg_ids.csv" % abbr, 'w'))
        vote_csv.writerow(("Abbreviation", "Session", "Chamber", "Bill ID",
                           "Vote Index", "Vote Chamber", "Vote Motion",
                           "Vote", "Name"))

    for bill in db.bills.find({'level': level, level: abbr}):
        # sponsors with no matched legislator id
        for sponsor in bill['sponsors']:
            if not sponsor['leg_id']:
                missing.add(
                    (term_for_session(abbr, bill['session']),
                     bill['chamber'],
                     sponsor['name'].encode('ascii', 'replace')))
                if detailed:
                    sponsor_csv.writerow(
                        (abbr, bill['session'], bill['chamber'],
                         bill['bill_id'], sponsor['type'],
                         sponsor['name'].encode('ascii', 'replace')))

        # roll-call voters with no matched legislator id;
        # i is the vote's index within the bill
        i = 0
        for vote in bill['votes']:
            for vtype in ('yes', 'no', 'other'):
                for v in vote["%s_votes" % vtype]:
                    if not v['leg_id']:
                        missing.add(
                            (term_for_session(abbr, bill['session']),
                             vote['chamber'],
                             v['name'].encode('ascii', 'replace')))
                        if detailed:
                            vote_csv.writerow(
                                (abbr, bill['session'], bill['chamber'],
                                 bill['bill_id'], i, vote['chamber'],
                                 vote['motion'], vtype,
                                 v['name'].encode('ascii', 'replace')))
            i += 1

    if detailed:
        comm_csv = csv.writer(
            open("%s_missing_committee_leg_ids.csv" % abbr, 'w'))
        comm_csv.writerow(("Abbreviation", "Chamber", "Committee",
                           "Subcommittee", "Role", "Name"))

    # committee members with no matched legislator id
    for committee in db.committees.find({'level': level, level: abbr}):
        for member in committee['members']:
            if not member['leg_id']:
                missing.add((committee.get('term', ''),
                             committee['chamber'],
                             member['name'].encode('ascii', 'replace')))
                if detailed:
                    com = committee['committee'].encode('ascii', 'replace')
                    subcom = (committee['subcommittee'] or
                              u'').encode('ascii', 'replace')
                    comm_csv.writerow(
                        (abbr, committee['chamber'], com, subcom,
                         member['role'],
                         member['name'].encode('ascii', 'replace')))

    # write the de-duplicated summary last
    for item in missing:
        missing_csv.writerow(item)
def main():
    """Command-line entry point for billy-update.

    Parses arguments, loads the scraper module's metadata, fills in
    defaults for terms/sessions/chambers/actions/types, then runs the
    requested scrape / import / report / session-list steps.
    Exits with status 1 on ScrapeError.
    """
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees',
                    'votes', 'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false',
                            dest='strict', default=True,
                            help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # pick the debugger to install as the post-mortem excepthook;
        # falls back to pdb when pudb/ipdb are unavailable
        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(type, value, tb):
                traceback.print_exception(type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions),
                   ','.join(args.types), ','.join(args.sessions),
                   ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                # capture the traceback (chained assignment also fills
                # exc_traceback for the re-raise below)
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        # Python 2 three-expression raise: re-raise the
                        # scraper's exception with its original traceback.
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is
                        # the same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since
            # import already writes to the DB, we might as well throw
            # this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
def import_bill(data, standalone_votes, categorizer):
    """
    insert or update a bill

    data - raw bill JSON (mutated in place)
    standalone_votes - votes scraped separately, keyed by
        (chamber, session, bill_id); matching entries are popped off
    categorizer - SubjectCategorizer (None - no categorization)

    Returns "insert" or "update" depending on whether the bill existed.
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions: resolve each companion bill to its internal id if found
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning('Unknown companion: {chamber} {session} {bill_id}'
                           .format(**companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop((data['chamber'],
                                            data['session'],
                                            numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop((data['chamber'],
                                            data['session'],
                                            data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}

    # action types that may correspond to a recorded vote
    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    already_linked = set()
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr, data['session'],
                                     action['actor'], name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                # NOTE(review): 'id' shadows the builtin
                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates (first occurrence of each wins)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20) and
                        vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # add/update tracked_versions collection
        track_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
def main():
    """Command-line entry point for billy-update (legacy variant).

    Parses arguments, loads the scraper module named on the command line,
    runs the requested subset of scrape/import/report/session-list steps,
    and records the run in the ``billy_runs`` Mongo collection.

    Exits with status 1 on ScrapeError.

    NOTE(review): this function targets Python 2 (three-argument ``raise``,
    ``ex.message``); it will not compile under Python 3.
    """
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        # argument groups only affect --help layout, not parsing
        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees', 'votes', 'events',
                    'speeches'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides (dest names match settings keys, see
        # settings.update(args) below)
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                # metadata validation failure is non-fatal, just logged
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                # remember the first failure; re-raised below after the run
                # record has been persisted
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            # serialize any captured exception into the run record
            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    # NOTE(review): _traceback is the traceback of the *last*
                    # exception caught above; ex.message is py2-only
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
def import_bill(data, votes):
    """Insert or update a single scraped bill document in ``db.bills``.

    data: scraped bill dict (mutated in place: votes, leg_ids, titles,
          keywords and _term are attached before saving).
    votes: dict keyed by (chamber, session, bill_id); matching entries are
           popped off and merged into the bill's votes.

    Returns nothing; persists via insert_with_id() or update().
    """
    level = data['level']
    abbr = data[level]

    # clean up bill_id
    data['bill_id'] = fix_bill_id(data['bill_id'])

    # move subjects to scraped_subjects
    subjects = data.pop('subjects', None)

    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    if subjects:
        data['scraped_subjects'] = subjects

    # add loaded votes to data
    bill_votes = votes.pop((data['chamber'], data['session'],
                            data['bill_id']), [])
    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({'level': level, level: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep vote ids stable across re-scrapes by learning the ids already
    # assigned on the stored copy of the bill
    vote_matcher = VoteMatcher(abbr)
    if bill:
        vote_matcher.learn_vote_ids(bill['votes'])
    vote_matcher.set_vote_ids(data['votes'])

    # match sponsor leg_ids (chamber=None: sponsor may be from either)
    # (renamed from `id` to avoid shadowing the builtin)
    for sponsor in data['sponsors']:
        leg_id = get_legislator_id(abbr, data['session'], None,
                                   sponsor['name'])
        sponsor['leg_id'] = leg_id

    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids: replace each bare name with {name, leg_id}
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                leg_id = get_legislator_id(abbr, data['session'],
                                           vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': leg_id})
            vote[vtype] = svlist

    data['_term'] = term_for_session(abbr, data['session'])

    # Merge any version titles into the alternate_titles list
    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    # update keywords
    data['_keywords'] = list(bill_keywords(data))

    if not bill:
        insert_with_id(data)
    else:
        update(bill, data, db.bills)
def scan_bills(abbr):
    """Scan all bills for one abbr and build per-session quality statistics.

    Returns a dict with duplicate version/source URLs, uncategorized
    actions/subjects, and a per-session report dict (``sessions``).

    Fixes vs. previous revision:
      * ``last_date`` is now updated inside the action loop, so the
        out-of-order-actions check actually compares consecutive dates
        (it previously compared every date against 1900-01-01 and never
        fired).
      * ``bill['type']`` is a list (it is iterated below), so the signed
        progress check uses ``'bill' in bill['type']`` instead of the
        always-false ``== 'bill'`` comparison.
    """
    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    sessions = defaultdict(_bill_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for bill_type in bill['type']:
            session_d['bill_types'][bill_type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']
            if not date:
                # dateless actions are logged nowhere; just skipped
                continue
            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            # track previous date so the next iteration can detect
            # out-of-order actions (was missing before)
            last_date = date
            session_d['action_count'] += 1
            for action_type in action['type']:
                session_d['actions_per_type'][action_type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1

        # handle no_actions bills
        if not bill['actions']:
            if bill['_id'] not in quality_exceptions['bills:no_actions']:
                session_d['actionless_count'] += 1
            else:
                # exception was used; remove so leftovers can be reported
                quality_exceptions['bills:no_actions'].remove(bill['_id'])

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id') or sponsor.get('committee_id'):
                session_d['_sponsors_with_id_count'] += 1
            else:
                # keep list of unmatched sponsors
                session_d['unmatched_sponsors'].add(
                    (term_for_session(abbr, bill['session']),
                     bill['chamber'], sponsor['name'])
                )
            session_d['sponsors_per_type'][sponsor['type']] += 1

        # handle no sponsors bills
        if not bill['sponsors']:
            if bill['_id'] not in quality_exceptions['bills:no_sponsors']:
                session_d['sponsorless_count'] += 1
            else:
                quality_exceptions['bills:no_sponsors'].remove(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            if bill['_id'] not in quality_exceptions['bills:no_versions']:
                session_d['versionless_count'] += 1
            else:
                quality_exceptions['bills:no_versions'].remove(bill['_id'])
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
        for doc in bill['versions']:
            duplicate_versions[doc['url']] += 1
        # TODO: add duplicate document detection back in?

        # Check for progress meter gaps.
        progress_meter_gaps = session_d['progress_meter_gaps']
        action_dates = bill['action_dates']
        bill_chamber = bill['chamber']
        other_chamber = dict(lower='upper', upper='lower')[bill_chamber]

        # Check for bills that were signed but didn't pass both chambers.
        # bill['type'] is a list, so membership test (== 'bill' never matched)
        if 'bill' in bill['type']:
            if action_dates['signed']:
                if not action_dates['passed_upper']:
                    progress_meter_gaps.add(bill['_id'])
                elif not action_dates['passed_lower']:
                    progress_meter_gaps.add(bill['_id'])
        else:
            # Check for nonbills that were signed but didn't pass their
            # house of origin.
            if action_dates['signed']:
                if not action_dates['passed_' + bill_chamber]:
                    progress_meter_gaps.add(bill['_id'])

        # Passed the other chamber but not its own house of origin.
        # NOTE(review): applied to all bill types here — confirm this was not
        # meant to be limited to the nonbill branch above.
        if action_dates['passed_' + other_chamber]:
            if not action_dates['passed_' + bill_chamber]:
                progress_meter_gaps.add(bill['_id'])

    dup_version_urls = []
    dup_source_urls = []
    for url, n in duplicate_versions.items():
        if n > 1:
            dup_version_urls.append(url)
    for url, n in duplicate_sources.items():
        if n > 1:
            dup_source_urls.append(url)

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.items():
        if qes:
            logger.warning('unnecessary {0} exceptions for {1} bills: \n {2}'
                           .format(qe_type, len(qes), '\n '.join(qes)))

    return {'duplicate_versions': dup_version_urls,
            'duplicate_sources': dup_source_urls,
            'other_actions': other_actions.items(),
            'uncategorized_subjects': uncategorized_subjects.items(),
            'sessions': sessions,
            # NOTE(review): per-session gaps live in sessions[*]; this
            # top-level key has always been returned empty
            'progress_meter_gaps': []
            }
def dump_missing_leg_ids(abbr, detailed=False):
    """
    For a given abbr, find all of the sponsorships, votes and committee
    memberships which are missing legislator IDs and output them to
    CSV files.

    Always writes <abbr>_missing_leg_ids.csv; with detailed=True also
    writes per-sponsor, per-vote and per-committee detail files.
    """
    # Keep explicit handles on every file we open so they are closed (and
    # their buffers flushed) on exit — the previous revision leaked them.
    missing_file = open('%s_missing_leg_ids.csv' % abbr, 'w')
    open_files = [missing_file]
    try:
        missing_csv = csv.writer(missing_file)
        missing_csv.writerow(('term', 'chamber', 'name'))
        missing = set()

        level = metadata(abbr)['level']

        if detailed:
            sponsor_file = open('%s_missing_sponsor_leg_ids.csv' % abbr, 'w')
            open_files.append(sponsor_file)
            sponsor_csv = csv.writer(sponsor_file)
            sponsor_csv.writerow(("Abbreviation", "Session", "Chamber",
                                  "Bill ID", "Sponsor Type",
                                  "Legislator Name"))

            vote_file = open("%s_missing_vote_leg_ids.csv" % abbr, 'w')
            open_files.append(vote_file)
            vote_csv = csv.writer(vote_file)
            vote_csv.writerow(("Abbreviation", "Session", "Chamber",
                               "Bill ID", "Vote Index", "Vote Chamber",
                               "Vote Motion", "Vote", "Name"))

        for bill in db.bills.find({'level': level, level: abbr}):
            # sponsorships with no matched legislator
            for sponsor in bill['sponsors']:
                if not sponsor['leg_id']:
                    missing.add((term_for_session(abbr, bill['session']),
                                 bill['chamber'],
                                 sponsor['name'].encode('ascii', 'replace')))
                    if detailed:
                        sponsor_csv.writerow(
                            (abbr, bill['session'], bill['chamber'],
                             bill['bill_id'], sponsor['type'],
                             sponsor['name'].encode('ascii', 'replace')))

            # roll-call entries with no matched legislator;
            # i is the index of the vote within the bill
            i = 0
            for vote in bill['votes']:
                for vtype in ('yes', 'no', 'other'):
                    for v in vote["%s_votes" % vtype]:
                        if not v['leg_id']:
                            missing.add(
                                (term_for_session(abbr, bill['session']),
                                 vote['chamber'],
                                 v['name'].encode('ascii', 'replace')))
                            if detailed:
                                vote_csv.writerow(
                                    (abbr, bill['session'], bill['chamber'],
                                     bill['bill_id'], i, vote['chamber'],
                                     vote['motion'], vtype,
                                     v['name'].encode('ascii', 'replace')))
                i += 1

        if detailed:
            comm_file = open("%s_missing_committee_leg_ids.csv" % abbr, 'w')
            open_files.append(comm_file)
            comm_csv = csv.writer(comm_file)
            comm_csv.writerow(("Abbreviation", "Chamber", "Committee",
                               "Subcommittee", "Role", "Name"))

        # committee memberships with no matched legislator
        for committee in db.committees.find({'level': level, level: abbr}):
            for member in committee['members']:
                if not member['leg_id']:
                    missing.add((committee.get('term', ''),
                                 committee['chamber'],
                                 member['name'].encode('ascii', 'replace')))
                    if detailed:
                        com = committee['committee'].encode('ascii',
                                                            'replace')
                        subcom = (committee['subcommittee'] or
                                  u'').encode('ascii', 'replace')
                        comm_csv.writerow(
                            (abbr, committee['chamber'], com, subcom,
                             member['role'],
                             member['name'].encode('ascii', 'replace')))

        # deduplicated summary rows written last
        for item in missing:
            missing_csv.writerow(item)
    finally:
        for f in open_files:
            f.close()
def import_bill(data, votes, categorizer):
    """Insert or update a single scraped bill document in ``db.bills``.

    data: scraped bill dict, mutated in place (votes, leg_ids, titles,
          subjects and _term are attached before saving).
    votes: dict keyed by (chamber, session, bill_id); matching entries are
           popped off and merged into the bill's votes.
    categorizer: optional object with a categorize_bill(data) method; skipped
                 when falsy.

    Returns "insert" for a new bill or "update" for an existing one.
    """
    level = data["level"]
    abbr = data[level]

    # clean up bill_ids
    data["bill_id"] = fix_bill_id(data["bill_id"])
    if "alternate_bill_ids" in data:
        data["alternate_bill_ids"] = [fix_bill_id(bid) for bid in
                                      data["alternate_bill_ids"]]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop("subjects", None)
    if subjects:
        data["scraped_subjects"] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get("_partial_vote_bill_id"):
        # pull off numeric portion of bill_id
        numeric_bill_id = data["bill_id"].split()[1]
        bill_votes = votes.pop((data["chamber"], data["session"],
                                numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop((data["chamber"], data["session"],
                                data["bill_id"]), [])

    data["votes"].extend(bill_votes)

    bill = db.bills.find_one(
        {
            "level": level,
            level: abbr,
            "session": data["session"],
            "chamber": data["chamber"],
            "bill_id": data["bill_id"],
        }
    )

    # keep vote/doc ids consistent across re-scrapes by learning the ids
    # already assigned on the stored copy of the bill
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill["votes"])
        doc_matcher.learn_ids(bill["versions"] + bill["documents"])
    vote_matcher.set_ids(data["votes"])
    doc_matcher.set_ids(data["versions"] + data["documents"])

    # match sponsor leg_ids (chamber=None: sponsor may be from either)
    for sponsor in data["sponsors"]:
        # NOTE(review): `id` shadows the builtin
        id = get_legislator_id(abbr, data["session"], None, sponsor["name"])
        sponsor["leg_id"] = id

    for vote in data["votes"]:
        # committee_ids
        if "committee" in vote:
            committee_id = get_committee_id(level, abbr, vote["chamber"],
                                            vote["committee"])
            vote["committee_id"] = committee_id

        # vote leg_ids: replace each bare name with {name, leg_id}
        for vtype in ("yes_votes", "no_votes", "other_votes"):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data["session"],
                                       vote["chamber"], svote)
                svlist.append({"name": svote, "leg_id": id})
            vote[vtype] = svlist

    data["_term"] = term_for_session(abbr, data["session"])

    alt_titles = set(data.get("alternate_titles", []))

    for version in data["versions"]:
        # push versions to oyster
        if settings.ENABLE_OYSTER and "url" in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if "title" in version:
            alt_titles.add(version["title"])
        if "+short_title" in version:
            alt_titles.add(version["+short_title"])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data["title"])
    except KeyError:
        pass
    data["alternate_titles"] = list(alt_titles)

    if not bill:
        insert_with_id(data)
        return "insert"
    else:
        update(bill, data, db.bills)
        return "update"
def main(old_scrape_compat=False):
    """Command-line entry point for billy-update (debug-instrumented variant).

    Parses arguments (including --pdb/--ipdb/--pudb post-mortem hooks and a
    --billid single-bill filter), loads the named scraper module, runs the
    requested scrape/import/report steps, and records the run in
    ``billy_runs``.

    old_scrape_compat: accepted for interface compatibility; not read in
    this body — TODO confirm callers still need it.

    NOTE(review): targets Python 2 (three-argument ``raise`` below).
    """
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group(
            'scraper config', 'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in (
                'bills', 'legislators', 'committees', 'votes', 'events',
                'speeches'):
            what.add_argument(
                '--' + arg, action='append_const', dest='types', const=arg)
        # NOTE(review): unlike the sibling main(), 'session-list' is not
        # offered here, so the 'session-list' branch below is unreachable
        # from the CLI
        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)
        scrape.add_argument('--billid', help="scrape only a single bill",
                            action="store", default=False)

        # scrapelib overrides (dest names match settings keys)
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            # fall back to stdlib pdb if the fancier debugger isn't installed
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(_type, value, tb):
                traceback.print_exception(_type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        _log.info(plan)
        scrape_data = {}

        if args.billid is False:
            _log.debug("No billid filter.")
        else:
            _log.debug("Search for billid: %s" % args.billid)

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            _log.debug("Session List %s" % session_list)

            try:
                schema_path = os.path.join(
                    os.path.split(__file__)[0], '../schemas/metadata.json')
                schema = json.load(open(schema_path))
                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                # metadata validation failure is non-fatal, just logged
                _log.warning(
                    'metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            if args.billid is False:
                order = (
                    'legislators', 'committees', 'votes', 'bills',
                    'events', 'speeches')
            else:
                _log.debug("going to process bills")
                # only process the bills when filtering by billid
                order = ('bills',)

            _traceback = None
            try:
                for stype in order:
                    _log.debug("consider to process %s" % stype)
                    if stype in args.types:
                        _log.debug("going to process %s" % stype)
                        scraper_results = _run_scraper(stype, args, metadata)
                        run_record += scraper_results
                    else:
                        _log.debug("skipping %s" % stype)
            except Exception as e:
                # remember the failure; re-raised below after the run record
                # has been persisted
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            # serialize any captured exception into the run record
            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        # NOTE(review): stores the exception object itself as
                        # "message" (the sibling main() stores ex.message)
                        "message": ex,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                if 'import' in args.actions:
                    try:
                        _log.debug("scrape_data:")
                        # NOTE(review): 'failure' key only exists when an
                        # exception record was found above; a missing key
                        # raises KeyError and is caught below (exit 123)
                        if scrape_data['failure']:
                            _log.debug("Failed")
                            _log.debug(scrape_data)
                        else:
                            _log.debug("OK")
                            _log.debug(scrape_data)
                        db.billy_runs.save(scrape_data, safe=True)
                    except KeyError as e:
                        _log.debug("Caught exception1 :")
                        _log.debug(e)
                        exit(123)
                    except pymongo.errors.OperationFailure as e:
                        _log.debug("Caught exception3 :")
                        _log.debug(e)
                        exit(123)
                    except Exception as e:
                        _log.debug("Caught exception :")
                        _log.debug(e)
                        exit(123)
                        # NOTE(review): unreachable after exit(123)
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            _log.debug(scrape_data)
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        _log.debug("in update.py Scrape error")
        _log.debug("Scrape error :%s" % e)
        _log.critical('Error: %s' % e)
        sys.exit(1)
    except TypeError as e:
        _log.debug("Type error")
        # NOTE(review): passes e as a %-arg with no placeholder in the
        # message — the formatted output drops it; confirm intent
        _log.critical('TypeError:', e)
        sys.exit(1)
    except NoData as e:
        _log.debug("No Data")
        _log.debug(e)
        _log.critical('No Data:')
        sys.exit(1)
    except NoDoc as e:
        _log.debug("No Doc")
        _log.critical('No Doc:', e)
        sys.exit(1)
    except NoXpath as e:
        _log.debug("No XPath")
        _log.critical('No XPath:', e)
        sys.exit(1)
    except Exception as e:
        _log.debug("Unknown error3")
        _log.debug(e)
        _log.critical('Unknown Error')
        sys.exit(1)
def context_role(self, bill=None, vote=None, session=None, term=None):
    '''Resolve and cache (as self['context_role']) the role this
    legislator held in the context of a given bill, vote, session or
    term, and return it; returns '' when no role can be determined.
    '''
    # If no hints were given about the context, look for a related bill,
    # then for a related vote.
    if not any([bill, vote, session, term]):
        try:
            bill = self.bill
        except AttributeError:
            # A vote?
            try:
                vote = self.vote
            except AttributeError:
                # If we're here, this method was called on a
                # Legislator that doesn't have a related bill or vote.
                return ''

    # If we still have no historical point of reference, figuring
    # out the context role is impossible. Return empty string.
    if not any([bill, vote, session, term]):
        return ''

    # First figure out the term.
    if bill is not None:
        term = bill['_term']
    elif vote is not None:
        # vote may expose .bill directly, or need wrapping in BillVote;
        # either may hand back a callable that lazily fetches the bill
        try:
            _bill = vote.bill
        except AttributeError:
            _bill = BillVote(vote).bill
        if callable(_bill):
            _bill = _bill()
        term = _bill['_term']

    if term is None and session is not None:
        term = term_for_session(self[settings.LEVEL_FIELD], session)

    # Use the term to get the related roles. First look in the current
    # roles list, then fail over to the old_roles list.
    # NOTE(review): filter() here relies on py2 returning a list
    roles = [
        r for r in self['roles'] if r.get('type') == 'member'
        and r.get('term') == term
    ]
    roles = filter(None, roles)
    if not roles:
        roles = [
            r for r in self['old_roles'].get(term, [])
            if r.get('type') == 'member'
        ]
        roles = filter(None, roles)
    if not roles:
        # Legislator had no roles for this term. If there is a related
        # bill or vote, this shouldn't happen, but could if the
        # legislator's roles got deleted.
        return ''

    # If there's only one applicable role, we're done.
    if len(roles) == 1:
        role = roles.pop()
        self['context_role'] = role
        return role

    # If only one of term or session is given and there are multiple roles:
    if not filter(None, [bill, vote]):
        if term is not None:
            # no date to disambiguate with; take the first role
            role = roles[0]
            self['context_role'] = role
            return role

        # Below, use the date of the related bill or vote to determine
        # which (of multiple) roles applies.

        # Get the context date.
        if session is not None:
            # If we're here, we have multiple roles for a single session.
            # Try to find the correct one in self.metadata,
            # else give up.
            session_data = self.metadata['session_details'][session]
            for role in roles:
                role_start = role.get('start_date')
                role_end = role.get('end_date')

                # Return the first role that overlaps at all with the
                # session.
                session_start = session_data.get('start_date')
                session_end = session_data.get('end_date')
                if session_start and session_end:
                    started_during = (role_start < session_start < role_end)
                    ended_during = (role_start < session_end < role_end)
                    if started_during or ended_during:
                        self['context_role'] = role
                        return role
                else:
                    continue

            # Return first role from the session?
            role = roles[0]
            self['context_role'] = role
            return role

    # A related bill or vote exists: use its date to pick among roles.
    if vote is not None:
        date = vote['date']

    if bill is not None:
        # bill date takes precedence over vote date when both are given
        date = bill['action_dates']['first']

    dates_exist = False
    for role in roles:
        start_date = role.get('start_date')
        end_date = role.get('end_date')
        if start_date and end_date:
            dates_exist = True
            if start_date < date < end_date:
                self['context_role'] = role
                return role

    if dates_exist:
        # If we're here, the context date didn't fall into any of the
        # legislator's role date ranges.
        return ''
    else:
        # Here the roles didn't have date ranges. Return the last one?
        role = roles.pop()
        self['context_role'] = role
        return role

    # unreachable: both branches above return
    return ''
def context_role(self, bill=None, vote=None, session=None, term=None):
    '''Resolve and cache (as self['context_role']) the role this
    legislator held in the context of a given bill, vote, session or
    term, and return it; returns '' when no role can be determined.
    '''
    # If no hints were given about the context, look for a related bill,
    # then for a related vote.
    if not any([bill, vote, session, term]):
        try:
            bill = self.bill
        except AttributeError:
            # A vote?
            try:
                vote = self.vote
            except AttributeError:
                # If we're here, this method was called on a
                # Legislator that doesn't have a related bill or vote.
                return ''

    # If we still have no historical point of reference, figuring
    # out the context role is impossible. Return empty string.
    if not any([bill, vote, session, term]):
        return ''

    # First figure out the term.
    if bill is not None:
        term = bill['_term']
    elif vote is not None:
        # vote may expose .bill directly, or need wrapping in BillVote;
        # either may hand back a callable that lazily fetches the bill
        try:
            _bill = vote.bill
        except AttributeError:
            _bill = BillVote(vote).bill
        if callable(_bill):
            _bill = _bill()
        term = _bill['_term']

    if term is None and session is not None:
        term = term_for_session(self[settings.LEVEL_FIELD], session)

    # Use the term to get the related roles. First look in the current
    # roles list, then fail over to the old_roles list.
    # NOTE(review): filter() here relies on py2 returning a list
    roles = [r for r in self['roles'] if r.get('type') == 'member'
             and r.get('term') == term]
    roles = filter(None, roles)
    if not roles:
        roles = [r for r in self['old_roles'].get(term, [])
                 if r.get('type') == 'member']
        roles = filter(None, roles)
    if not roles:
        # Legislator had no roles for this term. If there is a related
        # bill or vote, this shouldn't happen, but could if the
        # legislator's roles got deleted.
        return ''

    # If there's only one applicable role, we're done.
    if len(roles) == 1:
        role = roles.pop()
        self['context_role'] = role
        return role

    # If only one of term or session is given and there are multiple roles:
    if not filter(None, [bill, vote]):
        if term is not None:
            # no date to disambiguate with; take the first role
            role = roles[0]
            self['context_role'] = role
            return role

        # Below, use the date of the related bill or vote to determine
        # which (of multiple) roles applies.

        # Get the context date.
        if session is not None:
            # If we're here, we have multiple roles for a single session.
            # Try to find the correct one in self.metadata,
            # else give up.
            session_data = self.metadata['session_details'][session]
            for role in roles:
                role_start = role.get('start_date')
                role_end = role.get('end_date')

                # Return the first role that overlaps at all with the
                # session.
                session_start = session_data.get('start_date')
                session_end = session_data.get('end_date')
                if session_start and session_end:
                    started_during = (role_start < session_start < role_end)
                    ended_during = (role_start < session_end < role_end)
                    if started_during or ended_during:
                        self['context_role'] = role
                        return role
                else:
                    continue

            # Return first role from the session?
            role = roles[0]
            self['context_role'] = role
            return role

    # A related bill or vote exists: use its date to pick among roles.
    if vote is not None:
        date = vote['date']

    if bill is not None:
        # bill date takes precedence over vote date when both are given
        date = bill['action_dates']['first']

    dates_exist = False
    for role in roles:
        start_date = role.get('start_date')
        end_date = role.get('end_date')
        if start_date and end_date:
            dates_exist = True
            if start_date < date < end_date:
                self['context_role'] = role
                return role

    if dates_exist:
        # If we're here, the context date didn't fall into any of the
        # legislator's role date ranges.
        return ''
    else:
        # Here the roles didn't have date ranges. Return the last one?
        role = roles.pop()
        self['context_role'] = role
        return role

    # unreachable: both branches above return
    return ''
def scan_bills(abbr):
    """Scan every bill for *abbr* and accumulate per-session quality stats.

    Returns a dict with duplicate version/source URLs, uncategorized
    'other' actions and subjects, and a per-session report dict keyed by
    session name. Brecorded quality exceptions suppress the
    corresponding per-session counts; leftover (unused) exceptions are
    logged at the end.
    """
    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    sessions = defaultdict(_bill_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[bill["session"]]

        # chamber count & bill_types
        if bill["chamber"] == "lower":
            session_d["lower_count"] += 1
        elif bill["chamber"] == "upper":
            session_d["upper_count"] += 1
        for type in bill["type"]:
            session_d["bill_types"][type] += 1
        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill["actions"]:
            date = action["date"]
            if date < last_date:
                session_d["actions_unsorted"].add(bill["_id"])
            # FIX: advance the watermark. Previously last_date was never
            # updated, so the out-of-order check above could never fire
            # for any post-1900 action date.
            last_date = date
            session_d["action_count"] += 1
            for type in action["type"]:
                session_d["actions_per_type"][type] += 1
            if "other" in action["type"]:
                other_actions[action["action"]] += 1
            session_d["actions_per_actor"][action["actor"]] += 1
            session_d["actions_per_month"][date.strftime("%Y-%m")] += 1

        # handle no_actions bills (excepted bills consume their exception)
        if not bill["actions"]:
            if bill["_id"] not in quality_exceptions["bills:no_actions"]:
                session_d["actionless_count"] += 1
            else:
                quality_exceptions["bills:no_actions"].remove(bill["_id"])

        # sponsors
        for sponsor in bill["sponsors"]:
            session_d["_sponsor_count"] += 1
            if sponsor.get("leg_id") or sponsor.get("committee_id"):
                session_d["_sponsors_with_id_count"] += 1
            else:
                # keep list of unmatched sponsors
                session_d["unmatched_sponsors"].add(
                    (term_for_session(abbr, bill["session"]),
                     bill["chamber"], sponsor["name"])
                )
            session_d["sponsors_per_type"][sponsor["type"]] += 1

        # handle no sponsors bills
        if not bill["sponsors"]:
            if bill["_id"] not in quality_exceptions["bills:no_sponsors"]:
                session_d["sponsorless_count"] += 1
            else:
                quality_exceptions["bills:no_sponsors"].remove(bill["_id"])

        # subjects
        for subj in bill.get("scraped_subjects", []):
            uncategorized_subjects[subj] += 1
        if bill.get("subjects"):
            session_d["_subjects_count"] += 1
            for subject in bill["subjects"]:
                session_d["bills_per_subject"][subject] += 1

        # sources
        for source in bill["sources"]:
            duplicate_sources[source["url"]] += 1

        # versions
        if not bill["versions"]:
            # total num of bills w/o versions
            if bill["_id"] not in quality_exceptions["bills:no_versions"]:
                session_d["versionless_count"] += 1
            else:
                quality_exceptions["bills:no_versions"].remove(bill["_id"])
        else:
            # total num of versions
            session_d["version_count"] += len(bill["versions"])
            for doc in bill["versions"]:
                duplicate_versions[doc["url"]] += 1
        # TODO: add duplicate document detection back in?

    # collect URLs that appeared more than once
    dup_version_urls = []
    dup_source_urls = []
    for url, n in duplicate_versions.iteritems():
        if n > 1:
            dup_version_urls.append(url)
    for url, n in duplicate_sources.iteritems():
        if n > 1:
            dup_source_urls.append(url)

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.iteritems():
        if qes:
            logger.warning(
                "unnecessary {0} exceptions for {1} bills: \n {2}".format(qe_type, len(qes), "\n ".join(qes))
            )

    return {
        "duplicate_versions": dup_version_urls,
        "duplicate_sources": dup_source_urls,
        "other_actions": other_actions.items(),
        "uncategorized_subjects": uncategorized_subjects.items(),
        "sessions": sessions,
    }
def scan_bills(abbr):
    """Scan every bill for *abbr* and accumulate per-session quality stats.

    Returns a dict with duplicate version/source URLs, uncategorized
    'other' actions and subjects, and a per-session report dict. Bills
    whose action_dates imply an impossible progress sequence are added to
    each session's 'progress_meter_gaps' set (the top-level
    'progress_meter_gaps' key is kept as an empty-list placeholder for
    interface compatibility).
    """
    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    sessions = defaultdict(_bill_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for type in bill['type']:
            session_d['bill_types'][type] += 1
        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']
            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            # FIX: advance the watermark. Previously last_date was never
            # updated, so the out-of-order check above could never fire.
            last_date = date
            session_d['action_count'] += 1
            for type in action['type']:
                session_d['actions_per_type'][type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1

        # handle no_actions bills (excepted bills consume their exception)
        if not bill['actions']:
            if bill['_id'] not in quality_exceptions['bills:no_actions']:
                session_d['actionless_count'] += 1
            else:
                quality_exceptions['bills:no_actions'].remove(bill['_id'])

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id') or sponsor.get('committee_id'):
                session_d['_sponsors_with_id_count'] += 1
            else:
                # keep list of unmatched sponsors
                session_d['unmatched_sponsors'].add(
                    (term_for_session(abbr, bill['session']),
                     bill['chamber'], sponsor['name'])
                )
            session_d['sponsors_per_type'][sponsor['type']] += 1

        # handle no sponsors bills
        if not bill['sponsors']:
            if bill['_id'] not in quality_exceptions['bills:no_sponsors']:
                session_d['sponsorless_count'] += 1
            else:
                quality_exceptions['bills:no_sponsors'].remove(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            if bill['_id'] not in quality_exceptions['bills:no_versions']:
                session_d['versionless_count'] += 1
            else:
                quality_exceptions['bills:no_versions'].remove(bill['_id'])
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
            for doc in bill['versions']:
                duplicate_versions[doc['url']] += 1
        # TODO: add duplicate document detection back in?

        # Check for progress meter gaps.
        progress_meter_gaps = session_d['progress_meter_gaps']
        action_dates = bill['action_dates']
        bill_chamber = bill['chamber']
        other_chamber = dict(lower='upper', upper='lower')[bill_chamber]

        # Check for bills that were signed but didn't pass both chambers.
        # FIX: bill['type'] is a list (it is iterated above), so the old
        # `bill['type'] == 'bill'` comparison was always False and this
        # branch could never run.
        if 'bill' in bill['type']:
            if action_dates['signed']:
                if not action_dates['passed_upper']:
                    progress_meter_gaps.add(bill['_id'])
                elif not action_dates['passed_lower']:
                    progress_meter_gaps.add(bill['_id'])
        else:
            # Check for nonbills that were signed but didn't pass their
            # house of origin.
            if action_dates['signed']:
                if not action_dates['passed_' + bill_chamber]:
                    progress_meter_gaps.add(bill['_id'])
            # ...or that passed the other chamber without passing their own.
            if action_dates['passed_' + other_chamber]:
                if not action_dates['passed_' + bill_chamber]:
                    progress_meter_gaps.add(bill['_id'])

    # collect URLs that appeared more than once
    dup_version_urls = []
    dup_source_urls = []
    for url, n in duplicate_versions.items():
        if n > 1:
            dup_version_urls.append(url)
    for url, n in duplicate_sources.items():
        if n > 1:
            dup_source_urls.append(url)

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.items():
        if qes:
            logger.warning('unnecessary {0} exceptions for {1} bills: \n {2}'
                           .format(qe_type, len(qes), '\n '.join(qes)))

    return {'duplicate_versions': dup_version_urls,
            'duplicate_sources': dup_source_urls,
            'other_actions': other_actions.items(),
            'uncategorized_subjects': uncategorized_subjects.items(),
            'sessions': sessions,
            'progress_meter_gaps': []
            }
def scan_bills(abbr):
    """Scan every bill for *abbr* (legacy level-based query variant).

    Accumulates per-session counts for chambers, bill types, actions,
    sponsors, votes/roll calls, subjects, sources and versions, and
    returns them along with duplicate version/source URLs.
    """
    metadata = db.metadata.find_one({'_id': abbr})
    level = metadata['level']

    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    sessions = defaultdict(_bill_report_dict)

    for bill in db.bills.find({'level': level, level: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for type in bill['type']:
            session_d['bill_types'][type] += 1
        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']
            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            # FIX: advance the watermark. Previously last_date was never
            # updated, so the out-of-order check above could never fire.
            last_date = date
            session_d['action_count'] += 1
            for type in action['type']:
                session_d['actions_per_type'][type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1
        if not bill['actions']:
            session_d['actionless_count'] += 1

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id'):
                session_d['_sponsors_with_leg_id_count'] += 1
            else:
                # keep missing leg_ids
                session_d['unmatched_leg_ids'].add(
                    (term_for_session(abbr, bill['session']),
                     bill['chamber'], sponsor['name'])
                )
            session_d['sponsors_per_type'][sponsor['type']] += 1
        if not bill['sponsors']:
            session_d['sponsorless_count'] += 1

        # votes
        for vote in bill['votes']:
            session_d['vote_count'] += 1
            if vote['passed']:
                session_d['_passed_vote_count'] += 1
            session_d['votes_per_chamber'][vote['chamber']] += 1
            if not vote.get('type'):
                logger.warning('vote is missing type on %s' % bill['_id'])
                continue
            session_d['votes_per_type'][vote.get('type')] += 1
            if not vote.get('date'):
                logger.warning('vote is missing date on %s' % bill['_id'])
                continue
            session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1

            # roll calls
            has_rollcalls = False
            for rc in (vote['yes_votes'] + vote['no_votes'] +
                       vote['other_votes']):
                has_rollcalls = True
                session_d['_rollcall_count'] += 1
                if rc.get('leg_id'):
                    session_d['_rollcalls_with_leg_id_count'] += 1
                else:
                    # keep missing leg_ids
                    session_d['unmatched_leg_ids'].add(
                        (term_for_session(abbr, bill['session']),
                         vote['chamber'], rc['name'])
                    )

            # check counts if any rollcalls are present
            if (has_rollcalls and
                (len(vote['yes_votes']) != vote['yes_count'] or
                 len(vote['no_votes']) != vote['no_count'] or
                 len(vote['other_votes']) != vote['other_count'])):
                session_d['bad_vote_counts'].add(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            session_d['versionless_count'] += 1
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
            for doc in bill['versions']:
                duplicate_versions[doc['url']] += 1
        # TODO: add a duplicate documents back in?

    # collect URLs that appeared more than once
    dup_version_urls = []
    dup_source_urls = []
    for url, n in duplicate_versions.iteritems():
        if n > 1:
            dup_version_urls.append(url)
    for url, n in duplicate_sources.iteritems():
        if n > 1:
            dup_source_urls.append(url)

    return {'duplicate_versions': dup_version_urls,
            'duplicate_sources': dup_source_urls,
            'other_actions': other_actions.items(),
            'uncategorized_subjects': uncategorized_subjects.items(),
            'sessions': sessions,
            }
def import_bill(data, votes, categorizer):
    """Insert or update a bill.

    data - raw bill JSON
    votes - dict of standalone scraped votes keyed by
            (chamber, session, bill_id)
    categorizer - SubjectCategorizer (None for no categorization)

    Returns "insert" or "update".
    """
    level = data['level']
    abbr = data[level]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid)
                                      for bid in data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get('_partial_vote_bill_id'):
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes = votes.pop((data['chamber'], data['session'],
                                numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])
    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({'level': level, level: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep vote/doc ids consistent across re-imports
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill['votes'])
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    vote_matcher.set_ids(data['votes'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids, falling back to committee ids
    for sponsor in data['sponsors']:
        id = get_legislator_id(abbr, data['session'], None, sponsor['name'])
        sponsor['leg_id'] = id
        if id is None:
            cid = get_committee_id(level, abbr, data['chamber'],
                                   sponsor['name'])
            if not cid is None:
                sponsor['committee_id'] = cid

    # process votes
    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids: replace raw names with {name, leg_id} dicts
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data['session'],
                                       vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})
            vote[vtype] = svlist

    # process actions
    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}
    for action in data['actions']:
        # We'll try to recover some Committee IDs here.
        if "committee" in action:
            cid = get_committee_id(level, abbr, data['chamber'],
                                   action['committee'])
            action['_scraped_committee_name'] = action['committee']
            if cid is not None:
                action['committee'] = cid
            else:
                del(action['committee'])

        adate = action['date']
        # first & last
        # FIX: these two checks must be independent ifs. With the old
        # `elif`, updating 'first' skipped the 'last' check, so a bill
        # with a single action (or a new earliest action) never got its
        # 'last' date set. The newer import_bill already uses two ifs.
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate
        # passed & signed
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)
        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    if not bill:
        bill_id = insert_with_id(data)
        denormalize_votes(data, bill_id)
        return "insert"
    else:
        update(bill, data, db.bills)
        denormalize_votes(data, bill['_id'])
        return "update"
def vote_csv(state, session, chamber, out=sys.stdout):
    """Write a roll-call matrix CSV for every member of *chamber*.

    Each row is: name, leg_id, party, then one code per floor vote in
    the session (1 = yes, 6 = no, 9 = not recorded).
    """
    term = utils.term_for_session(state, session)
    vote_rows = {}
    leg_by_id = {}

    # Find everyone who held a member role for this term, current or old.
    role_query = {'state': state, 'chamber': chamber, 'type': 'member',
                  'term': term}
    matcher = {'$elemMatch': role_query}
    for leg in db.legislators.find(
            {'$or': [{'roles': matcher},
                     {('old_roles.%s' % term): matcher}]}):
        vote_rows[leg['leg_id']] = []
        leg_by_id[leg['leg_id']] = leg

    for bill in db.bills.find({'state': state, 'chamber': chamber,
                               'session': session}):
        for vote in bill['votes']:
            # Skip committee votes and votes held in the other chamber.
            if 'committee' in vote and vote['committee']:
                continue
            if vote['chamber'] != chamber:
                continue

            recorded = set()
            # Yes and no roll calls differ only in the code appended.
            for key, code in (('yes_votes', 1), ('no_votes', 6)):
                for cast in vote[key]:
                    leg_id = cast['leg_id']
                    if not leg_id:
                        continue
                    recorded.add(leg_id)
                    try:
                        vote_rows[leg_id].append(code)
                    except KeyError:
                        # matched legislator isn't in our member set
                        continue
            # Tracked members absent from this roll call get a 9.
            for leg_id in set(vote_rows) - recorded:
                vote_rows[leg_id].append(9)

    writer = csv.writer(out)
    for leg_id, codes in vote_rows.iteritems():
        leg = leg_by_id[leg_id]
        try:
            party = leg['old_roles'][term][0]['party']
        except KeyError:
            party = leg['party']
        row = [leg['full_name'].encode('ascii', 'replace'),
               leg['leg_id'], party]
        row.extend(str(code) for code in codes)
        writer.writerow(row)
def vote_csv(state, session, chamber, out=sys.stdout):
    """Write a roll-call matrix CSV for every member of *chamber*.

    Each row is: name, leg_id, party, then one code per floor vote in
    the session (1 = yes, 6 = no, 9 = not recorded).
    """
    term = utils.term_for_session(state, session)
    votes = {}
    legislators = {}
    elemMatch = {
        'state': state,
        'chamber': chamber,
        'type': 'member',
        'term': term
    }
    for leg in db.legislators.find({
        '$or': [{
            'roles': {
                '$elemMatch': elemMatch
            }
        }, {
            ('old_roles.%s' % term): {
                '$elemMatch': elemMatch
            }
        }]
    }):
        votes[leg['leg_id']] = []
        legislators[leg['leg_id']] = leg

    for bill in db.bills.find({
        'state': state,
        'chamber': chamber,
        'session': session
    }):
        for vote in bill['votes']:
            # skip committee votes and votes from the other chamber
            if 'committee' in vote and vote['committee']:
                continue
            if vote['chamber'] != chamber:
                continue

            seen = set()
            for yv in vote['yes_votes']:
                leg_id = yv['leg_id']
                if leg_id:
                    seen.add(leg_id)
                    try:
                        votes[leg_id].append(1)
                    except KeyError:
                        continue
            for nv in vote['no_votes']:
                leg_id = nv['leg_id']
                if leg_id:
                    seen.add(leg_id)
                    try:
                        votes[leg_id].append(6)
                    except KeyError:
                        continue
            # tracked members not recorded on this vote get a 9
            for leg_id in set(votes.keys()) - seen:
                votes[leg_id].append(9)

    out = csv.writer(out)
    for (leg_id, vs) in votes.iteritems():
        leg = legislators[leg_id]
        try:
            # FIX: old_roles is keyed by term, not session — the
            # $elemMatch query above uses 'old_roles.<term>' and every
            # other consumer of old_roles indexes it by term, so the
            # previous ['old_roles'][session] lookup always raised
            # KeyError and silently fell back to leg['party'].
            party = leg['old_roles'][term][0]['party']
        except KeyError:
            party = leg['party']
        row = [
            leg['full_name'].encode('ascii', 'replace'),
            leg['leg_id'],
            party
        ]
        for vote in vs:
            row.append(str(vote))
        out.writerow(row)
def import_bill(data, standalone_votes, categorizer):
    """Insert or update a bill.

    data - raw bill JSON
    standalone_votes - votes scraped separately, keyed by
        (chamber, session, bill_id)
    categorizer - SubjectCategorizer (None - no categorization)

    Returns "insert" or "update".
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(bid) for bid in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions: resolve each companion bill to its internal _id
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning(
                'Unknown companion: {chamber} {session} {bill_id}'.format(
                    **companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep doc ids consistent across re-imports
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {
        'first': None,
        'last': None,
        'passed_upper': None,
        'passed_lower': None,
        'signed': None
    }

    # action types that may be linked to a recorded vote
    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    already_linked = set()
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        # Resolvers close over the current `action`; they are only used
        # within this same loop iteration.
        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr,
                                     data['session'],
                                     action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates (only the earliest occurrence is kept)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue
                # a vote within ~20 hours in the same chamber is assumed
                # to belong to this action
                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20)
                        and vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        elasticsearch_push(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        elasticsearch_push(bill)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
def import_bill(data, votes, categorizer):
    """Insert or update a bill.

    data - raw bill JSON
    votes - dict of standalone scraped votes keyed by
        (chamber, session, bill_id)
    categorizer - subject categorizer (None - no categorization)

    Returns "insert" or "update".
    """
    level = data['level']
    abbr = data[level]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(bid) for bid in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get('_partial_vote_bill_id'):
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes = votes.pop(
            (data['chamber'], data['session'], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])
    data['votes'].extend(bill_votes)

    # look for a prior version of this bill
    bill = db.bills.find_one({
        'level': level,
        level: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep vote/doc ids consistent across re-imports
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill['votes'])
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    vote_matcher.set_ids(data['votes'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids (no committee fallback in this version)
    for sponsor in data['sponsors']:
        id = get_legislator_id(abbr, data['session'], None, sponsor['name'])
        sponsor['leg_id'] = id

    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids: replace raw names with {name, leg_id} dicts
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data['session'],
                                       vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})
            vote[vtype] = svlist

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)
        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    if not bill:
        insert_with_id(data)
        return "insert"
    else:
        update(bill, data, db.bills)
        return "update"