def main():
    parser = argparse.ArgumentParser(
        description='Scrape data for single bill, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('module', type=str, help='scraper module (eg. nc)')
    parser.add_argument('chamber', type=str, help='chamber for bill to scrape')
    parser.add_argument('session', type=str, help='session for bill to scrape')
    parser.add_argument('bill_id', type=str, help='bill_id to scrape')
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when "
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--import', dest='do_import',
                        help="import bill after scrape",
                        action="store_true", default=False)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0,
                    os.path.join(os.path.dirname(__file__), '../../openstates'))

    # get metadata
    metadata = __import__(args.module, fromlist=['metadata']).metadata
    abbr = metadata['abbreviation']

    # configure logger
    configure_logging(args.verbose, abbr)

    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)

    _run_scraper(args, metadata)

    if args.do_import:
        import_bills(abbr, settings.BILLY_DATA_DIR)
def main():
    import argparse

    parser = argparse.ArgumentParser(
        description=('Dump data to a set of CSV files, optionally uploading to'
                     ' S3 when done.'),
        parents=[base_arg_parser],
    )

    parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+',
                        help='the two-letter abbreviation for the data to export')
    parser.add_argument('--file', '-f',
                        help='filename to output to (defaults to <abbr>.zip)')
    parser.add_argument('--nozip', action='store_true', default=False,
                        help="don't zip the files")
    parser.add_argument('--upload', '-u', action='store_true', default=False,
                        help='upload the created archive to S3')

    args = parser.parse_args()
    settings.update(args)

    for abbr in args.abbrs:
        print 'dumping CSV for', abbr
        # use the requested filename if given, otherwise default to
        # <abbr>_csv.zip per abbreviation so multiple dumps don't overwrite
        # each other
        filename = args.file or '{0}_csv.zip'.format(abbr)
        dump_csv(abbr, filename, args.nozip)
        if args.upload:
            if args.nozip:
                raise Warning('Unable to --upload if --nozip is specified')
            else:
                upload(abbr, filename)
def main():
    parser = argparse.ArgumentParser(
        description='apply subject categorization for bills',
        parents=[base_arg_parser],
        conflict_handler='resolve',
    )

    default_dir = os.path.join(os.path.dirname(__file__),
                               '../../manual_data/subjects')

    parser.add_argument('abbr', type=str,
                        help='abbreviation for data to process')
    parser.add_argument('--all', help='update all sessions',
                        action='store_true', default=False)
    parser.add_argument('-d', '--data_dir', help='directory of subject csvs',
                        dest='data_dir', default=default_dir)

    args = parser.parse_args()
    settings.update(args)

    categorize_subjects(args.abbr, args.data_dir, args.all)
def main(): parser = argparse.ArgumentParser(description="set a legislators term end_date", parents=[base_arg_parser]) parser.add_argument("leg_id", type=str, help="id of legislator to retire") parser.add_argument("date", type=str, help="YYYY-MM-DD date to set for legislator end_date") args = parser.parse_args() settings.update(args) retire_legislator(args.leg_id, args.date)
def main():
    import sys
    import argparse

    parser = argparse.ArgumentParser(description='send bill versions to oyster',
                                     parents=[base_arg_parser])
    parser.add_argument('states', nargs='+', help='states to oysterize')

    args = parser.parse_args()
    settings.update(args)

    for state in args.states:
        print "Oysterizing %s bill versions" % state
        oysterize_versions(state)
def main():
    parser = argparse.ArgumentParser(
        description='Import scraped data into database.',
        parents=[base_arg_parser],
    )

    parser.add_argument('abbreviation', type=str,
                        help='the short name of the data to import')
    parser.add_argument('-r', '--rpm', type=int, default=60,
                        help=('maximum number of documents to download '
                              'per minute'))
    parser.add_argument('--bills', action='store_true',
                        help='import bill data')
    parser.add_argument('--legislators', action='store_true',
                        help='import legislator data')
    parser.add_argument('--committees', action='store_true',
                        help='import (separate) committee data')
    parser.add_argument('--events', action='store_true',
                        help='import event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False, help="import all available data")

    args = parser.parse_args()

    if not (args.bills or args.legislators or args.committees or
            args.events or args.alldata):
        raise Exception("Must specify at least one type: --bills, "
                        "--legislators, --committees, --events, "
                        "--alldata")

    settings.update(args)

    data_dir = settings.BILLY_DATA_DIR

    # configure logger
    configure_logging(args.verbose, args.abbreviation)

    # always import metadata
    import_metadata(args.abbreviation, data_dir)

    if args.legislators or args.alldata:
        import_legislators(args.abbreviation, data_dir)
    if args.bills or args.alldata:
        import_bills(args.abbreviation, data_dir)
    if args.committees or args.alldata:
        import_committees(args.abbreviation, data_dir)
    # events currently excluded from --alldata
    if args.events:
        import_events(args.abbreviation, data_dir)
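# Example invocations of the import entry point above (the script name
# "import_state.py" and the "nc" abbreviation are illustrative assumptions,
# not taken from the source):
#
#   python import_state.py nc --alldata
#   python import_state.py nc --bills --legislators
#
# A minimal sketch of the usual entry-point guard, assuming main() lives in a
# module that is run directly:
if __name__ == '__main__':
    main()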
def main():
    parser = argparse.ArgumentParser(
        description='run name matching against a session',
        parents=[base_arg_parser],
    )

    parser.add_argument('abbr', help='abbr to run matching for')
    parser.add_argument('term', help='term to run matching for')

    args = parser.parse_args()
    settings.update(args)

    match_names(args.abbr, args.term)
def main():
    import argparse
    configure_logging(1)

    parser = argparse.ArgumentParser(
        description=('Dump API information to a zipped directory of JSON files'
                     ', optionally uploading to S3 when done.'),
        parents=[base_arg_parser],
    )

    parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+',
                        help='the two-letter abbreviation for the data to export')
    parser.add_argument('--file', '-f',
                        help='filename to output to (defaults to <abbr>.zip)')
    parser.add_argument('--schema_dir',
                        help='directory to use for API schemas (optional)',
                        default=None)
    parser.add_argument('--nodump', action='store_true', default=False,
                        help="don't run the dump, only upload")
    parser.add_argument('--novalidate', action='store_true', default=False,
                        help="don't run validation")
    parser.add_argument('--upload', '-u', action='store_true', default=False,
                        help='upload the created archive to S3')

    args = parser.parse_args()
    settings.update(args)

    for abbr in args.abbrs:
        # default to <abbr>.zip per abbreviation so multiple dumps don't
        # clobber one another
        filename = args.file or abbr + '.zip'
        if not args.nodump:
            dump_json(abbr, filename, not args.novalidate, args.schema_dir)
        if args.upload:
            upload(abbr, filename)
def main():
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('chamber', type=str, help='chamber for bill to scrape')
    parser.add_argument('session', type=str, help='session for bill to scrape')
    parser.add_argument('bill_id', type=str, help='bill_id to scrape')
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when "
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--import', dest='do_import',
                        help="import bill after scrape",
                        action="store_true", default=False)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0,
                    os.path.join(os.path.dirname(__file__), '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    # configure logger
    configure_logging(args.verbose, state)

    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)

    _run_scraper(args.state, state, args, metadata)

    if args.do_import:
        import_bills(args.state, settings.BILLY_DATA_DIR)
def main():
    parser = argparse.ArgumentParser(
        description='Populate database with district information.',
        parents=[base_arg_parser],
    )

    parser.add_argument('files', metavar='DISTRICT-FILE', type=str, nargs='+',
                        help='the path to the district CSV file to import')

    args = parser.parse_args()
    settings.update(args)

    for file in args.files:
        import_district_csv(file)
def main(): parser = argparse.ArgumentParser( description="apply subject categorization for bills", parents=[base_arg_parser], conflict_handler="resolve" ) default_dir = os.path.join(os.path.dirname(__file__), "../../manual_data/subjects") parser.add_argument("abbr", type=str, help="abbreviation for data to process") parser.add_argument("--all", help="update all sessions", action="store_true", default=False) parser.add_argument("-d", "--data_dir", help="directory of subject csvs", dest="data_dir", default=default_dir) args = parser.parse_args() settings.update(args) categorize_subjects(args.abbr, args.data_dir, args.all)
def main():
    parser = argparse.ArgumentParser(
        description='apply subject categorization for bills',
        parents=[base_arg_parser],
        conflict_handler='resolve',
    )

    parser.add_argument('abbr', type=str,
                        help='abbreviation for data to process')
    parser.add_argument('--all', help='update all sessions',
                        action='store_true', default=False)

    args = parser.parse_args()
    settings.update(args)

    categorize_subjects(args.abbr, args.all)
def main(): parser = argparse.ArgumentParser( description="dump a CSV of missing leg_id's", parents=[base_arg_parser], ) parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+', help='data abbreviations to dump') parser.add_argument('--detailed', action='store_true', default=False, help='print detailed csvs as well') args = parser.parse_args() settings.update(args) for abbr in args.abbrs: dump_missing_leg_ids(abbr, args.detailed)
def main(): try: parser = argparse.ArgumentParser( description="Scrape data for single bill, saving data to disk.", parents=[base_arg_parser] ) parser.add_argument("module", type=str, help="scraper module (eg. nc)") parser.add_argument("chamber", type=str, help="chamber for bill to scrape") parser.add_argument("session", type=str, help="session for bill to scrape") parser.add_argument("bill_id", type=str, help="bill_id to scrape") parser.add_argument( "--strict", action="store_true", dest="strict", default=False, help="fail immediately when" "encountering validation warning", ) parser.add_argument("-n", "--no_cache", action="store_true", dest="no_cache", help="don't use web page cache") parser.add_argument("--fastmode", help="scrape in fast mode", action="store_true", default=False) parser.add_argument("-r", "--rpm", action="store", type=int, dest="rpm", default=60), parser.add_argument( "--import", dest="do_import", help="import bill after scrape", action="store_true", default=False ) args = parser.parse_args() settings.update(args) # set up search path sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../openstates")) # get metadata metadata = __import__(args.module, fromlist=["metadata"]).metadata abbr = metadata["abbreviation"] # configure logger configure_logging(args.verbose, abbr) args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr) _run_scraper(args, metadata) if args.do_import: import_bills(abbr, settings.BILLY_DATA_DIR) except ScrapeError as e: print "Error:", e sys.exit(1)
def main():
    import sys
    import argparse

    parser = argparse.ArgumentParser(description='validate API results',
                                     parents=[base_arg_parser])
    parser.add_argument('states', nargs='+', help='states to validate')
    parser.add_argument('--schema_dir',
                        help='directory to use for API schemas (optional)',
                        default=None)

    args = parser.parse_args()
    settings.update(args)

    for state in args.states:
        print "Validating %s" % state
        validate_api(state, args.schema_dir)
def main():
    parser = argparse.ArgumentParser(
        description='Convert state bills to SFM-ready text',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str, help='state')
    parser.add_argument('--sfm_server', type=str, help='URL of SFM instance',
                        default='http://localhost:8080/')

    args = parser.parse_args()
    settings.update(args)

    configure_logging(args.verbose, args.state)

    process_state_files(args.state, args.sfm_server)
def main():
    try:
        parser = argparse.ArgumentParser(
            description='Scrape data for single bill, saving data to disk.',
            parents=[base_arg_parser],
        )

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('chamber', type=str,
                            help='chamber for bill to scrape')
        parser.add_argument('session', type=str,
                            help='session for bill to scrape')
        parser.add_argument('bill_id', type=str, help='bill_id to scrape')
        parser.add_argument('--strict', action='store_true', dest='strict',
                            default=False, help="fail immediately when "
                            "encountering validation warning")
        parser.add_argument('-n', '--no_cache', action='store_true',
                            dest='no_cache', help="don't use web page cache")
        parser.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)
        parser.add_argument('-r', '--rpm', action='store', type=int,
                            dest='rpm', default=60)
        parser.add_argument('--import', dest='do_import',
                            help="import bill after scrape",
                            action="store_true", default=False)

        args = parser.parse_args()

        settings.update(args)

        # get metadata
        metadata = __import__(args.module, fromlist=['metadata']).metadata
        abbr = metadata['abbreviation']

        # configure logger
        configure_logging(args.verbose, abbr)

        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbr)

        _run_scraper(args, metadata)

        if args.do_import:
            import_bills(abbr, settings.BILLY_DATA_DIR)
    except ScrapeError as e:
        print 'Error:', e
        sys.exit(1)
def main():
    parser = argparse.ArgumentParser(
        description='load a CSV of legislator data',
        parents=[base_arg_parser],
    )

    parser.add_argument('files', metavar='FILE', type=str, nargs='+',
                        help='filenames to import')
    parser.add_argument('--save', action='store_true', default=False,
                        help='save changes to database (default is dry run)')

    args = parser.parse_args()
    settings.update(args)

    for file in args.files:
        process_file(file, args.save)
def main():
    parser = argparse.ArgumentParser(description='generic billy util',
                                     parents=[base_arg_parser])
    subparsers = parser.add_subparsers(dest='subcommand')

    # import command plugins
    for mod in COMMAND_MODULES:
        import_command_module(mod)

    # instantiate all subcommands
    subcommands = {}
    for SubcommandCls in BaseCommand.subcommands:
        subcommands[SubcommandCls.name] = SubcommandCls(subparsers)

    # parse arguments, update settings, then run the appropriate function
    args = parser.parse_args()
    settings.update(args)
    configure_logging(args.subcommand)
    subcommands[args.subcommand].handle(args)
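# A minimal sketch of a subcommand plugin compatible with the dispatcher
# above, based only on the interface it visibly relies on: a `name` attribute,
# a constructor that receives the argparse subparsers object, and a
# handle(args) method. The class name, its argument, and its behaviour are
# hypothetical examples, not part of the source.
class EchoCommand(BaseCommand):
    name = 'echo'

    def __init__(self, subparsers):
        # register our own parser under the shared subparsers object
        self.parser = subparsers.add_parser(self.name,
                                            help='print the given words')
        self.parser.add_argument('words', nargs='*')

    def handle(self, args):
        # invoked by main() as subcommands[args.subcommand].handle(args)
        print ' '.join(args.words)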
def main():
    parser = argparse.ArgumentParser(
        description=('attempt to match legislators with ids in other '
                     'relevant APIs'),
        parents=[base_arg_parser],
    )

    parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+',
                        help='abbreviations for data to update')

    args = parser.parse_args()
    settings.update(args)

    votesmart.apikey = settings.VOTESMART_API_KEY

    for abbr in args.abbrs:
        update_missing_ids(abbr, settings.SUNLIGHT_SERVICES_KEY)
        time.sleep(30)
def main():
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true',
                        dest='committees', default=False,
                        help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when "
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--timeout', action='store', type=int, dest='timeout',
                        default=10)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0,
                    os.path.join(os.path.dirname(__file__), '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    configure_logging(args.verbose, args.state)

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        # errno 17 == EEXIST: an existing output directory is fine
        if e.errno != 17:
            raise e

    # write metadata
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: ' +
                                           str(e))

    with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    if args.terms:
        # terms in metadata are dicts; match on their names
        args.sessions = args.sessions or []
        for term in metadata['terms']:
            if term['name'] in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions or [])

    # determine chambers
    args.chambers = []
    if args.upper:
        args.chambers.append('upper')
    if args.lower:
        args.chambers.append('lower')
    if not args.chambers:
        args.chambers = ['upper', 'lower']

    if not (args.bills or args.legislators or args.votes or
            args.committees or args.events or args.alldata):
        raise ScrapeError("Must specify at least one of --bills, "
                          "--legislators, --committees, --votes, --events, "
                          "--alldata")

    if args.alldata:
        args.bills = True
        args.legislators = True
        args.votes = True
        args.committees = True

    if args.bills:
        _run_scraper(args.state, state, 'bills', args, metadata)
    if args.legislators:
        _run_scraper(args.state, state, 'legislators', args, metadata)
    if args.committees:
        _run_scraper(args.state, state, 'committees', args, metadata)
    if args.votes:
        _run_scraper(args.state, state, 'votes', args, metadata)
    if args.events:
        _run_scraper(args.state, state, 'events', args, metadata)
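# Example invocations of the state scraper above (the script name
# "scrape_state.py" is a hypothetical placeholder; the state, session, and
# flag combinations are illustrative only):
#
#   python scrape_state.py nc -s 2011 --bills --legislators
#   python scrape_state.py nc --alldata --upper --fastmode
#
# A standard entry-point guard like the following would make the module
# runnable directly; it is shown as a sketch, not taken from the source.
if __name__ == '__main__':
    main()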
def main():
    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true',
                        dest='committees', default=False,
                        help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when "
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0,
                    os.path.join(os.path.dirname(__file__), '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    # configure logger
    if args.verbose == 0:
        verbosity = logging.WARNING
    elif args.verbose == 1:
        verbosity = logging.INFO
    else:
        verbosity = logging.DEBUG

    logging.basicConfig(level=verbosity,
                        format=("%(asctime)s %(name)s %(levelname)s " +
                                state + " %(message)s"),
                        datefmt="%H:%M:%S")

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        # errno 17 == EEXIST: an existing output directory is fine
        if e.errno != 17:
            raise e
def main(old_scrape_compat=False):
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees', 'votes', 'events'):
            what.add_argument('--' + arg, action='append_const',
                              dest='types', const=arg)
        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false',
                            dest='strict', default=True,
                            help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = __import__(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load state settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        configure_logging(args.module)

        # configure oyster
        if settings.ENABLE_OYSTER:
            from oyster.conf import settings as oyster_settings
            oyster_settings.DOCUMENT_CLASSES[args.module + ':billtext'] = \
                module.document_class

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)
        _clear_scraped_data(args.output_dir)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])

        # dedup sessions
        args.sessions = list(set(args.sessions or []))
        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            if old_scrape_compat:
                args.actions = ['scrape']
            else:
                args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

        plan = """billy-update abbr=%s actions=%s types=%s sessions=%s terms=%s""" % (
            args.module, ','.join(args.actions), ','.join(args.types),
            ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills', 'events')
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['state'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace,
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's
                        # fault. Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since
            # import already writes to the DB, we might as well throw this
            # in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)
    except ScrapeError as e:
        print 'Error:', e
        sys.exit(1)
print '%s,"%s"' % (n, category.encode('ascii', 'replace')) if __name__ == '__main__': parser = argparse.ArgumentParser( description='apply subject categorization for bills', parents=[base_arg_parser], conflict_handler='resolve', ) default_dir = os.path.join(os.path.dirname(__file__), '../../manual_data/subjects') parser.add_argument('abbr', type=str, help='abbreviation for data to process') parser.add_argument('--all', help='update all sessions', action='store_true', default=False) parser.add_argument('-d', '--data_dir', help='directory of subject csvs', dest='data_dir', default=default_dir) args = parser.parse_args() settings.update(args) categorize_subjects(args.abbr, args.data_dir, args.all)
def main(old_scrape_compat=False):
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])
        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees', 'votes', 'events',
                    'speeches'):
            what.add_argument('--' + arg, action='append_const',
                              dest='types', const=arg)
        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false',
                            dest='strict', default=True,
                            help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)
        scrape.add_argument('--billid', help="scrape only a single bill",
                            action="store", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(_type, value, tb):
                traceback.print_exception(_type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])

        # dedup sessions
        args.sessions = list(set(args.sessions or []))
        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s actions=%s types=%s sessions=%s terms=%s""" % (
            args.module, ','.join(args.actions), ','.join(args.types),
            ','.join(args.sessions), ','.join(args.terms))
        _log.info(plan)

        scrape_data = {}

        if args.billid is False:
            _log.debug("No billid filter.")
        else:
            _log.debug("Search for billid: %s" % args.billid)

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []

            check_sessions(metadata, session_list)
            _log.debug("Session List %s" % session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                _log.warning('metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            if args.billid is False:
                order = ('legislators', 'committees', 'votes', 'bills',
                         'events', 'speeches')
            else:
                _log.debug("going to process bills")
                order = ('bills',)  # only process the bills

            _traceback = None
            try:
                for stype in order:
                    _log.debug("consider to process %s" % stype)
                    if stype in args.types:
                        _log.debug("going to process %s" % stype)
                        scraper_results = _run_scraper(stype, args, metadata)
                        run_record += scraper_results
                    else:
                        _log.debug("skipping %s" % stype)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex,
                        'traceback': trace,
                    }
                    scrape_data['failure'] = True

            if lex:
                if 'import' in args.actions:
                    try:
                        _log.debug("scrape_data:")
                        if scrape_data['failure']:
                            _log.debug("Failed")
                            _log.debug(scrape_data)
                        else:
                            _log.debug("OK")
                            _log.debug(scrape_data)
                        db.billy_runs.save(scrape_data, safe=True)
                    except KeyError as e:
                        _log.debug("Caught exception1 :")
                        _log.debug(e)
                        exit(123)
                    except pymongo.errors.OperationFailure as e:
                        _log.debug("Caught exception3 :")
                        _log.debug(e)
                        exit(123)
                    except Exception as e:
                        _log.debug("Caught exception :")
                        _log.debug(e)
                        exit(123)
                    raise lex, None, exc_traceback
                    # XXX: This should *NEVER* happen, but it has
                    # in the past, so we're going to catch any errors
                    # writing to pymongo, and raise the original
                    # exception rather then let it look like Mongo's fault.
                    # Thanks for catching this, Thom.
                    #
                    # We lose the stack trace, but the Exception is the
                    # same in every other way.
                    #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since
            # import already writes to the DB, we might as well throw this
            # in too.
            _log.debug(scrape_data)
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')
    except ScrapeError as e:
        _log.debug("in update.py Scrape error")
        _log.debug("Scrape error :%s" % e)
        _log.critical('Error: %s' % e)
        sys.exit(1)
    except TypeError as e:
        _log.debug("Type error")
        _log.critical('TypeError: %s' % e)
        sys.exit(1)
    except NoData as e:
        _log.debug("No Data")
        _log.debug(e)
        _log.critical('No Data:')
        sys.exit(1)
    except NoDoc as e:
        _log.debug("No Doc")
        _log.critical('No Doc: %s' % e)
        sys.exit(1)
    except NoXpath as e:
        _log.debug("No XPath")
        _log.critical('No XPath: %s' % e)
        sys.exit(1)
    except Exception as e:
        _log.debug("Unknown error3")
        _log.debug(e)
        _log.critical('Unknown Error')
        sys.exit(1)
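# Example debugging invocations of the updater above. The "billy-update"
# console-script name is taken from the plan string logged by the function;
# the module name, bill id, and flag combinations are illustrative
# assumptions only.
#
#   billy-update nc --bills --billid "HB 1" --fastmode
#   billy-update nc --legislators --ipdb
#
# --billid restricts the scraper order to bills only, and --pdb/--ipdb/--pudb
# install a post-mortem debugger via sys.excepthook, as implemented above.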
sys.exit(1) else: print "Updating ids for {0}".format(abbr) print "Updating PVS legislator ids..." update_votesmart_legislators(meta) print "Updating TransparencyData ids..." update_transparencydata_legislators(meta, sunlight_key) if __name__ == '__main__': parser = argparse.ArgumentParser( description=('attempt to match legislators with ids in other' 'relevant APIs'), parents=[base_arg_parser], ) parser.add_argument('abbrs', metavar='ABBR', type=str, nargs='+', help='abbreviations for data to update') args = parser.parse_args() settings.update(args) votesmart.apikey = settings.VOTESMART_API_KEY for abbr in args.abbrs: update_missing_ids(abbr, settings.SUNLIGHT_SERVICES_KEY) time.sleep(30)